From 09ea0dfbf972c63dfea8cf46f2fb67f39e4d833b Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Tue, 29 May 2018 15:59:18 +0200
Subject: dma-buf: make map_atomic and map function pointers optional

So drivers don't need dummy functions just returning NULL.

Cc: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20180529135918.19729-1-kraxel@redhat.com
---
 include/linux/dma-buf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 085db2fee2d7..88917fa796e4 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -39,12 +39,12 @@ struct dma_buf_attachment;
 
 /**
  * struct dma_buf_ops - operations possible on struct dma_buf
- * @map_atomic: maps a page from the buffer into kernel address
+ * @map_atomic: [optional] maps a page from the buffer into kernel address
  *		space, users may not block until the subsequent unmap call.
  *		This callback must not sleep.
  * @unmap_atomic: [optional] unmaps a atomically mapped page from the buffer.
  *		  This Callback must not sleep.
- * @map: maps a page from the buffer into kernel address space.
+ * @map: [optional] maps a page from the buffer into kernel address space.
  * @unmap: [optional] unmaps a page from the buffer.
  * @vmap: [optional] creates a virtual mapping for the buffer into kernel
  *	  address space. Same restrictions as for vmap and friends apply.
-- 
cgit v1.2.3


From 2d4f545cb14fa2d9865124183f3100d46dc9b3b5 Mon Sep 17 00:00:00 2001
From: Peter Meerwald <pmeerw@pmeerw.net>
Date: Mon, 4 Jun 2018 23:34:06 +0200
Subject: rfkill: Fixes and cleanup of kernel-doc in the header file

Fixes kerneldoc parameter names to match implementation,
rfkill_set_hw_state(), rfkill_set_sw_state().
Fix description of rfkill_resume_polling().
Fix typos in documentation of rfkill_find_type().
Consistently start kerneldoc description with uppercase
letter.

Signed-off-by: Peter Meerwald-Stadler <pmeerw@pmeerw.net>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/linux/rfkill.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index e6a0031d1b1f..8ad2487a86d5 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -66,7 +66,7 @@ struct rfkill_ops {
 
 #if defined(CONFIG_RFKILL) || defined(CONFIG_RFKILL_MODULE)
 /**
- * rfkill_alloc - allocate rfkill structure
+ * rfkill_alloc - Allocate rfkill structure
  * @name: name of the struct -- the string is not copied internally
  * @parent: device that has rf switch on it
  * @type: type of the switch (RFKILL_TYPE_*)
@@ -112,7 +112,7 @@ void rfkill_pause_polling(struct rfkill *rfkill);
 /**
  * rfkill_resume_polling(struct rfkill *rfkill)
  *
- * Pause polling -- say transmitter is off for other reasons.
+ * Resume polling
  * NOTE: not necessary for suspend/resume -- in that case the
  * core stops polling anyway
  */
@@ -130,7 +130,7 @@ void rfkill_resume_polling(struct rfkill *rfkill);
 void rfkill_unregister(struct rfkill *rfkill);
 
 /**
- * rfkill_destroy - free rfkill structure
+ * rfkill_destroy - Free rfkill structure
  * @rfkill: rfkill structure to be destroyed
  *
  * Destroys the rfkill structure.
@@ -140,7 +140,7 @@ void rfkill_destroy(struct rfkill *rfkill);
 /**
  * rfkill_set_hw_state - Set the internal rfkill hardware block state
  * @rfkill: pointer to the rfkill class to modify.
- * @state: the current hardware block state to set
+ * @blocked: the current hardware block state to set
  *
  * rfkill drivers that get events when the hard-blocked state changes
  * use this function to notify the rfkill core (and through that also
@@ -161,7 +161,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked);
 /**
  * rfkill_set_sw_state - Set the internal rfkill software block state
  * @rfkill: pointer to the rfkill class to modify.
- * @state: the current software block state to set
+ * @blocked: the current software block state to set
  *
  * rfkill drivers that get events when the soft-blocked state changes
  * (yes, some platforms directly act on input but allow changing again)
@@ -183,7 +183,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked);
 /**
  * rfkill_init_sw_state - Initialize persistent software block state
  * @rfkill: pointer to the rfkill class to modify.
- * @state: the current software block state to set
+ * @blocked: the current software block state to set
  *
  * rfkill drivers that preserve their software block state over power off
  * use this function to notify the rfkill core (and through that also
@@ -208,17 +208,17 @@ void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked);
 void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw);
 
 /**
- * rfkill_blocked - query rfkill block
+ * rfkill_blocked - Query rfkill block state
  *
  * @rfkill: rfkill struct to query
  */
 bool rfkill_blocked(struct rfkill *rfkill);
 
 /**
- * rfkill_find_type - Helpper for finding rfkill type by name
+ * rfkill_find_type - Helper for finding rfkill type by name
  * @name: the name of the type
  *
- * Returns enum rfkill_type that conrresponds the name.
+ * Returns enum rfkill_type that corresponds to the name.
  */
 enum rfkill_type rfkill_find_type(const char *name);
 
@@ -296,7 +296,7 @@ static inline enum rfkill_type rfkill_find_type(const char *name)
 const char *rfkill_get_led_trigger_name(struct rfkill *rfkill);
 
 /**
- * rfkill_set_led_trigger_name -- set the LED trigger name
+ * rfkill_set_led_trigger_name - Set the LED trigger name
  * @rfkill: rfkill struct
  * @name: LED trigger name
  *
-- 
cgit v1.2.3


From c4cbaf7973a794839af080f13748335976cf3f3f Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Sat, 9 Jun 2018 09:14:42 +0300
Subject: cfg80211: Add support for HE

Add support for the HE in cfg80211 and also add userspace API to
nl80211 to send rate information out, conforming with P802.11ax_D2.0.

Signed-off-by: Liad Kaufman <liad.kaufman@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Ido Yariv <idox.yariv@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/linux/ieee80211.h    | 427 +++++++++++++++++++++++++++++++++++++++++++
 include/net/cfg80211.h       | 106 ++++++++++-
 include/uapi/linux/nl80211.h |  87 ++++++++-
 net/wireless/core.c          |  21 ++-
 net/wireless/nl80211.c       |  99 +++++++++-
 net/wireless/util.c          |  82 +++++++++
 6 files changed, 817 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 8fe7e4306816..e6a6503bfa33 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1539,6 +1539,106 @@ struct ieee80211_vht_operation {
 	__le16 basic_mcs_set;
 } __packed;
 
+/**
+ * struct ieee80211_he_cap_elem - HE capabilities element
+ *
+ * This structure is the "HE capabilities element" fixed fields as
+ * described in P802.11ax_D2.0 section 9.4.2.237.2 and 9.4.2.237.3
+ */
+struct ieee80211_he_cap_elem {
+	u8 mac_cap_info[5];
+	u8 phy_cap_info[9];
+} __packed;
+
+#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN	5
+
+/**
+ * enum ieee80211_he_mcs_support - HE MCS support definitions
+ * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the
+ *	number of streams
+ * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported
+ * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported
+ * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported
+ *
+ * These definitions are used in each 2-bit subfield of the rx_mcs_*
+ * and tx_mcs_* fields of &struct ieee80211_he_mcs_nss_supp, which are
+ * both split into 8 subfields by number of streams. These values indicate
+ * which MCSes are supported for the number of streams the value appears
+ * for.
+ */
+enum ieee80211_he_mcs_support {
+	IEEE80211_HE_MCS_SUPPORT_0_7	= 0,
+	IEEE80211_HE_MCS_SUPPORT_0_9	= 1,
+	IEEE80211_HE_MCS_SUPPORT_0_11	= 2,
+	IEEE80211_HE_MCS_NOT_SUPPORTED	= 3,
+};
+
+/**
+ * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field
+ *
+ * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field
+ * described in P802.11ax_D2.0 section 9.4.2.237.4
+ *
+ * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel
+ *     widths less than 80MHz.
+ * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel
+ *     widths less than 80MHz.
+ * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel
+ *     width 160MHz.
+ * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel
+ *     width 160MHz.
+ * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for
+ *     channel width 80p80MHz.
+ * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for
+ *     channel width 80p80MHz.
+ */
+struct ieee80211_he_mcs_nss_supp {
+	__le16 rx_mcs_80;
+	__le16 tx_mcs_80;
+	__le16 rx_mcs_160;
+	__le16 tx_mcs_160;
+	__le16 rx_mcs_80p80;
+	__le16 tx_mcs_80p80;
+} __packed;
+
+/**
+ * struct ieee80211_he_operation - HE capabilities element
+ *
+ * This structure is the "HE operation element" fields as
+ * described in P802.11ax_D2.0 section 9.4.2.238
+ */
+struct ieee80211_he_operation {
+	__le32 he_oper_params;
+	__le16 he_mcs_nss_set;
+	/* Optional 0,1,3 or 4 bytes: depends on @he_oper_params */
+	u8 optional[0];
+} __packed;
+
+/**
+ * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field
+ *
+ * This structure is the "MU AC Parameter Record" fields as
+ * described in P802.11ax_D2.0 section 9.4.2.240
+ */
+struct ieee80211_he_mu_edca_param_ac_rec {
+	u8 aifsn;
+	u8 ecw_min_max;
+	u8 mu_edca_timer;
+} __packed;
+
+/**
+ * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element
+ *
+ * This structure is the "MU EDCA Parameter Set element" fields as
+ * described in P802.11ax_D2.0 section 9.4.2.240
+ */
+struct ieee80211_mu_edca_param_set {
+	u8 mu_qos_info;
+	struct ieee80211_he_mu_edca_param_ac_rec ac_be;
+	struct ieee80211_he_mu_edca_param_ac_rec ac_bk;
+	struct ieee80211_he_mu_edca_param_ac_rec ac_vi;
+	struct ieee80211_he_mu_edca_param_ac_rec ac_vo;
+} __packed;
 
 /* 802.11ac VHT Capabilities */
 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895			0x00000000
@@ -1577,6 +1677,328 @@ struct ieee80211_vht_operation {
 #define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN			0x10000000
 #define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN			0x20000000
 
+/* 802.11ax HE MAC capabilities */
+#define IEEE80211_HE_MAC_CAP0_HTC_HE				0x01
+#define IEEE80211_HE_MAC_CAP0_TWT_REQ				0x02
+#define IEEE80211_HE_MAC_CAP0_TWT_RES				0x04
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP		0x00
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1		0x08
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2		0x10
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3		0x18
+#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK			0x18
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1		0x00
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2		0x20
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4		0x40
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8		0x60
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16		0x80
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32		0xa0
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64		0xc0
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED	0xe0
+#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK		0xe0
+
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED		0x00
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128			0x01
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256			0x02
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512			0x03
+#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK		0x03
+#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US		0x00
+#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US		0x04
+#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US		0x08
+#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK		0x0c
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_1		0x00
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_2		0x10
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_3		0x20
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_4		0x30
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_5		0x40
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_6		0x50
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_7		0x60
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_8		0x70
+#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_MASK		0x70
+
+/* Link adaptation is split between byte HE_MAC_CAP1 and
+ * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE
+ * in which case the following values apply:
+ * 0 = No feedback.
+ * 1 = reserved.
+ * 2 = Unsolicited feedback.
+ * 3 = both
+ */
+#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION			0x80
+
+#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION			0x01
+#define IEEE80211_HE_MAC_CAP2_ALL_ACK				0x02
+#define IEEE80211_HE_MAC_CAP2_UL_MU_RESP_SCHED			0x04
+#define IEEE80211_HE_MAC_CAP2_BSR				0x08
+#define IEEE80211_HE_MAC_CAP2_BCAST_TWT				0x10
+#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP			0x20
+#define IEEE80211_HE_MAC_CAP2_MU_CASCADING			0x40
+#define IEEE80211_HE_MAC_CAP2_ACK_EN				0x80
+
+#define IEEE80211_HE_MAC_CAP3_GRP_ADDR_MULTI_STA_BA_DL_MU	0x01
+#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL			0x02
+#define IEEE80211_HE_MAC_CAP3_OFDMA_RA				0x04
+
+/* The maximum length of an A-MDPU is defined by the combination of the Maximum
+ * A-MDPU Length Exponent field in the HT capabilities, VHT capabilities and the
+ * same field in the HE capabilities.
+ */
+#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_USE_VHT	0x00
+#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_1		0x08
+#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_2		0x10
+#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_RESERVED	0x18
+#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_MASK		0x18
+#define IEEE80211_HE_MAC_CAP3_A_AMSDU_FRAG			0x20
+#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED			0x40
+#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS		0x80
+
+#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG		0x01
+#define IEEE80211_HE_MAC_CAP4_QTP				0x02
+#define IEEE80211_HE_MAC_CAP4_BQR				0x04
+#define IEEE80211_HE_MAC_CAP4_SR_RESP				0x08
+#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP			0x10
+#define IEEE80211_HE_MAC_CAP4_OPS				0x20
+#define IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU			0x40
+
+/* 802.11ax HE PHY capabilities */
+#define IEEE80211_HE_PHY_CAP0_DUAL_BAND					0x01
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G		0x02
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G	0x04
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G		0x08
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G	0x10
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G	0x20
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G	0x40
+#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK			0xfe
+
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ	0x01
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ	0x02
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ	0x04
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ	0x08
+#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK			0x0f
+#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A				0x10
+#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD			0x20
+#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US		0x40
+/* Midamble RX Max NSTS is split between byte #2 and byte #3 */
+#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_MAX_NSTS			0x80
+
+#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_MAX_NSTS			0x01
+#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US			0x02
+#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ			0x04
+#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ			0x08
+#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX				0x10
+#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX				0x20
+
+/* Note that the meaning of UL MU below is different between an AP and a non-AP
+ * sta, where in the AP case it indicates support for Rx and in the non-AP sta
+ * case it indicates support for Tx.
+ */
+#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO			0x40
+#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO			0x80
+
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM			0x00
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK			0x01
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK			0x02
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM			0x03
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK			0x03
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1				0x00
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2				0x04
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM			0x00
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK			0x08
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK			0x10
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM			0x18
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK			0x18
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1				0x00
+#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2				0x20
+#define IEEE80211_HE_PHY_CAP3_RX_HE_MU_PPDU_FROM_NON_AP_STA		0x40
+#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER				0x80
+
+#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE				0x01
+#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER				0x02
+
+/* Minimal allowed value of Max STS under 80MHz is 3 */
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4		0x0c
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5		0x10
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6		0x14
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7		0x18
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8		0x1c
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK	0x1c
+
+/* Minimal allowed value of Max STS above 80MHz is 3 */
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4		0x60
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5		0x80
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6		0xa0
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7		0xc0
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8		0xe0
+#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK	0xe0
+
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1	0x00
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2	0x01
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3	0x02
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4	0x03
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5	0x04
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6	0x05
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7	0x06
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8	0x07
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK	0x07
+
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1	0x00
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2	0x08
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3	0x10
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4	0x18
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5	0x20
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6	0x28
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7	0x30
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8	0x38
+#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK	0x38
+
+#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK				0x40
+#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK				0x80
+
+#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU			0x01
+#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU			0x02
+#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB			0x04
+#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB			0x08
+#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB				0x10
+#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE			0x20
+#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO		0x40
+#define IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT			0x80
+
+#define IEEE80211_HE_PHY_CAP7_SRP_BASED_SR				0x01
+#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR			0x02
+#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI		0x04
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_1					0x08
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_2					0x10
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_3					0x18
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_4					0x20
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_5					0x28
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_6					0x30
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_7					0x38
+#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK				0x38
+#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ			0x40
+#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ			0x80
+
+#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI		0x01
+#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G		0x02
+#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU			0x04
+#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU			0x08
+#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI		0x10
+#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_2X_AND_1XLTF			0x20
+
+/* 802.11ax HE TX/RX MCS NSS Support  */
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS			(3)
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS			(6)
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS			(11)
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK			0x07c0
+#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK			0xf800
+
+/* TX/RX HE MCS Support field Highest MCS subfield encoding */
+enum ieee80211_he_highest_mcs_supported_subfield_enc {
+	HIGHEST_MCS_SUPPORTED_MCS7 = 0,
+	HIGHEST_MCS_SUPPORTED_MCS8,
+	HIGHEST_MCS_SUPPORTED_MCS9,
+	HIGHEST_MCS_SUPPORTED_MCS10,
+	HIGHEST_MCS_SUPPORTED_MCS11,
+};
+
+/* Calculate 802.11ax HE capabilities IE Tx/Rx HE MCS NSS Support Field size */
+static inline u8
+ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap)
+{
+	u8 count = 4;
+
+	if (he_cap->phy_cap_info[0] &
+	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G)
+		count += 4;
+
+	if (he_cap->phy_cap_info[0] &
+	    IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G)
+		count += 4;
+
+	return count;
+}
+
+/* 802.11ax HE PPE Thresholds */
+#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS			(1)
+#define IEEE80211_PPE_THRES_NSS_POS				(0)
+#define IEEE80211_PPE_THRES_NSS_MASK				(7)
+#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU	\
+	(BIT(5) | BIT(6))
+#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK		0x78
+#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS		(3)
+#define IEEE80211_PPE_THRES_INFO_PPET_SIZE			(3)
+
+/*
+ * Calculate 802.11ax HE capabilities IE PPE field size
+ * Input: Header byte of ppe_thres (first byte), and HE capa IE's PHY cap u8*
+ */
+static inline u8
+ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info)
+{
+	u8 n;
+
+	if ((phy_cap_info[6] &
+	     IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0)
+		return 0;
+
+	n = hweight8(ppe_thres_hdr &
+		     IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK);
+	n *= (1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >>
+		   IEEE80211_PPE_THRES_NSS_POS));
+
+	/*
+	 * Each pair is 6 bits, and we need to add the 7 "header" bits to the
+	 * total size.
+	 */
+	n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7;
+	n = DIV_ROUND_UP(n, 8);
+
+	return n;
+}
+
+/* HE Operation defines */
+#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK			0x0000003f
+#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK		0x000001c0
+#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET		6
+#define IEEE80211_HE_OPERATION_TWT_REQUIRED			0x00000200
+#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK		0x000ffc00
+#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET		10
+#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR		0x000100000
+#define IEEE80211_HE_OPERATION_VHT_OPER_INFO			0x000200000
+#define IEEE80211_HE_OPERATION_MULTI_BSSID_AP			0x10000000
+#define IEEE80211_HE_OPERATION_TX_BSSID_INDICATOR		0x20000000
+#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED		0x40000000
+
+/*
+ * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size
+ * @he_oper_ie: byte data of the He Operations IE, stating from the the byte
+ *	after the ext ID byte. It is assumed that he_oper_ie has at least
+ *	sizeof(struct ieee80211_he_operation) bytes, checked already in
+ *	ieee802_11_parse_elems_crc()
+ * @return the actual size of the IE data (not including header), or 0 on error
+ */
+static inline u8
+ieee80211_he_oper_size(const u8 *he_oper_ie)
+{
+	struct ieee80211_he_operation *he_oper = (void *)he_oper_ie;
+	u8 oper_len = sizeof(struct ieee80211_he_operation);
+	u32 he_oper_params;
+
+	/* Make sure the input is not NULL */
+	if (!he_oper_ie)
+		return 0;
+
+	/* Calc required length */
+	he_oper_params = le32_to_cpu(he_oper->he_oper_params);
+	if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO)
+		oper_len += 3;
+	if (he_oper_params & IEEE80211_HE_OPERATION_MULTI_BSSID_AP)
+		oper_len++;
+
+	/* Add the first byte (extension ID) to the total length */
+	oper_len++;
+
+	return oper_len;
+}
+
 /* Authentication algorithms */
 #define WLAN_AUTH_OPEN 0
 #define WLAN_AUTH_SHARED_KEY 1
@@ -1992,6 +2414,11 @@ enum ieee80211_eid_ext {
 	WLAN_EID_EXT_FILS_WRAPPED_DATA = 8,
 	WLAN_EID_EXT_FILS_PUBLIC_KEY = 12,
 	WLAN_EID_EXT_FILS_NONCE = 13,
+	WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE = 14,
+	WLAN_EID_EXT_HE_CAPABILITY = 35,
+	WLAN_EID_EXT_HE_OPERATION = 36,
+	WLAN_EID_EXT_UORA = 37,
+	WLAN_EID_EXT_HE_MU_EDCA = 38,
 };
 
 /* Action category code */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5fbfe61f41c6..9ba1f289c439 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -285,6 +285,41 @@ struct ieee80211_sta_vht_cap {
 	struct ieee80211_vht_mcs_info vht_mcs;
 };
 
+#define IEEE80211_HE_PPE_THRES_MAX_LEN		25
+
+/**
+ * struct ieee80211_sta_he_cap - STA's HE capabilities
+ *
+ * This structure describes most essential parameters needed
+ * to describe 802.11ax HE capabilities for a STA.
+ *
+ * @has_he: true iff HE data is valid.
+ * @he_cap_elem: Fixed portion of the HE capabilities element.
+ * @he_mcs_nss_supp: The supported NSS/MCS combinations.
+ * @ppe_thres: Holds the PPE Thresholds data.
+ */
+struct ieee80211_sta_he_cap {
+	bool has_he;
+	struct ieee80211_he_cap_elem he_cap_elem;
+	struct ieee80211_he_mcs_nss_supp he_mcs_nss_supp;
+	u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN];
+};
+
+/**
+ * struct ieee80211_sband_iftype_data
+ *
+ * This structure encapsulates sband data that is relevant for the
+ * interface types defined in @types_mask.  Each type in the
+ * @types_mask must be unique across all instances of iftype_data.
+ *
+ * @types_mask: interface types mask
+ * @he_cap: holds the HE capabilities
+ */
+struct ieee80211_sband_iftype_data {
+	u16 types_mask;
+	struct ieee80211_sta_he_cap he_cap;
+};
+
 /**
  * struct ieee80211_supported_band - frequency band definition
  *
@@ -301,6 +336,11 @@ struct ieee80211_sta_vht_cap {
  * @n_bitrates: Number of bitrates in @bitrates
  * @ht_cap: HT capabilities in this band
  * @vht_cap: VHT capabilities in this band
+ * @n_iftype_data: number of iftype data entries
+ * @iftype_data: interface type data entries.  Note that the bits in
+ *	@types_mask inside this structure cannot overlap (i.e. only
+ *	one occurrence of each type is allowed across all instances of
+ *	iftype_data).
  */
 struct ieee80211_supported_band {
 	struct ieee80211_channel *channels;
@@ -310,8 +350,55 @@ struct ieee80211_supported_band {
 	int n_bitrates;
 	struct ieee80211_sta_ht_cap ht_cap;
 	struct ieee80211_sta_vht_cap vht_cap;
+	u16 n_iftype_data;
+	const struct ieee80211_sband_iftype_data *iftype_data;
 };
 
+/**
+ * ieee80211_get_sband_iftype_data - return sband data for a given iftype
+ * @sband: the sband to search for the STA on
+ * @iftype: enum nl80211_iftype
+ *
+ * Return: pointer to struct ieee80211_sband_iftype_data, or NULL is none found
+ */
+static inline const struct ieee80211_sband_iftype_data *
+ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband,
+				u8 iftype)
+{
+	int i;
+
+	if (WARN_ON(iftype >= NL80211_IFTYPE_MAX))
+		return NULL;
+
+	for (i = 0; i < sband->n_iftype_data; i++)  {
+		const struct ieee80211_sband_iftype_data *data =
+			&sband->iftype_data[i];
+
+		if (data->types_mask & BIT(iftype))
+			return data;
+	}
+
+	return NULL;
+}
+
+/**
+ * ieee80211_get_he_sta_cap - return HE capabilities for an sband's STA
+ * @sband: the sband to search for the STA on
+ *
+ * Return: pointer to the struct ieee80211_sta_he_cap, or NULL is none found
+ */
+static inline const struct ieee80211_sta_he_cap *
+ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband)
+{
+	const struct ieee80211_sband_iftype_data *data =
+		ieee80211_get_sband_iftype_data(sband, NL80211_IFTYPE_STATION);
+
+	if (data && data->he_cap.has_he)
+		return &data->he_cap;
+
+	return NULL;
+}
+
 /**
  * wiphy_read_of_freq_limits - read frequency limits from device tree
  *
@@ -899,6 +986,8 @@ enum station_parameters_apply_mask {
  * @opmode_notif: operating mode field from Operating Mode Notification
  * @opmode_notif_used: information if operating mode field is used
  * @support_p2p_ps: information if station supports P2P PS mechanism
+ * @he_capa: HE capabilities of station
+ * @he_capa_len: the length of the HE capabilities
  */
 struct station_parameters {
 	const u8 *supported_rates;
@@ -926,6 +1015,8 @@ struct station_parameters {
 	u8 opmode_notif;
 	bool opmode_notif_used;
 	int support_p2p_ps;
+	const struct ieee80211_he_cap_elem *he_capa;
+	u8 he_capa_len;
 };
 
 /**
@@ -1000,12 +1091,14 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
  * @RATE_INFO_FLAGS_VHT_MCS: mcs field filled with VHT MCS
  * @RATE_INFO_FLAGS_SHORT_GI: 400ns guard interval
  * @RATE_INFO_FLAGS_60G: 60GHz MCS
+ * @RATE_INFO_FLAGS_HE_MCS: HE MCS information
  */
 enum rate_info_flags {
 	RATE_INFO_FLAGS_MCS			= BIT(0),
 	RATE_INFO_FLAGS_VHT_MCS			= BIT(1),
 	RATE_INFO_FLAGS_SHORT_GI		= BIT(2),
 	RATE_INFO_FLAGS_60G			= BIT(3),
+	RATE_INFO_FLAGS_HE_MCS			= BIT(4),
 };
 
 /**
@@ -1019,6 +1112,7 @@ enum rate_info_flags {
  * @RATE_INFO_BW_40: 40 MHz bandwidth
  * @RATE_INFO_BW_80: 80 MHz bandwidth
  * @RATE_INFO_BW_160: 160 MHz bandwidth
+ * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation
  */
 enum rate_info_bw {
 	RATE_INFO_BW_20 = 0,
@@ -1027,6 +1121,7 @@ enum rate_info_bw {
 	RATE_INFO_BW_40,
 	RATE_INFO_BW_80,
 	RATE_INFO_BW_160,
+	RATE_INFO_BW_HE_RU,
 };
 
 /**
@@ -1035,10 +1130,14 @@ enum rate_info_bw {
  * Information about a receiving or transmitting bitrate
  *
  * @flags: bitflag of flags from &enum rate_info_flags
- * @mcs: mcs index if struct describes a 802.11n bitrate
+ * @mcs: mcs index if struct describes an HT/VHT/HE rate
  * @legacy: bitrate in 100kbit/s for 802.11abg
- * @nss: number of streams (VHT only)
+ * @nss: number of streams (VHT & HE only)
  * @bw: bandwidth (from &enum rate_info_bw)
+ * @he_gi: HE guard interval (from &enum nl80211_he_gi)
+ * @he_dcm: HE DCM value
+ * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc,
+ *	only valid if bw is %RATE_INFO_BW_HE_RU)
  */
 struct rate_info {
 	u8 flags;
@@ -1046,6 +1145,9 @@ struct rate_info {
 	u16 legacy;
 	u8 nss;
 	u8 bw;
+	u8 he_gi;
+	u8 he_dcm;
+	u8 he_ru_alloc;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 49f718e821a3..f82ce3c89ab7 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2237,6 +2237,9 @@ enum nl80211_commands {
  *      enforced.
  * @NL80211_ATTR_TXQ_QUANTUM: TXQ scheduler quantum (bytes). Number of bytes
  *      a flow is assigned on each round of the DRR scheduler.
+ * @NL80211_ATTR_HE_CAPABILITY: HE Capability information element (from
+ *	association request when used with NL80211_CMD_NEW_STATION). Can be set
+ *	only if %NL80211_STA_FLAG_WME is set.
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
@@ -2677,6 +2680,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_TXQ_MEMORY_LIMIT,
 	NL80211_ATTR_TXQ_QUANTUM,
 
+	NL80211_ATTR_HE_CAPABILITY,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -2726,7 +2731,8 @@ enum nl80211_attrs {
 #define NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY	24
 #define NL80211_HT_CAPABILITY_LEN		26
 #define NL80211_VHT_CAPABILITY_LEN		12
-
+#define NL80211_HE_MIN_CAPABILITY_LEN           16
+#define NL80211_HE_MAX_CAPABILITY_LEN           51
 #define NL80211_MAX_NR_CIPHER_SUITES		5
 #define NL80211_MAX_NR_AKM_SUITES		2
 
@@ -2853,6 +2859,38 @@ struct nl80211_sta_flag_update {
 	__u32 set;
 } __attribute__((packed));
 
+/**
+ * enum nl80211_he_gi - HE guard interval
+ * @NL80211_RATE_INFO_HE_GI_0_8: 0.8 usec
+ * @NL80211_RATE_INFO_HE_GI_1_6: 1.6 usec
+ * @NL80211_RATE_INFO_HE_GI_3_2: 3.2 usec
+ */
+enum nl80211_he_gi {
+	NL80211_RATE_INFO_HE_GI_0_8,
+	NL80211_RATE_INFO_HE_GI_1_6,
+	NL80211_RATE_INFO_HE_GI_3_2,
+};
+
+/**
+ * enum nl80211_he_ru_alloc - HE RU allocation values
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_26: 26-tone RU allocation
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_52: 52-tone RU allocation
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_106: 106-tone RU allocation
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_242: 242-tone RU allocation
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_484: 484-tone RU allocation
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_996: 996-tone RU allocation
+ * @NL80211_RATE_INFO_HE_RU_ALLOC_2x996: 2x996-tone RU allocation
+ */
+enum nl80211_he_ru_alloc {
+	NL80211_RATE_INFO_HE_RU_ALLOC_26,
+	NL80211_RATE_INFO_HE_RU_ALLOC_52,
+	NL80211_RATE_INFO_HE_RU_ALLOC_106,
+	NL80211_RATE_INFO_HE_RU_ALLOC_242,
+	NL80211_RATE_INFO_HE_RU_ALLOC_484,
+	NL80211_RATE_INFO_HE_RU_ALLOC_996,
+	NL80211_RATE_INFO_HE_RU_ALLOC_2x996,
+};
+
 /**
  * enum nl80211_rate_info - bitrate information
  *
@@ -2885,6 +2923,13 @@ struct nl80211_sta_flag_update {
  * @NL80211_RATE_INFO_5_MHZ_WIDTH: 5 MHz width - note that this is
  *	a legacy rate and will be reported as the actual bitrate, i.e.
  *	a quarter of the base (20 MHz) rate
+ * @NL80211_RATE_INFO_HE_MCS: HE MCS index (u8, 0-11)
+ * @NL80211_RATE_INFO_HE_NSS: HE NSS value (u8, 1-8)
+ * @NL80211_RATE_INFO_HE_GI: HE guard interval identifier
+ *	(u8, see &enum nl80211_he_gi)
+ * @NL80211_RATE_INFO_HE_DCM: HE DCM value (u8, 0/1)
+ * @NL80211_RATE_INFO_RU_ALLOC: HE RU allocation, if not present then
+ *	non-OFDMA was used (u8, see &enum nl80211_he_ru_alloc)
  * @__NL80211_RATE_INFO_AFTER_LAST: internal use
  */
 enum nl80211_rate_info {
@@ -2901,6 +2946,11 @@ enum nl80211_rate_info {
 	NL80211_RATE_INFO_160_MHZ_WIDTH,
 	NL80211_RATE_INFO_10_MHZ_WIDTH,
 	NL80211_RATE_INFO_5_MHZ_WIDTH,
+	NL80211_RATE_INFO_HE_MCS,
+	NL80211_RATE_INFO_HE_NSS,
+	NL80211_RATE_INFO_HE_GI,
+	NL80211_RATE_INFO_HE_DCM,
+	NL80211_RATE_INFO_HE_RU_ALLOC,
 
 	/* keep last */
 	__NL80211_RATE_INFO_AFTER_LAST,
@@ -3166,6 +3216,38 @@ enum nl80211_mpath_info {
 	NL80211_MPATH_INFO_MAX = __NL80211_MPATH_INFO_AFTER_LAST - 1
 };
 
+/**
+ * enum nl80211_band_iftype_attr - Interface type data attributes
+ *
+ * @__NL80211_BAND_IFTYPE_ATTR_INVALID: attribute number 0 is reserved
+ * @NL80211_BAND_IFTYPE_ATTR_IFTYPES: nested attribute containing a flag attribute
+ *     for each interface type that supports the band data
+ * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC: HE MAC capabilities as in HE
+ *     capabilities IE
+ * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY: HE PHY capabilities as in HE
+ *     capabilities IE
+ * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET: HE supported NSS/MCS as in HE
+ *     capabilities IE
+ * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE: HE PPE thresholds information as
+ *     defined in HE capabilities IE
+ * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band HE capability attribute currently
+ *     defined
+ * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use
+ */
+enum nl80211_band_iftype_attr {
+	__NL80211_BAND_IFTYPE_ATTR_INVALID,
+
+	NL80211_BAND_IFTYPE_ATTR_IFTYPES,
+	NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC,
+	NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY,
+	NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET,
+	NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE,
+
+	/* keep last */
+	__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST,
+	NL80211_BAND_IFTYPE_ATTR_MAX = __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST - 1
+};
+
 /**
  * enum nl80211_band_attr - band attributes
  * @__NL80211_BAND_ATTR_INVALID: attribute number 0 is reserved
@@ -3181,6 +3263,8 @@ enum nl80211_mpath_info {
  * @NL80211_BAND_ATTR_VHT_MCS_SET: 32-byte attribute containing the MCS set as
  *	defined in 802.11ac
  * @NL80211_BAND_ATTR_VHT_CAPA: VHT capabilities, as in the HT information IE
+ * @NL80211_BAND_ATTR_IFTYPE_DATA: nested array attribute, with each entry using
+ *	attributes from &enum nl80211_band_iftype_attr
  * @NL80211_BAND_ATTR_MAX: highest band attribute currently defined
  * @__NL80211_BAND_ATTR_AFTER_LAST: internal use
  */
@@ -3196,6 +3280,7 @@ enum nl80211_band_attr {
 
 	NL80211_BAND_ATTR_VHT_MCS_SET,
 	NL80211_BAND_ATTR_VHT_CAPA,
+	NL80211_BAND_ATTR_IFTYPE_DATA,
 
 	/* keep last */
 	__NL80211_BAND_ATTR_AFTER_LAST,
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 5fe35aafdd9c..d23abc619e77 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2006-2010		Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2015	Intel Deutschland GmbH
+ * Copyright 2015-2017	Intel Deutschland GmbH
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -744,6 +744,8 @@ int wiphy_register(struct wiphy *wiphy)
 
 	/* sanity check supported bands/channels */
 	for (band = 0; band < NUM_NL80211_BANDS; band++) {
+		u16 types = 0;
+
 		sband = wiphy->bands[band];
 		if (!sband)
 			continue;
@@ -788,6 +790,23 @@ int wiphy_register(struct wiphy *wiphy)
 			sband->channels[i].band = band;
 		}
 
+		for (i = 0; i < sband->n_iftype_data; i++) {
+			const struct ieee80211_sband_iftype_data *iftd;
+
+			iftd = &sband->iftype_data[i];
+
+			if (WARN_ON(!iftd->types_mask))
+				return -EINVAL;
+			if (WARN_ON(types & iftd->types_mask))
+				return -EINVAL;
+
+			/* at least one piece of information must be present */
+			if (WARN_ON(!iftd->he_cap.has_he))
+				return -EINVAL;
+
+			types |= iftd->types_mask;
+		}
+
 		have_band = true;
 	}
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 7b21914ae18b..0ccce338a66e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -428,6 +428,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 },
 	[NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 },
 	[NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
+	[NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY,
+					 .len = NL80211_HE_MAX_CAPABILITY_LEN },
 };
 
 /* policy for the key attributes */
@@ -1324,6 +1326,34 @@ static int nl80211_send_coalesce(struct sk_buff *msg,
 	return 0;
 }
 
+static int
+nl80211_send_iftype_data(struct sk_buff *msg,
+			 const struct ieee80211_sband_iftype_data *iftdata)
+{
+	const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap;
+
+	if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES,
+				iftdata->types_mask))
+		return -ENOBUFS;
+
+	if (he_cap->has_he) {
+		if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC,
+			    sizeof(he_cap->he_cap_elem.mac_cap_info),
+			    he_cap->he_cap_elem.mac_cap_info) ||
+		    nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY,
+			    sizeof(he_cap->he_cap_elem.phy_cap_info),
+			    he_cap->he_cap_elem.phy_cap_info) ||
+		    nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET,
+			    sizeof(he_cap->he_mcs_nss_supp),
+			    &he_cap->he_mcs_nss_supp) ||
+		    nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE,
+			    sizeof(he_cap->ppe_thres), he_cap->ppe_thres))
+			return -ENOBUFS;
+	}
+
+	return 0;
+}
+
 static int nl80211_send_band_rateinfo(struct sk_buff *msg,
 				      struct ieee80211_supported_band *sband)
 {
@@ -1353,6 +1383,32 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg,
 			 sband->vht_cap.cap)))
 		return -ENOBUFS;
 
+	if (sband->n_iftype_data) {
+		struct nlattr *nl_iftype_data =
+			nla_nest_start(msg, NL80211_BAND_ATTR_IFTYPE_DATA);
+		int err;
+
+		if (!nl_iftype_data)
+			return -ENOBUFS;
+
+		for (i = 0; i < sband->n_iftype_data; i++) {
+			struct nlattr *iftdata;
+
+			iftdata = nla_nest_start(msg, i + 1);
+			if (!iftdata)
+				return -ENOBUFS;
+
+			err = nl80211_send_iftype_data(msg,
+						       &sband->iftype_data[i]);
+			if (err)
+				return err;
+
+			nla_nest_end(msg, iftdata);
+		}
+
+		nla_nest_end(msg, nl_iftype_data);
+	}
+
 	/* add bitrates */
 	nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES);
 	if (!nl_rates)
@@ -4472,6 +4528,9 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
 	case RATE_INFO_BW_160:
 		rate_flg = NL80211_RATE_INFO_160_MHZ_WIDTH;
 		break;
+	case RATE_INFO_BW_HE_RU:
+		rate_flg = 0;
+		WARN_ON(!(info->flags & RATE_INFO_FLAGS_HE_MCS));
 	}
 
 	if (rate_flg && nla_put_flag(msg, rate_flg))
@@ -4491,6 +4550,19 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
 		if (info->flags & RATE_INFO_FLAGS_SHORT_GI &&
 		    nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI))
 			return false;
+	} else if (info->flags & RATE_INFO_FLAGS_HE_MCS) {
+		if (nla_put_u8(msg, NL80211_RATE_INFO_HE_MCS, info->mcs))
+			return false;
+		if (nla_put_u8(msg, NL80211_RATE_INFO_HE_NSS, info->nss))
+			return false;
+		if (nla_put_u8(msg, NL80211_RATE_INFO_HE_GI, info->he_gi))
+			return false;
+		if (nla_put_u8(msg, NL80211_RATE_INFO_HE_DCM, info->he_dcm))
+			return false;
+		if (info->bw == RATE_INFO_BW_HE_RU &&
+		    nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC,
+			       info->he_ru_alloc))
+			return false;
 	}
 
 	nla_nest_end(msg, rate);
@@ -4887,7 +4959,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
 			return -EINVAL;
 		if (params->supported_rates)
 			return -EINVAL;
-		if (params->ext_capab || params->ht_capa || params->vht_capa)
+		if (params->ext_capab || params->ht_capa || params->vht_capa ||
+		    params->he_capa)
 			return -EINVAL;
 	}
 
@@ -5093,6 +5166,15 @@ static int nl80211_set_station_tdls(struct genl_info *info,
 	if (info->attrs[NL80211_ATTR_VHT_CAPABILITY])
 		params->vht_capa =
 			nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
+	if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
+		params->he_capa =
+			nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+		params->he_capa_len =
+			nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+
+		if (params->he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
+			return -EINVAL;
+	}
 
 	err = nl80211_parse_sta_channel_info(info, params);
 	if (err)
@@ -5320,6 +5402,17 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		params.vht_capa =
 			nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
 
+	if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
+		params.he_capa =
+			nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+		params.he_capa_len =
+			nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+
+		/* max len is validated in nla policy */
+		if (params.he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
+			return -EINVAL;
+	}
+
 	if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
 		params.opmode_notif_used = true;
 		params.opmode_notif =
@@ -5352,6 +5445,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) {
 		params.ht_capa = NULL;
 		params.vht_capa = NULL;
+
+		/* HE requires WME */
+		if (params.he_capa_len)
+			return -EINVAL;
 	}
 
 	/* When you run into this, adjust the code below for the new flag */
diff --git a/net/wireless/util.c b/net/wireless/util.c
index b91597a8baa2..4ed06b271f32 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -4,6 +4,7 @@
  *
  * Copyright 2007-2009	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
+ * Copyright 2017	Intel Deutschland GmbH
  */
 #include <linux/export.h>
 #include <linux/bitops.h>
@@ -1142,6 +1143,85 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate)
 	return 0;
 }
 
+static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
+{
+#define SCALE 2048
+	u16 mcs_divisors[12] = {
+		34133, /* 16.666666... */
+		17067, /*  8.333333... */
+		11378, /*  5.555555... */
+		 8533, /*  4.166666... */
+		 5689, /*  2.777777... */
+		 4267, /*  2.083333... */
+		 3923, /*  1.851851... */
+		 3413, /*  1.666666... */
+		 2844, /*  1.388888... */
+		 2560, /*  1.250000... */
+		 2276, /*  1.111111... */
+		 2048, /*  1.000000... */
+	};
+	u32 rates_160M[3] = { 960777777, 907400000, 816666666 };
+	u32 rates_969[3] =  { 480388888, 453700000, 408333333 };
+	u32 rates_484[3] =  { 229411111, 216666666, 195000000 };
+	u32 rates_242[3] =  { 114711111, 108333333,  97500000 };
+	u32 rates_106[3] =  {  40000000,  37777777,  34000000 };
+	u32 rates_52[3]  =  {  18820000,  17777777,  16000000 };
+	u32 rates_26[3]  =  {   9411111,   8888888,   8000000 };
+	u64 tmp;
+	u32 result;
+
+	if (WARN_ON_ONCE(rate->mcs > 11))
+		return 0;
+
+	if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2))
+		return 0;
+	if (WARN_ON_ONCE(rate->he_ru_alloc >
+			 NL80211_RATE_INFO_HE_RU_ALLOC_2x996))
+		return 0;
+	if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8))
+		return 0;
+
+	if (rate->bw == RATE_INFO_BW_160)
+		result = rates_160M[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_80 ||
+		 (rate->bw == RATE_INFO_BW_HE_RU &&
+		  rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996))
+		result = rates_969[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_40 ||
+		 (rate->bw == RATE_INFO_BW_HE_RU &&
+		  rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484))
+		result = rates_484[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_20 ||
+		 (rate->bw == RATE_INFO_BW_HE_RU &&
+		  rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242))
+		result = rates_242[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_HE_RU &&
+		 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106)
+		result = rates_106[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_HE_RU &&
+		 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52)
+		result = rates_52[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_HE_RU &&
+		 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26)
+		result = rates_26[rate->he_gi];
+	else if (WARN(1, "invalid HE MCS: bw:%d, ru:%d\n",
+		      rate->bw, rate->he_ru_alloc))
+		return 0;
+
+	/* now scale to the appropriate MCS */
+	tmp = result;
+	tmp *= SCALE;
+	do_div(tmp, mcs_divisors[rate->mcs]);
+	result = tmp;
+
+	/* and take NSS, DCM into account */
+	result = (result * rate->nss) / 8;
+	if (rate->he_dcm)
+		result /= 2;
+
+	return result;
+}
+
 u32 cfg80211_calculate_bitrate(struct rate_info *rate)
 {
 	if (rate->flags & RATE_INFO_FLAGS_MCS)
@@ -1150,6 +1230,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate)
 		return cfg80211_calculate_bitrate_60g(rate);
 	if (rate->flags & RATE_INFO_FLAGS_VHT_MCS)
 		return cfg80211_calculate_bitrate_vht(rate);
+	if (rate->flags & RATE_INFO_FLAGS_HE_MCS)
+		return cfg80211_calculate_bitrate_he(rate);
 
 	return rate->legacy;
 }
-- 
cgit v1.2.3


From eb3744a2dd01cb07ce9f556d56d6fe451f0c313a Mon Sep 17 00:00:00 2001
From: Keerthy <j-keerthy@ti.com>
Date: Wed, 13 Jun 2018 09:10:37 +0530
Subject: gpio: davinci: Do not assume continuous IRQ numbering

Currently the driver assumes that the interrupts are continuous
and does platform_get_irq only once and assumes the rest are continuous,
instead call platform_get_irq for all the interrupts and store them
in an array for later use.

Signed-off-by: Keerthy <j-keerthy@ti.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-davinci.c                | 63 ++++++++++++++++++++----------
 include/linux/platform_data/gpio-davinci.h |  3 +-
 2 files changed, 44 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-davinci.c b/drivers/gpio/gpio-davinci.c
index 423d37c95e6b..a5ece8ea79bc 100644
--- a/drivers/gpio/gpio-davinci.c
+++ b/drivers/gpio/gpio-davinci.c
@@ -55,7 +55,7 @@ static inline struct davinci_gpio_regs __iomem *irq2regs(struct irq_data *d)
 	return g;
 }
 
-static int davinci_gpio_irq_setup(struct platform_device *pdev, int bank_irq);
+static int davinci_gpio_irq_setup(struct platform_device *pdev);
 
 /*--------------------------------------------------------------------------*/
 
@@ -167,8 +167,8 @@ of_err:
 static int davinci_gpio_probe(struct platform_device *pdev)
 {
 	static int ctrl_num, bank_base;
-	int gpio, bank, bank_irq, ret = 0;
-	unsigned ngpio, nbank;
+	int gpio, bank, i, ret = 0;
+	unsigned int ngpio, nbank, nirq;
 	struct davinci_gpio_controller *chips;
 	struct davinci_gpio_platform_data *pdata;
 	struct device *dev = &pdev->dev;
@@ -197,6 +197,16 @@ static int davinci_gpio_probe(struct platform_device *pdev)
 	if (WARN_ON(ARCH_NR_GPIOS < ngpio))
 		ngpio = ARCH_NR_GPIOS;
 
+	/*
+	 * If there are unbanked interrupts then the number of
+	 * interrupts is equal to number of gpios else all are banked so
+	 * number of interrupts is equal to number of banks(each with 16 gpios)
+	 */
+	if (pdata->gpio_unbanked)
+		nirq = pdata->gpio_unbanked;
+	else
+		nirq = DIV_ROUND_UP(ngpio, 16);
+
 	nbank = DIV_ROUND_UP(ngpio, 32);
 	chips = devm_kcalloc(dev,
 			     nbank, sizeof(struct davinci_gpio_controller),
@@ -209,10 +219,13 @@ static int davinci_gpio_probe(struct platform_device *pdev)
 	if (IS_ERR(gpio_base))
 		return PTR_ERR(gpio_base);
 
-	bank_irq = platform_get_irq(pdev, 0);
-	if (bank_irq < 0) {
-		dev_dbg(dev, "IRQ not populated\n");
-		return bank_irq;
+	for (i = 0; i < nirq; i++) {
+		chips->irqs[i] = platform_get_irq(pdev, i);
+		if (chips->irqs[i] < 0) {
+			dev_info(dev, "IRQ not populated, err = %d\n",
+				 chips->irqs[i]);
+			return chips->irqs[i];
+		}
 	}
 
 	snprintf(label, MAX_LABEL_SIZE, "davinci_gpio.%d", ctrl_num++);
@@ -249,7 +262,7 @@ static int davinci_gpio_probe(struct platform_device *pdev)
 		goto err;
 
 	platform_set_drvdata(pdev, chips);
-	ret = davinci_gpio_irq_setup(pdev, bank_irq);
+	ret = davinci_gpio_irq_setup(pdev);
 	if (ret)
 		goto err;
 
@@ -383,7 +396,7 @@ static int gpio_to_irq_unbanked(struct gpio_chip *chip, unsigned offset)
 	 * can provide direct-mapped IRQs to AINTC (up to 32 GPIOs).
 	 */
 	if (offset < d->gpio_unbanked)
-		return d->base_irq + offset;
+		return d->irqs[offset];
 	else
 		return -ENODEV;
 }
@@ -392,11 +405,18 @@ static int gpio_irq_type_unbanked(struct irq_data *data, unsigned trigger)
 {
 	struct davinci_gpio_controller *d;
 	struct davinci_gpio_regs __iomem *g;
-	u32 mask;
+	u32 mask, i;
 
 	d = (struct davinci_gpio_controller *)irq_data_get_irq_handler_data(data);
 	g = (struct davinci_gpio_regs __iomem *)d->regs[0];
-	mask = __gpio_mask(data->irq - d->base_irq);
+	for (i = 0; i < MAX_INT_PER_BANK; i++)
+		if (data->irq == d->irqs[i])
+			break;
+
+	if (i == MAX_INT_PER_BANK)
+		return -EINVAL;
+
+	mask = __gpio_mask(i);
 
 	if (trigger & ~(IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING))
 		return -EINVAL;
@@ -458,7 +478,7 @@ static const struct of_device_id davinci_gpio_ids[];
  * (dm6446) can be set appropriately for GPIOV33 pins.
  */
 
-static int davinci_gpio_irq_setup(struct platform_device *pdev, int bank_irq)
+static int davinci_gpio_irq_setup(struct platform_device *pdev)
 {
 	unsigned	gpio, bank;
 	int		irq;
@@ -492,6 +512,7 @@ static int davinci_gpio_irq_setup(struct platform_device *pdev, int bank_irq)
 		dev_err(dev, "Error %ld getting gpio clock\n", PTR_ERR(clk));
 		return PTR_ERR(clk);
 	}
+
 	ret = clk_prepare_enable(clk);
 	if (ret)
 		return ret;
@@ -531,12 +552,11 @@ static int davinci_gpio_irq_setup(struct platform_device *pdev, int bank_irq)
 	if (pdata->gpio_unbanked) {
 		/* pass "bank 0" GPIO IRQs to AINTC */
 		chips->chip.to_irq = gpio_to_irq_unbanked;
-		chips->base_irq = bank_irq;
 		chips->gpio_unbanked = pdata->gpio_unbanked;
 		binten = GENMASK(pdata->gpio_unbanked / 16, 0);
 
 		/* AINTC handles mask/unmask; GPIO handles triggering */
-		irq = bank_irq;
+		irq = chips->irqs[0];
 		irq_chip = gpio_get_irq_chip(irq);
 		irq_chip->name = "GPIO-AINTC";
 		irq_chip->irq_set_type = gpio_irq_type_unbanked;
@@ -547,10 +567,11 @@ static int davinci_gpio_irq_setup(struct platform_device *pdev, int bank_irq)
 		writel_relaxed(~0, &g->set_rising);
 
 		/* set the direct IRQs up to use that irqchip */
-		for (gpio = 0; gpio < pdata->gpio_unbanked; gpio++, irq++) {
-			irq_set_chip(irq, irq_chip);
-			irq_set_handler_data(irq, chips);
-			irq_set_status_flags(irq, IRQ_TYPE_EDGE_BOTH);
+		for (gpio = 0; gpio < pdata->gpio_unbanked; gpio++) {
+			irq_set_chip(chips->irqs[gpio], irq_chip);
+			irq_set_handler_data(chips->irqs[gpio], chips);
+			irq_set_status_flags(chips->irqs[gpio],
+					     IRQ_TYPE_EDGE_BOTH);
 		}
 
 		goto done;
@@ -560,7 +581,7 @@ static int davinci_gpio_irq_setup(struct platform_device *pdev, int bank_irq)
 	 * Or, AINTC can handle IRQs for banks of 16 GPIO IRQs, which we
 	 * then chain through our own handler.
 	 */
-	for (gpio = 0, bank = 0; gpio < ngpio; bank++, bank_irq++, gpio += 16) {
+	for (gpio = 0, bank = 0; gpio < ngpio; bank++, gpio += 16) {
 		/* disabled by default, enabled only as needed
 		 * There are register sets for 32 GPIOs. 2 banks of 16
 		 * GPIOs are covered by each set of registers hence divide by 2
@@ -587,8 +608,8 @@ static int davinci_gpio_irq_setup(struct platform_device *pdev, int bank_irq)
 		irqdata->bank_num = bank;
 		irqdata->chip = chips;
 
-		irq_set_chained_handler_and_data(bank_irq, gpio_irq_handler,
-						 irqdata);
+		irq_set_chained_handler_and_data(chips->irqs[bank],
+						 gpio_irq_handler, irqdata);
 
 		binten |= BIT(bank);
 	}
diff --git a/include/linux/platform_data/gpio-davinci.h b/include/linux/platform_data/gpio-davinci.h
index 90ae19ca828f..57a5a35e0073 100644
--- a/include/linux/platform_data/gpio-davinci.h
+++ b/include/linux/platform_data/gpio-davinci.h
@@ -22,6 +22,7 @@
 #include <asm-generic/gpio.h>
 
 #define MAX_REGS_BANKS		5
+#define MAX_INT_PER_BANK 32
 
 struct davinci_gpio_platform_data {
 	u32	ngpio;
@@ -41,7 +42,7 @@ struct davinci_gpio_controller {
 	spinlock_t		lock;
 	void __iomem		*regs[MAX_REGS_BANKS];
 	int			gpio_unbanked;
-	unsigned int		base_irq;
+	int			irqs[MAX_INT_PER_BANK];
 	unsigned int		base;
 };
 
-- 
cgit v1.2.3


From 90b39402e9f31c4aab48dc1a43d85a724065793f Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 1 Jun 2018 13:21:27 +0200
Subject: gpio: Add API to explicitly name a consumer

The GPIO (descriptor) API registers a "label" naming what is
currently using the GPIO line. Typically this is taken from
things like the device tree node, so "reset-gpios" will result
in he line being labeled "reset".

The technical effect is pretty much zero: the use is for
debug and introspection, such as "lsgpio" and debugfs files.

However sometimes the user want this cuddly feeling of
listing all GPIO lines and seeing exactly what they are for
and it gives a very fulfilling sense of control. Especially
in the cases when the device tree node doesn't provide a
good name, or anonymous GPIO lines assigned just to
"gpios" in the device tree because the usage is implicit.

For these cases it may be nice to be able to label the
line directly and explicitly.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c        | 13 +++++++++++++
 include/linux/gpio/consumer.h |  7 +++++++
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index e11a3bb03820..c6f77e806cb8 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -3193,6 +3193,19 @@ int gpiod_cansleep(const struct gpio_desc *desc)
 }
 EXPORT_SYMBOL_GPL(gpiod_cansleep);
 
+/**
+ * gpiod_set_consumer_name() - set the consumer name for the descriptor
+ * @desc: gpio to set the consumer name on
+ * @name: the new consumer name
+ */
+void gpiod_set_consumer_name(struct gpio_desc *desc, const char *name)
+{
+	VALIDATE_DESC_VOID(desc);
+	/* Just overwrite whatever the previous name was */
+	desc->label = name;
+}
+EXPORT_SYMBOL_GPL(gpiod_set_consumer_name);
+
 /**
  * gpiod_to_irq() - return the IRQ corresponding to a GPIO
  * @desc: gpio whose IRQ will be returned (already requested)
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 243112c7fa7d..e8aaf34dd65d 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -145,6 +145,7 @@ int gpiod_is_active_low(const struct gpio_desc *desc);
 int gpiod_cansleep(const struct gpio_desc *desc);
 
 int gpiod_to_irq(const struct gpio_desc *desc);
+void gpiod_set_consumer_name(struct gpio_desc *desc, const char *name);
 
 /* Convert between the old gpio_ and new gpiod_ interfaces */
 struct gpio_desc *gpio_to_desc(unsigned gpio);
@@ -467,6 +468,12 @@ static inline int gpiod_to_irq(const struct gpio_desc *desc)
 	return -EINVAL;
 }
 
+static inline void gpiod_set_consumer_name(struct gpio_desc *desc, const char *name)
+{
+	/* GPIO can never have been requested */
+	WARN_ON(1);
+}
+
 static inline struct gpio_desc *gpio_to_desc(unsigned gpio)
 {
 	return ERR_PTR(-EINVAL);
-- 
cgit v1.2.3


From 297101ab85841319aac2c7843ca755d650c1964f Mon Sep 17 00:00:00 2001
From: Stefan Wahren <stefan.wahren@i2se.com>
Date: Fri, 15 Jun 2018 13:44:53 +0200
Subject: regulator: pfuze100: add pfuze3001 support

This extends the pfuze100 driver with pfuze3001 support.

Latest datasheet:
https://www.nxp.com/docs/en/data-sheet/PF3001.pdf

Signed-off-by: Robin Gong <yibin.gong@nxp.com>
Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/Kconfig              |  4 +-
 drivers/regulator/pfuze100-regulator.c | 78 +++++++++++++++++++++++++++++++---
 include/linux/regulator/pfuze100.h     | 11 +++++
 3 files changed, 84 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 5dbccf5f3037..2964eaea94c0 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -633,12 +633,12 @@ config REGULATOR_PCF50633
 	 on PCF50633
 
 config REGULATOR_PFUZE100
-	tristate "Freescale PFUZE100/200/3000 regulator driver"
+	tristate "Freescale PFUZE100/200/3000/3001 regulator driver"
 	depends on I2C
 	select REGMAP_I2C
 	help
 	  Say y here to support the regulators found on the Freescale
-	  PFUZE100/200/3000 PMIC.
+	  PFUZE100/200/3000/3001 PMIC.
 
 config REGULATOR_PV88060
 	tristate "Powerventure Semiconductor PV88060 regulator"
diff --git a/drivers/regulator/pfuze100-regulator.c b/drivers/regulator/pfuze100-regulator.c
index 8d9dbcc775ea..32f9af7f87c4 100644
--- a/drivers/regulator/pfuze100-regulator.c
+++ b/drivers/regulator/pfuze100-regulator.c
@@ -44,7 +44,7 @@
 #define PFUZE100_VGEN5VOL	0x70
 #define PFUZE100_VGEN6VOL	0x71
 
-enum chips { PFUZE100, PFUZE200, PFUZE3000 = 3 };
+enum chips { PFUZE100, PFUZE200, PFUZE3000 = 3, PFUZE3001 = 0x31, };
 
 struct pfuze_regulator {
 	struct regulator_desc desc;
@@ -92,6 +92,7 @@ static const struct i2c_device_id pfuze_device_id[] = {
 	{.name = "pfuze100", .driver_data = PFUZE100},
 	{.name = "pfuze200", .driver_data = PFUZE200},
 	{.name = "pfuze3000", .driver_data = PFUZE3000},
+	{.name = "pfuze3001", .driver_data = PFUZE3001},
 	{ }
 };
 MODULE_DEVICE_TABLE(i2c, pfuze_device_id);
@@ -100,6 +101,7 @@ static const struct of_device_id pfuze_dt_ids[] = {
 	{ .compatible = "fsl,pfuze100", .data = (void *)PFUZE100},
 	{ .compatible = "fsl,pfuze200", .data = (void *)PFUZE200},
 	{ .compatible = "fsl,pfuze3000", .data = (void *)PFUZE3000},
+	{ .compatible = "fsl,pfuze3001", .data = (void *)PFUZE3001},
 	{ }
 };
 MODULE_DEVICE_TABLE(of, pfuze_dt_ids);
@@ -108,10 +110,28 @@ static int pfuze100_set_ramp_delay(struct regulator_dev *rdev, int ramp_delay)
 {
 	struct pfuze_chip *pfuze100 = rdev_get_drvdata(rdev);
 	int id = rdev_get_id(rdev);
+	bool reg_has_ramp_delay;
 	unsigned int ramp_bits;
 	int ret;
 
-	if (id < PFUZE100_SWBST) {
+	switch (pfuze100->chip_id) {
+	case PFUZE3001:
+		/* no dynamic voltage scaling for PF3001 */
+		reg_has_ramp_delay = false;
+		break;
+	case PFUZE3000:
+		reg_has_ramp_delay = (id < PFUZE3000_SWBST);
+		break;
+	case PFUZE200:
+		reg_has_ramp_delay = (id < PFUZE200_SWBST);
+		break;
+	case PFUZE100:
+	default:
+		reg_has_ramp_delay = (id < PFUZE100_SWBST);
+		break;
+	}
+
+	if (reg_has_ramp_delay) {
 		ramp_delay = 12500 / ramp_delay;
 		ramp_bits = (ramp_delay >> 1) - (ramp_delay >> 3);
 		ret = regmap_update_bits(pfuze100->regmap,
@@ -119,8 +139,9 @@ static int pfuze100_set_ramp_delay(struct regulator_dev *rdev, int ramp_delay)
 					 0xc0, ramp_bits << 6);
 		if (ret < 0)
 			dev_err(pfuze100->dev, "ramp failed, err %d\n", ret);
-	} else
+	} else {
 		ret = -EACCES;
+	}
 
 	return ret;
 }
@@ -361,6 +382,19 @@ static struct pfuze_regulator pfuze3000_regulators[] = {
 	PFUZE100_VGEN_REG(PFUZE3000, VLDO4, PFUZE100_VGEN6VOL, 1800000, 3300000, 100000),
 };
 
+static struct pfuze_regulator pfuze3001_regulators[] = {
+	PFUZE100_SWB_REG(PFUZE3001, SW1, PFUZE100_SW1ABVOL, 0x1f, pfuze3000_sw1a),
+	PFUZE100_SWB_REG(PFUZE3001, SW2, PFUZE100_SW2VOL, 0x7, pfuze3000_sw2lo),
+	PFUZE3000_SW3_REG(PFUZE3001, SW3, PFUZE100_SW3AVOL, 900000, 1650000, 50000),
+	PFUZE100_SWB_REG(PFUZE3001, VSNVS, PFUZE100_VSNVSVOL, 0x7, pfuze100_vsnvs),
+	PFUZE100_VGEN_REG(PFUZE3001, VLDO1, PFUZE100_VGEN1VOL, 1800000, 3300000, 100000),
+	PFUZE100_VGEN_REG(PFUZE3001, VLDO2, PFUZE100_VGEN2VOL, 800000, 1550000, 50000),
+	PFUZE3000_VCC_REG(PFUZE3001, VCCSD, PFUZE100_VGEN3VOL, 2850000, 3300000, 150000),
+	PFUZE3000_VCC_REG(PFUZE3001, V33, PFUZE100_VGEN4VOL, 2850000, 3300000, 150000),
+	PFUZE100_VGEN_REG(PFUZE3001, VLDO3, PFUZE100_VGEN5VOL, 1800000, 3300000, 100000),
+	PFUZE100_VGEN_REG(PFUZE3001, VLDO4, PFUZE100_VGEN6VOL, 1800000, 3300000, 100000),
+};
+
 #ifdef CONFIG_OF
 /* PFUZE100 */
 static struct of_regulator_match pfuze100_matches[] = {
@@ -418,6 +452,21 @@ static struct of_regulator_match pfuze3000_matches[] = {
 	{ .name = "vldo4",	},
 };
 
+/* PFUZE3001 */
+static struct of_regulator_match pfuze3001_matches[] = {
+
+	{ .name = "sw1",	},
+	{ .name = "sw2",	},
+	{ .name = "sw3",	},
+	{ .name = "vsnvs",	},
+	{ .name = "vldo1",	},
+	{ .name = "vldo2",	},
+	{ .name = "vccsd",	},
+	{ .name = "v33",	},
+	{ .name = "vldo3",	},
+	{ .name = "vldo4",	},
+};
+
 static struct of_regulator_match *pfuze_matches;
 
 static int pfuze_parse_regulators_dt(struct pfuze_chip *chip)
@@ -437,6 +486,11 @@ static int pfuze_parse_regulators_dt(struct pfuze_chip *chip)
 	}
 
 	switch (chip->chip_id) {
+	case PFUZE3001:
+		pfuze_matches = pfuze3001_matches;
+		ret = of_regulator_match(dev, parent, pfuze3001_matches,
+					 ARRAY_SIZE(pfuze3001_matches));
+		break;
 	case PFUZE3000:
 		pfuze_matches = pfuze3000_matches;
 		ret = of_regulator_match(dev, parent, pfuze3000_matches,
@@ -508,7 +562,8 @@ static int pfuze_identify(struct pfuze_chip *pfuze_chip)
 		 */
 		dev_info(pfuze_chip->dev, "Assuming misprogrammed ID=0x8");
 	} else if ((value & 0x0f) != pfuze_chip->chip_id &&
-		   (value & 0xf0) >> 4 != pfuze_chip->chip_id) {
+		   (value & 0xf0) >> 4 != pfuze_chip->chip_id &&
+		   (value != pfuze_chip->chip_id)) {
 		/* device id NOT match with your setting */
 		dev_warn(pfuze_chip->dev, "Illegal ID: %x\n", value);
 		return -ENODEV;
@@ -588,6 +643,13 @@ static int pfuze100_regulator_probe(struct i2c_client *client,
 
 	/* use the right regulators after identify the right device */
 	switch (pfuze_chip->chip_id) {
+	case PFUZE3001:
+		pfuze_chip->pfuze_regulators = pfuze3001_regulators;
+		regulator_num = ARRAY_SIZE(pfuze3001_regulators);
+		sw_check_start = PFUZE3001_SW2;
+		sw_check_end = PFUZE3001_SW2;
+		sw_hi = 1 << 3;
+		break;
 	case PFUZE3000:
 		pfuze_chip->pfuze_regulators = pfuze3000_regulators;
 		regulator_num = ARRAY_SIZE(pfuze3000_regulators);
@@ -611,7 +673,8 @@ static int pfuze100_regulator_probe(struct i2c_client *client,
 	}
 	dev_info(&client->dev, "pfuze%s found.\n",
 		(pfuze_chip->chip_id == PFUZE100) ? "100" :
-		((pfuze_chip->chip_id == PFUZE200) ? "200" : "3000"));
+		(((pfuze_chip->chip_id == PFUZE200) ? "200" :
+		((pfuze_chip->chip_id == PFUZE3000) ? "3000" : "3001"))));
 
 	memcpy(pfuze_chip->regulator_descs, pfuze_chip->pfuze_regulators,
 		sizeof(pfuze_chip->regulator_descs));
@@ -636,7 +699,8 @@ static int pfuze100_regulator_probe(struct i2c_client *client,
 		if (i >= sw_check_start && i <= sw_check_end) {
 			regmap_read(pfuze_chip->regmap, desc->vsel_reg, &val);
 			if (val & sw_hi) {
-				if (pfuze_chip->chip_id == PFUZE3000) {
+				if (pfuze_chip->chip_id == PFUZE3000 ||
+					pfuze_chip->chip_id == PFUZE3001) {
 					desc->volt_table = pfuze3000_sw2hi;
 					desc->n_voltages = ARRAY_SIZE(pfuze3000_sw2hi);
 				} else {
@@ -675,5 +739,5 @@ static struct i2c_driver pfuze_driver = {
 module_i2c_driver(pfuze_driver);
 
 MODULE_AUTHOR("Robin Gong <b38343@freescale.com>");
-MODULE_DESCRIPTION("Regulator Driver for Freescale PFUZE100/200/3000 PMIC");
+MODULE_DESCRIPTION("Regulator Driver for Freescale PFUZE100/200/3000/3001 PMIC");
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/regulator/pfuze100.h b/include/linux/regulator/pfuze100.h
index e0ccf46f66cf..cb5aecd40f07 100644
--- a/include/linux/regulator/pfuze100.h
+++ b/include/linux/regulator/pfuze100.h
@@ -64,6 +64,17 @@
 #define PFUZE3000_VLDO3		11
 #define PFUZE3000_VLDO4		12
 
+#define PFUZE3001_SW1		0
+#define PFUZE3001_SW2		1
+#define PFUZE3001_SW3		2
+#define PFUZE3001_VSNVS		3
+#define PFUZE3001_VLDO1		4
+#define PFUZE3001_VLDO2		5
+#define PFUZE3001_VCCSD		6
+#define PFUZE3001_V33		7
+#define PFUZE3001_VLDO3		8
+#define PFUZE3001_VLDO4		9
+
 struct regulator_init_data;
 
 struct pfuze_regulator_platform_data {
-- 
cgit v1.2.3


From 7bd0c7ba62e8a9840f15fc4ff0122b29fe1b6413 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Thu, 7 Jun 2018 16:51:38 -0700
Subject: regulator: Fix typo in comment of struct regulator_linear_range

regulator_map_linar_range() => regulator_map_linear_range()

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index fc2dc8df476f..dea96ee39fdc 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -46,7 +46,7 @@ enum regulator_status {
 /**
  * struct regulator_linear_range - specify linear voltage ranges
  *
- * Specify a range of voltages for regulator_map_linar_range() and
+ * Specify a range of voltages for regulator_map_linear_range() and
  * regulator_list_linear_range().
  *
  * @min_uV:  Lowest voltage in range
-- 
cgit v1.2.3


From 3e247b0fb540a9a503a71ef6b918a3443e49655f Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 7 Jun 2018 19:40:32 +0000
Subject: spi: remove unused adi_spi3.h header

include/linux/spi/adi_spi3.h is unused since commit 47838669de9d ("spi: remove blackfin related host drivers")
Finish the cleaning by removing it.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/adi_spi3.h | 254 -------------------------------------------
 1 file changed, 254 deletions(-)
 delete mode 100644 include/linux/spi/adi_spi3.h

(limited to 'include/linux')

diff --git a/include/linux/spi/adi_spi3.h b/include/linux/spi/adi_spi3.h
deleted file mode 100644
index c84123aa1d06..000000000000
--- a/include/linux/spi/adi_spi3.h
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Analog Devices SPI3 controller driver
- *
- * Copyright (c) 2014 Analog Devices Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#ifndef _ADI_SPI3_H_
-#define _ADI_SPI3_H_
-
-#include <linux/types.h>
-
-/* SPI_CONTROL */
-#define SPI_CTL_EN                  0x00000001    /* Enable */
-#define SPI_CTL_MSTR                0x00000002    /* Master/Slave */
-#define SPI_CTL_PSSE                0x00000004    /* controls modf error in master mode */
-#define SPI_CTL_ODM                 0x00000008    /* Open Drain Mode */
-#define SPI_CTL_CPHA                0x00000010    /* Clock Phase */
-#define SPI_CTL_CPOL                0x00000020    /* Clock Polarity */
-#define SPI_CTL_ASSEL               0x00000040    /* Slave Select Pin Control */
-#define SPI_CTL_SELST               0x00000080    /* Slave Select Polarity in-between transfers */
-#define SPI_CTL_EMISO               0x00000100    /* Enable MISO */
-#define SPI_CTL_SIZE                0x00000600    /* Word Transfer Size */
-#define SPI_CTL_SIZE08              0x00000000    /* SIZE: 8 bits */
-#define SPI_CTL_SIZE16              0x00000200    /* SIZE: 16 bits */
-#define SPI_CTL_SIZE32              0x00000400    /* SIZE: 32 bits */
-#define SPI_CTL_LSBF                0x00001000    /* LSB First */
-#define SPI_CTL_FCEN                0x00002000    /* Flow-Control Enable */
-#define SPI_CTL_FCCH                0x00004000    /* Flow-Control Channel Selection */
-#define SPI_CTL_FCPL                0x00008000    /* Flow-Control Polarity */
-#define SPI_CTL_FCWM                0x00030000    /* Flow-Control Water-Mark */
-#define SPI_CTL_FIFO0               0x00000000    /* FCWM: TFIFO empty or RFIFO Full */
-#define SPI_CTL_FIFO1               0x00010000    /* FCWM: TFIFO 75% or more empty or RFIFO 75% or more full */
-#define SPI_CTL_FIFO2               0x00020000    /* FCWM: TFIFO 50% or more empty or RFIFO 50% or more full */
-#define SPI_CTL_FMODE               0x00040000    /* Fast-mode Enable */
-#define SPI_CTL_MIOM                0x00300000    /* Multiple I/O Mode */
-#define SPI_CTL_MIO_DIS             0x00000000    /* MIOM: Disable */
-#define SPI_CTL_MIO_DUAL            0x00100000    /* MIOM: Enable DIOM (Dual I/O Mode) */
-#define SPI_CTL_MIO_QUAD            0x00200000    /* MIOM: Enable QUAD (Quad SPI Mode) */
-#define SPI_CTL_SOSI                0x00400000    /* Start on MOSI */
-/* SPI_RX_CONTROL */
-#define SPI_RXCTL_REN               0x00000001    /* Receive Channel Enable */
-#define SPI_RXCTL_RTI               0x00000004    /* Receive Transfer Initiate */
-#define SPI_RXCTL_RWCEN             0x00000008    /* Receive Word Counter Enable */
-#define SPI_RXCTL_RDR               0x00000070    /* Receive Data Request */
-#define SPI_RXCTL_RDR_DIS           0x00000000    /* RDR: Disabled */
-#define SPI_RXCTL_RDR_NE            0x00000010    /* RDR: RFIFO not empty */
-#define SPI_RXCTL_RDR_25            0x00000020    /* RDR: RFIFO 25% full */
-#define SPI_RXCTL_RDR_50            0x00000030    /* RDR: RFIFO 50% full */
-#define SPI_RXCTL_RDR_75            0x00000040    /* RDR: RFIFO 75% full */
-#define SPI_RXCTL_RDR_FULL          0x00000050    /* RDR: RFIFO full */
-#define SPI_RXCTL_RDO               0x00000100    /* Receive Data Over-Run */
-#define SPI_RXCTL_RRWM              0x00003000    /* FIFO Regular Water-Mark */
-#define SPI_RXCTL_RWM_0             0x00000000    /* RRWM: RFIFO Empty */
-#define SPI_RXCTL_RWM_25            0x00001000    /* RRWM: RFIFO 25% full */
-#define SPI_RXCTL_RWM_50            0x00002000    /* RRWM: RFIFO 50% full */
-#define SPI_RXCTL_RWM_75            0x00003000    /* RRWM: RFIFO 75% full */
-#define SPI_RXCTL_RUWM              0x00070000    /* FIFO Urgent Water-Mark */
-#define SPI_RXCTL_UWM_DIS           0x00000000    /* RUWM: Disabled */
-#define SPI_RXCTL_UWM_25            0x00010000    /* RUWM: RFIFO 25% full */
-#define SPI_RXCTL_UWM_50            0x00020000    /* RUWM: RFIFO 50% full */
-#define SPI_RXCTL_UWM_75            0x00030000    /* RUWM: RFIFO 75% full */
-#define SPI_RXCTL_UWM_FULL          0x00040000    /* RUWM: RFIFO full */
-/* SPI_TX_CONTROL */
-#define SPI_TXCTL_TEN               0x00000001    /* Transmit Channel Enable */
-#define SPI_TXCTL_TTI               0x00000004    /* Transmit Transfer Initiate */
-#define SPI_TXCTL_TWCEN             0x00000008    /* Transmit Word Counter Enable */
-#define SPI_TXCTL_TDR               0x00000070    /* Transmit Data Request */
-#define SPI_TXCTL_TDR_DIS           0x00000000    /* TDR: Disabled */
-#define SPI_TXCTL_TDR_NF            0x00000010    /* TDR: TFIFO not full */
-#define SPI_TXCTL_TDR_25            0x00000020    /* TDR: TFIFO 25% empty */
-#define SPI_TXCTL_TDR_50            0x00000030    /* TDR: TFIFO 50% empty */
-#define SPI_TXCTL_TDR_75            0x00000040    /* TDR: TFIFO 75% empty */
-#define SPI_TXCTL_TDR_EMPTY         0x00000050    /* TDR: TFIFO empty */
-#define SPI_TXCTL_TDU               0x00000100    /* Transmit Data Under-Run */
-#define SPI_TXCTL_TRWM              0x00003000    /* FIFO Regular Water-Mark */
-#define SPI_TXCTL_RWM_FULL          0x00000000    /* TRWM: TFIFO full */
-#define SPI_TXCTL_RWM_25            0x00001000    /* TRWM: TFIFO 25% empty */
-#define SPI_TXCTL_RWM_50            0x00002000    /* TRWM: TFIFO 50% empty */
-#define SPI_TXCTL_RWM_75            0x00003000    /* TRWM: TFIFO 75% empty */
-#define SPI_TXCTL_TUWM              0x00070000    /* FIFO Urgent Water-Mark */
-#define SPI_TXCTL_UWM_DIS           0x00000000    /* TUWM: Disabled */
-#define SPI_TXCTL_UWM_25            0x00010000    /* TUWM: TFIFO 25% empty */
-#define SPI_TXCTL_UWM_50            0x00020000    /* TUWM: TFIFO 50% empty */
-#define SPI_TXCTL_UWM_75            0x00030000    /* TUWM: TFIFO 75% empty */
-#define SPI_TXCTL_UWM_EMPTY         0x00040000    /* TUWM: TFIFO empty */
-/* SPI_CLOCK */
-#define SPI_CLK_BAUD                0x0000FFFF    /* Baud Rate */
-/* SPI_DELAY */
-#define SPI_DLY_STOP                0x000000FF    /* Transfer delay time in multiples of SCK period */
-#define SPI_DLY_LEADX               0x00000100    /* Extended (1 SCK) LEAD Control */
-#define SPI_DLY_LAGX                0x00000200    /* Extended (1 SCK) LAG control */
-/* SPI_SSEL */
-#define SPI_SLVSEL_SSE1             0x00000002    /* SPISSEL1 Enable */
-#define SPI_SLVSEL_SSE2             0x00000004    /* SPISSEL2 Enable */
-#define SPI_SLVSEL_SSE3             0x00000008    /* SPISSEL3 Enable */
-#define SPI_SLVSEL_SSE4             0x00000010    /* SPISSEL4 Enable */
-#define SPI_SLVSEL_SSE5             0x00000020    /* SPISSEL5 Enable */
-#define SPI_SLVSEL_SSE6             0x00000040    /* SPISSEL6 Enable */
-#define SPI_SLVSEL_SSE7             0x00000080    /* SPISSEL7 Enable */
-#define SPI_SLVSEL_SSEL1            0x00000200    /* SPISSEL1 Value */
-#define SPI_SLVSEL_SSEL2            0x00000400    /* SPISSEL2 Value */
-#define SPI_SLVSEL_SSEL3            0x00000800    /* SPISSEL3 Value */
-#define SPI_SLVSEL_SSEL4            0x00001000    /* SPISSEL4 Value */
-#define SPI_SLVSEL_SSEL5            0x00002000    /* SPISSEL5 Value */
-#define SPI_SLVSEL_SSEL6            0x00004000    /* SPISSEL6 Value */
-#define SPI_SLVSEL_SSEL7            0x00008000    /* SPISSEL7 Value */
-/* SPI_RWC */
-#define SPI_RWC_VALUE               0x0000FFFF    /* Received Word-Count */
-/* SPI_RWCR */
-#define SPI_RWCR_VALUE              0x0000FFFF    /* Received Word-Count Reload */
-/* SPI_TWC */
-#define SPI_TWC_VALUE               0x0000FFFF    /* Transmitted Word-Count */
-/* SPI_TWCR */
-#define SPI_TWCR_VALUE              0x0000FFFF    /* Transmitted Word-Count Reload */
-/* SPI_IMASK */
-#define SPI_IMSK_RUWM               0x00000002    /* Receive Urgent Water-Mark Interrupt Mask */
-#define SPI_IMSK_TUWM               0x00000004    /* Transmit Urgent Water-Mark Interrupt Mask */
-#define SPI_IMSK_ROM                0x00000010    /* Receive Over-Run Error Interrupt Mask */
-#define SPI_IMSK_TUM                0x00000020    /* Transmit Under-Run Error Interrupt Mask */
-#define SPI_IMSK_TCM                0x00000040    /* Transmit Collision Error Interrupt Mask */
-#define SPI_IMSK_MFM                0x00000080    /* Mode Fault Error Interrupt Mask */
-#define SPI_IMSK_RSM                0x00000100    /* Receive Start Interrupt Mask */
-#define SPI_IMSK_TSM                0x00000200    /* Transmit Start Interrupt Mask */
-#define SPI_IMSK_RFM                0x00000400    /* Receive Finish Interrupt Mask */
-#define SPI_IMSK_TFM                0x00000800    /* Transmit Finish Interrupt Mask */
-/* SPI_IMASKCL */
-#define SPI_IMSK_CLR_RUW            0x00000002    /* Receive Urgent Water-Mark Interrupt Mask */
-#define SPI_IMSK_CLR_TUWM           0x00000004    /* Transmit Urgent Water-Mark Interrupt Mask */
-#define SPI_IMSK_CLR_ROM            0x00000010    /* Receive Over-Run Error Interrupt Mask */
-#define SPI_IMSK_CLR_TUM            0x00000020    /* Transmit Under-Run Error Interrupt Mask */
-#define SPI_IMSK_CLR_TCM            0x00000040    /* Transmit Collision Error Interrupt Mask */
-#define SPI_IMSK_CLR_MFM            0x00000080    /* Mode Fault Error Interrupt Mask */
-#define SPI_IMSK_CLR_RSM            0x00000100    /* Receive Start Interrupt Mask */
-#define SPI_IMSK_CLR_TSM            0x00000200    /* Transmit Start Interrupt Mask */
-#define SPI_IMSK_CLR_RFM            0x00000400    /* Receive Finish Interrupt Mask */
-#define SPI_IMSK_CLR_TFM            0x00000800    /* Transmit Finish Interrupt Mask */
-/* SPI_IMASKST */
-#define SPI_IMSK_SET_RUWM           0x00000002    /* Receive Urgent Water-Mark Interrupt Mask */
-#define SPI_IMSK_SET_TUWM           0x00000004    /* Transmit Urgent Water-Mark Interrupt Mask */
-#define SPI_IMSK_SET_ROM            0x00000010    /* Receive Over-Run Error Interrupt Mask */
-#define SPI_IMSK_SET_TUM            0x00000020    /* Transmit Under-Run Error Interrupt Mask */
-#define SPI_IMSK_SET_TCM            0x00000040    /* Transmit Collision Error Interrupt Mask */
-#define SPI_IMSK_SET_MFM            0x00000080    /* Mode Fault Error Interrupt Mask */
-#define SPI_IMSK_SET_RSM            0x00000100    /* Receive Start Interrupt Mask */
-#define SPI_IMSK_SET_TSM            0x00000200    /* Transmit Start Interrupt Mask */
-#define SPI_IMSK_SET_RFM            0x00000400    /* Receive Finish Interrupt Mask */
-#define SPI_IMSK_SET_TFM            0x00000800    /* Transmit Finish Interrupt Mask */
-/* SPI_STATUS */
-#define SPI_STAT_SPIF               0x00000001    /* SPI Finished */
-#define SPI_STAT_RUWM               0x00000002    /* Receive Urgent Water-Mark Breached */
-#define SPI_STAT_TUWM               0x00000004    /* Transmit Urgent Water-Mark Breached */
-#define SPI_STAT_ROE                0x00000010    /* Receive Over-Run Error Indication */
-#define SPI_STAT_TUE                0x00000020    /* Transmit Under-Run Error Indication */
-#define SPI_STAT_TCE                0x00000040    /* Transmit Collision Error Indication */
-#define SPI_STAT_MODF               0x00000080    /* Mode Fault Error Indication */
-#define SPI_STAT_RS                 0x00000100    /* Receive Start Indication */
-#define SPI_STAT_TS                 0x00000200    /* Transmit Start Indication */
-#define SPI_STAT_RF                 0x00000400    /* Receive Finish Indication */
-#define SPI_STAT_TF                 0x00000800    /* Transmit Finish Indication */
-#define SPI_STAT_RFS                0x00007000    /* SPI_RFIFO status */
-#define SPI_STAT_RFIFO_EMPTY        0x00000000    /* RFS: RFIFO Empty */
-#define SPI_STAT_RFIFO_25           0x00001000    /* RFS: RFIFO 25% Full */
-#define SPI_STAT_RFIFO_50           0x00002000    /* RFS: RFIFO 50% Full */
-#define SPI_STAT_RFIFO_75           0x00003000    /* RFS: RFIFO 75% Full */
-#define SPI_STAT_RFIFO_FULL         0x00004000    /* RFS: RFIFO Full */
-#define SPI_STAT_TFS                0x00070000    /* SPI_TFIFO status */
-#define SPI_STAT_TFIFO_FULL         0x00000000    /* TFS: TFIFO full */
-#define SPI_STAT_TFIFO_25           0x00010000    /* TFS: TFIFO 25% empty */
-#define SPI_STAT_TFIFO_50           0x00020000    /* TFS: TFIFO 50% empty */
-#define SPI_STAT_TFIFO_75           0x00030000    /* TFS: TFIFO 75% empty */
-#define SPI_STAT_TFIFO_EMPTY        0x00040000    /* TFS: TFIFO empty */
-#define SPI_STAT_FCS                0x00100000    /* Flow-Control Stall Indication */
-#define SPI_STAT_RFE                0x00400000    /* SPI_RFIFO Empty */
-#define SPI_STAT_TFF                0x00800000    /* SPI_TFIFO Full */
-/* SPI_ILAT */
-#define SPI_ILAT_RUWMI              0x00000002    /* Receive Urgent Water Mark Interrupt */
-#define SPI_ILAT_TUWMI              0x00000004    /* Transmit Urgent Water Mark Interrupt */
-#define SPI_ILAT_ROI                0x00000010    /* Receive Over-Run Error Indication */
-#define SPI_ILAT_TUI                0x00000020    /* Transmit Under-Run Error Indication */
-#define SPI_ILAT_TCI                0x00000040    /* Transmit Collision Error Indication */
-#define SPI_ILAT_MFI                0x00000080    /* Mode Fault Error Indication */
-#define SPI_ILAT_RSI                0x00000100    /* Receive Start Indication */
-#define SPI_ILAT_TSI                0x00000200    /* Transmit Start Indication */
-#define SPI_ILAT_RFI                0x00000400    /* Receive Finish Indication */
-#define SPI_ILAT_TFI                0x00000800    /* Transmit Finish Indication */
-/* SPI_ILATCL */
-#define SPI_ILAT_CLR_RUWMI          0x00000002    /* Receive Urgent Water Mark Interrupt */
-#define SPI_ILAT_CLR_TUWMI          0x00000004    /* Transmit Urgent Water Mark Interrupt */
-#define SPI_ILAT_CLR_ROI            0x00000010    /* Receive Over-Run Error Indication */
-#define SPI_ILAT_CLR_TUI            0x00000020    /* Transmit Under-Run Error Indication */
-#define SPI_ILAT_CLR_TCI            0x00000040    /* Transmit Collision Error Indication */
-#define SPI_ILAT_CLR_MFI            0x00000080    /* Mode Fault Error Indication */
-#define SPI_ILAT_CLR_RSI            0x00000100    /* Receive Start Indication */
-#define SPI_ILAT_CLR_TSI            0x00000200    /* Transmit Start Indication */
-#define SPI_ILAT_CLR_RFI            0x00000400    /* Receive Finish Indication */
-#define SPI_ILAT_CLR_TFI            0x00000800    /* Transmit Finish Indication */
-
-/*
- * adi spi3 registers layout
- */
-struct adi_spi_regs {
-	u32 revid;
-	u32 control;
-	u32 rx_control;
-	u32 tx_control;
-	u32 clock;
-	u32 delay;
-	u32 ssel;
-	u32 rwc;
-	u32 rwcr;
-	u32 twc;
-	u32 twcr;
-	u32 reserved0;
-	u32 emask;
-	u32 emaskcl;
-	u32 emaskst;
-	u32 reserved1;
-	u32 status;
-	u32 elat;
-	u32 elatcl;
-	u32 reserved2;
-	u32 rfifo;
-	u32 reserved3;
-	u32 tfifo;
-};
-
-#define MAX_CTRL_CS          8  /* cs in spi controller */
-
-/* device.platform_data for SSP controller devices */
-struct adi_spi3_master {
-	u16 num_chipselect;
-	u16 pin_req[7];
-};
-
-/* spi_board_info.controller_data for SPI slave devices,
- * copied to spi_device.platform_data ... mostly for dma tuning
- */
-struct adi_spi3_chip {
-	u32 control;
-	u16 cs_chg_udelay; /* Some devices require 16-bit delays */
-	u32 tx_dummy_val; /* tx value for rx only transfer */
-	bool enable_dma;
-};
-
-#endif /* _ADI_SPI3_H_ */
-- 
cgit v1.2.3


From cf85ec88701610f0de2e959ac447d9613462fbc7 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Sat, 9 Jun 2018 16:18:16 +0300
Subject: removed extra extern file_fdatawait_range

Jeff added this extern twice in commit a823e4589e68

Fixes: a823e4589e68 ("mm: add file_fdatawait_range and file_write_and_wait")
Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5c91108846db..f140c11d35dd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2623,8 +2623,6 @@ static inline int filemap_fdatawait(struct address_space *mapping)
 
 extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
 				  loff_t lend);
-extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
-						loff_t lend);
 extern int filemap_write_and_wait(struct address_space *mapping);
 extern int filemap_write_and_wait_range(struct address_space *mapping,
 				        loff_t lstart, loff_t lend);
-- 
cgit v1.2.3


From 420c0117db25db38b72b6230223f7a976d3070ea Mon Sep 17 00:00:00 2001
From: Robert Jarzmik <robert.jarzmik@free.fr>
Date: Sun, 17 Jun 2018 19:02:04 +0200
Subject: dmaengine: pxa: use a dma slave map

In order to remove the specific knowledge of the dma mapping from PXA
drivers, add a default slave map for pxa architectures.

This won't impact MMP architecture, but is aimed only at all PXA boards.

This is the first step, and once all drivers are converted,
pxad_filter_fn() will be made static, and the DMA resources removed from
device.c.

Signed-off-by: Robert Jarzmik <robert.jarzmik@free.fr>
Reported-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/pxa_dma.c                 | 10 +++++++++-
 include/linux/platform_data/mmp_dma.h |  4 ++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/dma/pxa_dma.c b/drivers/dma/pxa_dma.c
index b53fb618bbf6..9505334f9c6e 100644
--- a/drivers/dma/pxa_dma.c
+++ b/drivers/dma/pxa_dma.c
@@ -179,6 +179,8 @@ static unsigned int pxad_drcmr(unsigned int line)
 	return 0x1000 + line * 4;
 }
 
+bool pxad_filter_fn(struct dma_chan *chan, void *param);
+
 /*
  * Debug fs
  */
@@ -1396,9 +1398,10 @@ static int pxad_probe(struct platform_device *op)
 {
 	struct pxad_device *pdev;
 	const struct of_device_id *of_id;
+	const struct dma_slave_map *slave_map = NULL;
 	struct mmp_dma_platdata *pdata = dev_get_platdata(&op->dev);
 	struct resource *iores;
-	int ret, dma_channels = 0, nb_requestors = 0;
+	int ret, dma_channels = 0, nb_requestors = 0, slave_map_cnt = 0;
 	const enum dma_slave_buswidth widths =
 		DMA_SLAVE_BUSWIDTH_1_BYTE   | DMA_SLAVE_BUSWIDTH_2_BYTES |
 		DMA_SLAVE_BUSWIDTH_4_BYTES;
@@ -1429,6 +1432,8 @@ static int pxad_probe(struct platform_device *op)
 	} else if (pdata && pdata->dma_channels) {
 		dma_channels = pdata->dma_channels;
 		nb_requestors = pdata->nb_requestors;
+		slave_map = pdata->slave_map;
+		slave_map_cnt = pdata->slave_map_cnt;
 	} else {
 		dma_channels = 32;	/* default 32 channel */
 	}
@@ -1440,6 +1445,9 @@ static int pxad_probe(struct platform_device *op)
 	pdev->slave.device_prep_dma_memcpy = pxad_prep_memcpy;
 	pdev->slave.device_prep_slave_sg = pxad_prep_slave_sg;
 	pdev->slave.device_prep_dma_cyclic = pxad_prep_dma_cyclic;
+	pdev->slave.filter.map = slave_map;
+	pdev->slave.filter.mapcnt = slave_map_cnt;
+	pdev->slave.filter.fn = pxad_filter_fn;
 
 	pdev->slave.copy_align = PDMA_ALIGNMENT;
 	pdev->slave.src_addr_widths = widths;
diff --git a/include/linux/platform_data/mmp_dma.h b/include/linux/platform_data/mmp_dma.h
index d1397c8ed94e..6397b9c8149a 100644
--- a/include/linux/platform_data/mmp_dma.h
+++ b/include/linux/platform_data/mmp_dma.h
@@ -12,9 +12,13 @@
 #ifndef MMP_DMA_H
 #define MMP_DMA_H
 
+struct dma_slave_map;
+
 struct mmp_dma_platdata {
 	int dma_channels;
 	int nb_requestors;
+	int slave_map_cnt;
+	const struct dma_slave_map *slave_map;
 };
 
 #endif /* MMP_DMA_H */
-- 
cgit v1.2.3


From b6d1a17f4729e4fda5740a855da91d202db2c118 Mon Sep 17 00:00:00 2001
From: Robert Jarzmik <robert.jarzmik@free.fr>
Date: Sun, 17 Jun 2018 19:02:14 +0200
Subject: dmaengine: pxa: document pxad_param

Add some documentation for the pxad_param structure, and describe the
contract behind the minimal required priority of a DMA channel.

Signed-off-by: Robert Jarzmik <robert.jarzmik@free.fr>
Acked-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dma/pxa-dma.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dma/pxa-dma.h b/include/linux/dma/pxa-dma.h
index e56ec7af4fd7..9fc594f69eff 100644
--- a/include/linux/dma/pxa-dma.h
+++ b/include/linux/dma/pxa-dma.h
@@ -9,6 +9,15 @@ enum pxad_chan_prio {
 	PXAD_PRIO_LOWEST,
 };
 
+/**
+ * struct pxad_param - dma channel request parameters
+ * @drcmr: requestor line number
+ * @prio: minimal mandatory priority of the channel
+ *
+ * If a requested channel is granted, its priority will be at least @prio,
+ * ie. if PXAD_PRIO_LOW is required, the requested channel will be either
+ * PXAD_PRIO_LOW, PXAD_PRIO_NORMAL or PXAD_PRIO_HIGHEST.
+ */
 struct pxad_param {
 	unsigned int drcmr;
 	enum pxad_chan_prio prio;
-- 
cgit v1.2.3


From cd31b80736852d34bc1072f3e579a6fd73a244e7 Mon Sep 17 00:00:00 2001
From: Robert Jarzmik <robert.jarzmik@free.fr>
Date: Sun, 17 Jun 2018 19:02:17 +0200
Subject: ARM: pxa: change SSP DMA channels allocation

Now the dma_slave_map is available for PXA architecture, switch the SSP
device to it.

This specifically means that :
- for platform data based machines, the DMA requestor channels are
  extracted from the slave map, where pxa-ssp-dai.<N> is a 1-1 match to
  ssp.<N>, and the channels are either "rx" or "tx".

- for device tree platforms, the dma node should be hooked into the
  pxa2xx-ac97 or pxa-ssp-dai node.

Signed-off-by: Robert Jarzmik <robert.jarzmik@free.fr>
Acked-by: Daniel Mack <daniel@zonque.org>
---
 arch/arm/plat-pxa/ssp.c    | 47 ----------------------------------------------
 include/linux/pxa2xx_ssp.h |  2 --
 sound/soc/pxa/pxa-ssp.c    |  5 ++---
 3 files changed, 2 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/plat-pxa/ssp.c b/arch/arm/plat-pxa/ssp.c
index ba13f793fbce..ed36dcab80f1 100644
--- a/arch/arm/plat-pxa/ssp.c
+++ b/arch/arm/plat-pxa/ssp.c
@@ -127,53 +127,6 @@ static int pxa_ssp_probe(struct platform_device *pdev)
 	if (IS_ERR(ssp->clk))
 		return PTR_ERR(ssp->clk);
 
-	if (dev->of_node) {
-		struct of_phandle_args dma_spec;
-		struct device_node *np = dev->of_node;
-		int ret;
-
-		/*
-		 * FIXME: we should allocate the DMA channel from this
-		 * context and pass the channel down to the ssp users.
-		 * For now, we lookup the rx and tx indices manually
-		 */
-
-		/* rx */
-		ret = of_parse_phandle_with_args(np, "dmas", "#dma-cells",
-						 0, &dma_spec);
-
-		if (ret) {
-			dev_err(dev, "Can't parse dmas property\n");
-			return -ENODEV;
-		}
-		ssp->drcmr_rx = dma_spec.args[0];
-		of_node_put(dma_spec.np);
-
-		/* tx */
-		ret = of_parse_phandle_with_args(np, "dmas", "#dma-cells",
-						 1, &dma_spec);
-		if (ret) {
-			dev_err(dev, "Can't parse dmas property\n");
-			return -ENODEV;
-		}
-		ssp->drcmr_tx = dma_spec.args[0];
-		of_node_put(dma_spec.np);
-	} else {
-		res = platform_get_resource(pdev, IORESOURCE_DMA, 0);
-		if (res == NULL) {
-			dev_err(dev, "no SSP RX DRCMR defined\n");
-			return -ENODEV;
-		}
-		ssp->drcmr_rx = res->start;
-
-		res = platform_get_resource(pdev, IORESOURCE_DMA, 1);
-		if (res == NULL) {
-			dev_err(dev, "no SSP TX DRCMR defined\n");
-			return -ENODEV;
-		}
-		ssp->drcmr_tx = res->start;
-	}
-
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (res == NULL) {
 		dev_err(dev, "no memory resource defined\n");
diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h
index 8461b18e4608..03a7ca46735b 100644
--- a/include/linux/pxa2xx_ssp.h
+++ b/include/linux/pxa2xx_ssp.h
@@ -212,8 +212,6 @@ struct ssp_device {
 	int		type;
 	int		use_count;
 	int		irq;
-	int		drcmr_rx;
-	int		drcmr_tx;
 
 	struct device_node	*of_node;
 };
diff --git a/sound/soc/pxa/pxa-ssp.c b/sound/soc/pxa/pxa-ssp.c
index 6fc986080130..0b441338bdd4 100644
--- a/sound/soc/pxa/pxa-ssp.c
+++ b/sound/soc/pxa/pxa-ssp.c
@@ -105,9 +105,8 @@ static int pxa_ssp_startup(struct snd_pcm_substream *substream,
 	dma = kzalloc(sizeof(struct snd_dmaengine_dai_dma_data), GFP_KERNEL);
 	if (!dma)
 		return -ENOMEM;
-
-	dma->filter_data = substream->stream == SNDRV_PCM_STREAM_PLAYBACK ?
-				&ssp->drcmr_tx : &ssp->drcmr_rx;
+	dma->chan_name = substream->stream == SNDRV_PCM_STREAM_PLAYBACK ?
+		"tx" : "rx";
 
 	snd_soc_dai_set_dma_data(cpu_dai, substream, dma);
 
-- 
cgit v1.2.3


From b8042b3da925f390c1482bf9dc0898dc0b3ea7b5 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 18 Jun 2018 22:39:29 +0200
Subject: ieee80211: bump IEEE80211_MAX_AMPDU_BUF to support HE

Bump the IEEE80211_MAX_AMPDU_BUF size to 0x100 for HE support
and - for now - use IEEE80211_MAX_AMPDU_BUF_HT everywhere.

This is derived from my internal patch, parts of which Luca
had sent upstream.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/realtek/rtlwifi/base.c |  2 +-
 drivers/staging/rtl8188eu/include/wifi.h    |  1 -
 drivers/staging/rtl8712/wifi.h              |  1 -
 drivers/staging/rtl8723bs/include/wifi.h    |  1 -
 drivers/staging/rtlwifi/base.c              |  2 +-
 include/linux/ieee80211.h                   | 10 ++++++----
 net/mac80211/agg-rx.c                       |  4 ++--
 net/mac80211/agg-tx.c                       |  2 +-
 net/mac80211/ht.c                           |  2 +-
 net/mac80211/main.c                         |  4 ++--
 10 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c
index 39c817eddd78..31bd6f714052 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.c
+++ b/drivers/net/wireless/realtek/rtlwifi/base.c
@@ -1904,7 +1904,7 @@ void rtl_rx_ampdu_apply(struct rtl_priv *rtlpriv)
 		 reject_agg, ctrl_agg_size, agg_size);
 
 	rtlpriv->hw->max_rx_aggregation_subframes =
-		(ctrl_agg_size ? agg_size : IEEE80211_MAX_AMPDU_BUF);
+		(ctrl_agg_size ? agg_size : IEEE80211_MAX_AMPDU_BUF_HT);
 }
 EXPORT_SYMBOL(rtl_rx_ampdu_apply);
 
diff --git a/drivers/staging/rtl8188eu/include/wifi.h b/drivers/staging/rtl8188eu/include/wifi.h
index 084a246eec19..6790b7c8cfb1 100644
--- a/drivers/staging/rtl8188eu/include/wifi.h
+++ b/drivers/staging/rtl8188eu/include/wifi.h
@@ -575,7 +575,6 @@ enum ht_cap_ampdu_factor {
  * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2)
  */
 #define IEEE80211_MIN_AMPDU_BUF 0x8
-#define IEEE80211_MAX_AMPDU_BUF 0x40
 
 
 #define OP_MODE_PURE                    0
diff --git a/drivers/staging/rtl8712/wifi.h b/drivers/staging/rtl8712/wifi.h
index 0ed2f44ab4e9..00a4302e9983 100644
--- a/drivers/staging/rtl8712/wifi.h
+++ b/drivers/staging/rtl8712/wifi.h
@@ -574,7 +574,6 @@ struct ieee80211_ht_addt_info {
  * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2)
  */
 #define IEEE80211_MIN_AMPDU_BUF 0x8
-#define IEEE80211_MAX_AMPDU_BUF 0x40
 
 
 /* Spatial Multiplexing Power Save Modes */
diff --git a/drivers/staging/rtl8723bs/include/wifi.h b/drivers/staging/rtl8723bs/include/wifi.h
index 08bc79840b23..559bf2606fb7 100644
--- a/drivers/staging/rtl8723bs/include/wifi.h
+++ b/drivers/staging/rtl8723bs/include/wifi.h
@@ -799,7 +799,6 @@ enum HT_CAP_AMPDU_FACTOR {
  * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2)
  */
 #define IEEE80211_MIN_AMPDU_BUF 0x8
-#define IEEE80211_MAX_AMPDU_BUF 0x40
 
 
 /* Spatial Multiplexing Power Save Modes */
diff --git a/drivers/staging/rtlwifi/base.c b/drivers/staging/rtlwifi/base.c
index e46e47d93d7d..094827c1879a 100644
--- a/drivers/staging/rtlwifi/base.c
+++ b/drivers/staging/rtlwifi/base.c
@@ -1838,7 +1838,7 @@ void rtl_rx_ampdu_apply(struct rtl_priv *rtlpriv)
 		 reject_agg, ctrl_agg_size, agg_size);
 
 	rtlpriv->hw->max_rx_aggregation_subframes =
-		(ctrl_agg_size ? agg_size : IEEE80211_MAX_AMPDU_BUF);
+		(ctrl_agg_size ? agg_size : IEEE80211_MAX_AMPDU_BUF_HT);
 }
 
 /*********************************************************
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index e6a6503bfa33..9c03a7d5e400 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1433,11 +1433,13 @@ struct ieee80211_ht_operation {
 #define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800
 
 /*
- * A-PMDU buffer sizes
- * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2)
+ * A-MPDU buffer sizes
+ * According to HT size varies from 8 to 64 frames
+ * HE adds the ability to have up to 256 frames.
  */
-#define IEEE80211_MIN_AMPDU_BUF 0x8
-#define IEEE80211_MAX_AMPDU_BUF 0x40
+#define IEEE80211_MIN_AMPDU_BUF		0x8
+#define IEEE80211_MAX_AMPDU_BUF_HT	0x40
+#define IEEE80211_MAX_AMPDU_BUF		0x100
 
 
 /* Spatial Multiplexing Power Save Modes (for capability) */
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index e83c19d4c292..3ffd853b483f 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -274,7 +274,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
 	/* XXX: check own ht delayed BA capability?? */
 	if (((ba_policy != 1) &&
 	     (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) ||
-	    (buf_size > IEEE80211_MAX_AMPDU_BUF)) {
+	    (buf_size > IEEE80211_MAX_AMPDU_BUF_HT)) {
 		status = WLAN_STATUS_INVALID_QOS_PARAM;
 		ht_dbg_ratelimited(sta->sdata,
 				   "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n",
@@ -283,7 +283,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
 	}
 	/* determine default buffer size */
 	if (buf_size == 0)
-		buf_size = IEEE80211_MAX_AMPDU_BUF;
+		buf_size = IEEE80211_MAX_AMPDU_BUF_HT;
 
 	/* make sure the size doesn't exceed the maximum supported by the hw */
 	if (buf_size > sta->sta.max_rx_aggregation_subframes)
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index ac4295296514..86c6bc0432ba 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -514,7 +514,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 	/* send AddBA request */
 	ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
 				     tid_tx->dialog_token, params.ssn,
-				     IEEE80211_MAX_AMPDU_BUF,
+				     IEEE80211_MAX_AMPDU_BUF_HT,
 				     tid_tx->timeout);
 }
 
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 26a7ba3b698f..f849ea814993 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -352,7 +352,7 @@ void ieee80211_ba_session_work(struct work_struct *work)
 		    test_and_clear_bit(tid,
 				       sta->ampdu_mlme.tid_rx_manage_offl))
 			___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
-							 IEEE80211_MAX_AMPDU_BUF,
+							 IEEE80211_MAX_AMPDU_BUF_HT,
 							 false, true);
 
 		if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS,
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index a6f8e3a646d4..070f77862014 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -597,8 +597,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	local->hw.queues = 1;
 	local->hw.max_rates = 1;
 	local->hw.max_report_rates = 0;
-	local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF;
-	local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF;
+	local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT;
+	local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT;
 	local->hw.offchannel_tx_hw_queue = IEEE80211_INVAL_HW_QUEUE;
 	local->hw.conf.long_frame_max_tx_count = wiphy->retry_long;
 	local->hw.conf.short_frame_max_tx_count = wiphy->retry_short;
-- 
cgit v1.2.3


From f7859590d97614815b35a755c8213dfb8f2766bd Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Tue, 5 Jun 2018 19:20:39 -0400
Subject: audit: eliminate audit_enabled magic number comparison

Remove comparison of audit_enabled to magic numbers outside of audit.

Related: https://github.com/linux-audit/audit-kernel/issues/86

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 drivers/tty/tty_audit.c      | 2 +-
 include/linux/audit.h        | 5 ++++-
 include/net/xfrm.h           | 2 +-
 kernel/audit.c               | 3 ---
 net/netfilter/xt_AUDIT.c     | 2 +-
 net/netlabel/netlabel_user.c | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_audit.c b/drivers/tty/tty_audit.c
index e30aa6bf9ff9..50f567b6a66e 100644
--- a/drivers/tty/tty_audit.c
+++ b/drivers/tty/tty_audit.c
@@ -92,7 +92,7 @@ static void tty_audit_buf_push(struct tty_audit_buf *buf)
 {
 	if (buf->valid == 0)
 		return;
-	if (audit_enabled == 0) {
+	if (audit_enabled == AUDIT_OFF) {
 		buf->valid = 0;
 		return;
 	}
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 69c78477590b..9334fbef7bae 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -117,6 +117,9 @@ struct filename;
 
 extern void audit_log_session_info(struct audit_buffer *ab);
 
+#define AUDIT_OFF	0
+#define AUDIT_ON	1
+#define AUDIT_LOCKED	2
 #ifdef CONFIG_AUDIT
 /* These are defined in audit.c */
 				/* Public API */
@@ -202,7 +205,7 @@ static inline int audit_log_task_context(struct audit_buffer *ab)
 static inline void audit_log_task_info(struct audit_buffer *ab,
 				       struct task_struct *tsk)
 { }
-#define audit_enabled 0
+#define audit_enabled AUDIT_OFF
 #endif /* CONFIG_AUDIT */
 
 #ifdef CONFIG_AUDIT_COMPAT_GENERIC
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 557122846e0e..f7f297727ed8 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -735,7 +735,7 @@ static inline struct audit_buffer *xfrm_audit_start(const char *op)
 {
 	struct audit_buffer *audit_buf = NULL;
 
-	if (audit_enabled == 0)
+	if (audit_enabled == AUDIT_OFF)
 		return NULL;
 	audit_buf = audit_log_start(audit_context(), GFP_ATOMIC,
 				    AUDIT_MAC_IPSEC_EVENT);
diff --git a/kernel/audit.c b/kernel/audit.c
index 5c0a1d7b0c7b..0f3222e4edde 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -83,9 +83,6 @@
 #define AUDIT_INITIALIZED	1
 static int	audit_initialized;
 
-#define AUDIT_OFF	0
-#define AUDIT_ON	1
-#define AUDIT_LOCKED	2
 u32		audit_enabled = AUDIT_OFF;
 bool		audit_ever_enabled = !!AUDIT_OFF;
 
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index f368ee6741db..af883f1b64f9 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -72,7 +72,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	struct audit_buffer *ab;
 	int fam = -1;
 
-	if (audit_enabled == 0)
+	if (audit_enabled == AUDIT_OFF)
 		goto errout;
 	ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
 	if (ab == NULL)
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index 2f328af91a52..4676f5bb16ae 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -101,7 +101,7 @@ struct audit_buffer *netlbl_audit_start_common(int type,
 	char *secctx;
 	u32 secctx_len;
 
-	if (audit_enabled == 0)
+	if (audit_enabled == AUDIT_OFF)
 		return NULL;
 
 	audit_buf = audit_log_start(audit_context(), GFP_ATOMIC, type);
-- 
cgit v1.2.3


From 9fba738a53dda20e748d6ee240b6c017c8146b4b Mon Sep 17 00:00:00 2001
From: Jerome Brunet <jbrunet@baylibre.com>
Date: Tue, 19 Jun 2018 16:41:41 +0200
Subject: clk: add duty cycle support

Add the possibility to apply and query the clock signal duty cycle ratio.

This is useful when the duty cycle of the clock signal depends on some
other parameters controlled by the clock framework.

For example, the duty cycle of a divider may depends on the raw divider
setting (ratio = N / div) , which is controlled by the CCF. In such case,
going through the pwm framework to control the duty cycle ratio of this
clock would be a burden.

A clock provider is not required to implement the operation to set and get
the duty cycle. If it does not implement .get_duty_cycle(), the ratio is
assumed to be 50%.

This change also adds a new flag, CLK_DUTY_CYCLE_PARENT. This flag should
be used to indicate that a clock, such as gates and muxes, may inherit
the duty cycle ratio of its parent clock. If a clock does not provide a
get_duty_cycle() callback and has CLK_DUTY_CYCLE_PARENT, then the call
will be directly forwarded to its parent clock, if any. For
set_duty_cycle(), the clock should also have CLK_SET_RATE_PARENT for the
call to be forwarded

Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Signed-off-by: Michael Turquette <mturquette@baylibre.com>
Link: lkml.kernel.org/r/20180619144141.8506-1-jbrunet@baylibre.com
---
 drivers/clk/clk.c            | 199 +++++++++++++++++++++++++++++++++++++++++--
 include/linux/clk-provider.h |  26 ++++++
 include/linux/clk.h          |  33 +++++++
 include/trace/events/clk.h   |  36 ++++++++
 4 files changed, 289 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 9760b526ca31..b0a2719d86f3 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -68,6 +68,7 @@ struct clk_core {
 	unsigned long		max_rate;
 	unsigned long		accuracy;
 	int			phase;
+	struct clk_duty		duty;
 	struct hlist_head	children;
 	struct hlist_node	child_node;
 	struct hlist_head	clks;
@@ -2402,6 +2403,172 @@ int clk_get_phase(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_get_phase);
 
+static void clk_core_reset_duty_cycle_nolock(struct clk_core *core)
+{
+	/* Assume a default value of 50% */
+	core->duty.num = 1;
+	core->duty.den = 2;
+}
+
+static int clk_core_update_duty_cycle_parent_nolock(struct clk_core *core);
+
+static int clk_core_update_duty_cycle_nolock(struct clk_core *core)
+{
+	struct clk_duty *duty = &core->duty;
+	int ret = 0;
+
+	if (!core->ops->get_duty_cycle)
+		return clk_core_update_duty_cycle_parent_nolock(core);
+
+	ret = core->ops->get_duty_cycle(core->hw, duty);
+	if (ret)
+		goto reset;
+
+	/* Don't trust the clock provider too much */
+	if (duty->den == 0 || duty->num > duty->den) {
+		ret = -EINVAL;
+		goto reset;
+	}
+
+	return 0;
+
+reset:
+	clk_core_reset_duty_cycle_nolock(core);
+	return ret;
+}
+
+static int clk_core_update_duty_cycle_parent_nolock(struct clk_core *core)
+{
+	int ret = 0;
+
+	if (core->parent &&
+	    core->flags & CLK_DUTY_CYCLE_PARENT) {
+		ret = clk_core_update_duty_cycle_nolock(core->parent);
+		memcpy(&core->duty, &core->parent->duty, sizeof(core->duty));
+	} else {
+		clk_core_reset_duty_cycle_nolock(core);
+	}
+
+	return ret;
+}
+
+static int clk_core_set_duty_cycle_parent_nolock(struct clk_core *core,
+						 struct clk_duty *duty);
+
+static int clk_core_set_duty_cycle_nolock(struct clk_core *core,
+					  struct clk_duty *duty)
+{
+	int ret;
+
+	lockdep_assert_held(&prepare_lock);
+
+	if (clk_core_rate_is_protected(core))
+		return -EBUSY;
+
+	trace_clk_set_duty_cycle(core, duty);
+
+	if (!core->ops->set_duty_cycle)
+		return clk_core_set_duty_cycle_parent_nolock(core, duty);
+
+	ret = core->ops->set_duty_cycle(core->hw, duty);
+	if (!ret)
+		memcpy(&core->duty, duty, sizeof(*duty));
+
+	trace_clk_set_duty_cycle_complete(core, duty);
+
+	return ret;
+}
+
+static int clk_core_set_duty_cycle_parent_nolock(struct clk_core *core,
+						 struct clk_duty *duty)
+{
+	int ret = 0;
+
+	if (core->parent &&
+	    core->flags & (CLK_DUTY_CYCLE_PARENT | CLK_SET_RATE_PARENT)) {
+		ret = clk_core_set_duty_cycle_nolock(core->parent, duty);
+		memcpy(&core->duty, &core->parent->duty, sizeof(core->duty));
+	}
+
+	return ret;
+}
+
+/**
+ * clk_set_duty_cycle - adjust the duty cycle ratio of a clock signal
+ * @clk: clock signal source
+ * @num: numerator of the duty cycle ratio to be applied
+ * @den: denominator of the duty cycle ratio to be applied
+ *
+ * Apply the duty cycle ratio if the ratio is valid and the clock can
+ * perform this operation
+ *
+ * Returns (0) on success, a negative errno otherwise.
+ */
+int clk_set_duty_cycle(struct clk *clk, unsigned int num, unsigned int den)
+{
+	int ret;
+	struct clk_duty duty;
+
+	if (!clk)
+		return 0;
+
+	/* sanity check the ratio */
+	if (den == 0 || num > den)
+		return -EINVAL;
+
+	duty.num = num;
+	duty.den = den;
+
+	clk_prepare_lock();
+
+	if (clk->exclusive_count)
+		clk_core_rate_unprotect(clk->core);
+
+	ret = clk_core_set_duty_cycle_nolock(clk->core, &duty);
+
+	if (clk->exclusive_count)
+		clk_core_rate_protect(clk->core);
+
+	clk_prepare_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(clk_set_duty_cycle);
+
+static int clk_core_get_scaled_duty_cycle(struct clk_core *core,
+					  unsigned int scale)
+{
+	struct clk_duty *duty = &core->duty;
+	int ret;
+
+	clk_prepare_lock();
+
+	ret = clk_core_update_duty_cycle_nolock(core);
+	if (!ret)
+		ret = mult_frac(scale, duty->num, duty->den);
+
+	clk_prepare_unlock();
+
+	return ret;
+}
+
+/**
+ * clk_get_scaled_duty_cycle - return the duty cycle ratio of a clock signal
+ * @clk: clock signal source
+ * @scale: scaling factor to be applied to represent the ratio as an integer
+ *
+ * Returns the duty cycle ratio of a clock node multiplied by the provided
+ * scaling factor, or negative errno on error.
+ */
+int clk_get_scaled_duty_cycle(struct clk *clk, unsigned int scale)
+{
+	if (!clk)
+		return 0;
+
+	return clk_core_get_scaled_duty_cycle(clk->core, scale);
+}
+EXPORT_SYMBOL_GPL(clk_get_scaled_duty_cycle);
+
 /**
  * clk_is_match - check if two clk's point to the same hardware clock
  * @p: clk compared against q
@@ -2455,12 +2622,13 @@ static void clk_summary_show_one(struct seq_file *s, struct clk_core *c,
 	if (!c)
 		return;
 
-	seq_printf(s, "%*s%-*s %7d %8d %8d %11lu %10lu %-3d\n",
+	seq_printf(s, "%*s%-*s %7d %8d %8d %11lu %10lu %5d %6d\n",
 		   level * 3 + 1, "",
 		   30 - level * 3, c->name,
 		   c->enable_count, c->prepare_count, c->protect_count,
 		   clk_core_get_rate(c), clk_core_get_accuracy(c),
-		   clk_core_get_phase(c));
+		   clk_core_get_phase(c),
+		   clk_core_get_scaled_duty_cycle(c, 100000));
 }
 
 static void clk_summary_show_subtree(struct seq_file *s, struct clk_core *c,
@@ -2482,9 +2650,9 @@ static int clk_summary_show(struct seq_file *s, void *data)
 	struct clk_core *c;
 	struct hlist_head **lists = (struct hlist_head **)s->private;
 
-	seq_puts(s, "                                 enable  prepare  protect                               \n");
-	seq_puts(s, "   clock                          count    count    count        rate   accuracy   phase\n");
-	seq_puts(s, "----------------------------------------------------------------------------------------\n");
+	seq_puts(s, "                                 enable  prepare  protect                                duty\n");
+	seq_puts(s, "   clock                          count    count    count        rate   accuracy phase  cycle\n");
+	seq_puts(s, "---------------------------------------------------------------------------------------------\n");
 
 	clk_prepare_lock();
 
@@ -2511,6 +2679,8 @@ static void clk_dump_one(struct seq_file *s, struct clk_core *c, int level)
 	seq_printf(s, "\"rate\": %lu,", clk_core_get_rate(c));
 	seq_printf(s, "\"accuracy\": %lu,", clk_core_get_accuracy(c));
 	seq_printf(s, "\"phase\": %d", clk_core_get_phase(c));
+	seq_printf(s, "\"duty_cycle\": %u",
+		   clk_core_get_scaled_duty_cycle(c, 100000));
 }
 
 static void clk_dump_subtree(struct seq_file *s, struct clk_core *c, int level)
@@ -2572,6 +2742,7 @@ static const struct {
 	ENTRY(CLK_SET_RATE_UNGATE),
 	ENTRY(CLK_IS_CRITICAL),
 	ENTRY(CLK_OPS_PARENT_ENABLE),
+	ENTRY(CLK_DUTY_CYCLE_PARENT),
 #undef ENTRY
 };
 
@@ -2610,6 +2781,17 @@ static int possible_parents_show(struct seq_file *s, void *data)
 }
 DEFINE_SHOW_ATTRIBUTE(possible_parents);
 
+static int clk_duty_cycle_show(struct seq_file *s, void *data)
+{
+	struct clk_core *core = s->private;
+	struct clk_duty *duty = &core->duty;
+
+	seq_printf(s, "%u/%u\n", duty->num, duty->den);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(clk_duty_cycle);
+
 static void clk_debug_create_one(struct clk_core *core, struct dentry *pdentry)
 {
 	struct dentry *root;
@@ -2628,6 +2810,8 @@ static void clk_debug_create_one(struct clk_core *core, struct dentry *pdentry)
 	debugfs_create_u32("clk_enable_count", 0444, root, &core->enable_count);
 	debugfs_create_u32("clk_protect_count", 0444, root, &core->protect_count);
 	debugfs_create_u32("clk_notifier_count", 0444, root, &core->notifier_count);
+	debugfs_create_file("clk_duty_cycle", 0444, root, core,
+			    &clk_duty_cycle_fops);
 
 	if (core->num_parents > 1)
 		debugfs_create_file("clk_possible_parents", 0444, root, core,
@@ -2845,6 +3029,11 @@ static int __clk_core_init(struct clk_core *core)
 	else
 		core->phase = 0;
 
+	/*
+	 * Set clk's duty cycle.
+	 */
+	clk_core_update_duty_cycle_nolock(core);
+
 	/*
 	 * Set clk's rate.  The preferred method is to use .recalc_rate.  For
 	 * simple clocks and lazy developers the default fallback is to use the
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index b7cfa037e593..08b1aa70a38d 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -38,6 +38,8 @@
 #define CLK_IS_CRITICAL		BIT(11) /* do not gate, ever */
 /* parents need enable during gate/ungate, set rate and re-parent */
 #define CLK_OPS_PARENT_ENABLE	BIT(12)
+/* duty cycle call may be forwarded to the parent clock */
+#define CLK_DUTY_CYCLE_PARENT	BIT(13)
 
 struct clk;
 struct clk_hw;
@@ -66,6 +68,17 @@ struct clk_rate_request {
 	struct clk_hw *best_parent_hw;
 };
 
+/**
+ * struct clk_duty - Struture encoding the duty cycle ratio of a clock
+ *
+ * @num:	Numerator of the duty cycle ratio
+ * @den:	Denominator of the duty cycle ratio
+ */
+struct clk_duty {
+	unsigned int num;
+	unsigned int den;
+};
+
 /**
  * struct clk_ops -  Callback operations for hardware clocks; these are to
  * be provided by the clock implementation, and will be called by drivers
@@ -169,6 +182,15 @@ struct clk_rate_request {
  *		by the second argument. Valid values for degrees are
  *		0-359. Return 0 on success, otherwise -EERROR.
  *
+ * @get_duty_cycle: Queries the hardware to get the current duty cycle ratio
+ *              of a clock. Returned values denominator cannot be 0 and must be
+ *              superior or equal to the numerator.
+ *
+ * @set_duty_cycle: Apply the duty cycle ratio to this clock signal specified by
+ *              the numerator (2nd argurment) and denominator (3rd  argument).
+ *              Argument must be a valid ratio (denominator > 0
+ *              and >= numerator) Return 0 on success, otherwise -EERROR.
+ *
  * @init:	Perform platform-specific initialization magic.
  *		This is not not used by any of the basic clock types.
  *		Please consider other ways of solving initialization problems
@@ -218,6 +240,10 @@ struct clk_ops {
 					   unsigned long parent_accuracy);
 	int		(*get_phase)(struct clk_hw *hw);
 	int		(*set_phase)(struct clk_hw *hw, int degrees);
+	int		(*get_duty_cycle)(struct clk_hw *hw,
+					  struct clk_duty *duty);
+	int		(*set_duty_cycle)(struct clk_hw *hw,
+					  struct clk_duty *duty);
 	void		(*init)(struct clk_hw *hw);
 	void		(*debug_init)(struct clk_hw *hw, struct dentry *dentry);
 };
diff --git a/include/linux/clk.h b/include/linux/clk.h
index 0dbd0885b2c2..4f750c481b82 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -141,6 +141,27 @@ int clk_set_phase(struct clk *clk, int degrees);
  */
 int clk_get_phase(struct clk *clk);
 
+/**
+ * clk_set_duty_cycle - adjust the duty cycle ratio of a clock signal
+ * @clk: clock signal source
+ * @num: numerator of the duty cycle ratio to be applied
+ * @den: denominator of the duty cycle ratio to be applied
+ *
+ * Adjust the duty cycle of a clock signal by the specified ratio. Returns 0 on
+ * success, -EERROR otherwise.
+ */
+int clk_set_duty_cycle(struct clk *clk, unsigned int num, unsigned int den);
+
+/**
+ * clk_get_duty_cycle - return the duty cycle ratio of a clock signal
+ * @clk: clock signal source
+ * @scale: scaling factor to be applied to represent the ratio as an integer
+ *
+ * Returns the duty cycle ratio multiplied by the scale provided, otherwise
+ * returns -EERROR.
+ */
+int clk_get_scaled_duty_cycle(struct clk *clk, unsigned int scale);
+
 /**
  * clk_is_match - check if two clk's point to the same hardware clock
  * @p: clk compared against q
@@ -183,6 +204,18 @@ static inline long clk_get_phase(struct clk *clk)
 	return -ENOTSUPP;
 }
 
+static inline int clk_set_duty_cycle(struct clk *clk, unsigned int num,
+				     unsigned int den)
+{
+	return -ENOTSUPP;
+}
+
+static inline unsigned int clk_get_scaled_duty_cycle(struct clk *clk,
+						     unsigned int scale)
+{
+	return 0;
+}
+
 static inline bool clk_is_match(const struct clk *p, const struct clk *q)
 {
 	return p == q;
diff --git a/include/trace/events/clk.h b/include/trace/events/clk.h
index 2cd449328aee..9004ffff7f32 100644
--- a/include/trace/events/clk.h
+++ b/include/trace/events/clk.h
@@ -192,6 +192,42 @@ DEFINE_EVENT(clk_phase, clk_set_phase_complete,
 	TP_ARGS(core, phase)
 );
 
+DECLARE_EVENT_CLASS(clk_duty_cycle,
+
+	TP_PROTO(struct clk_core *core, struct clk_duty *duty),
+
+	TP_ARGS(core, duty),
+
+	TP_STRUCT__entry(
+		__string(        name,           core->name              )
+		__field( unsigned int,           num                     )
+		__field( unsigned int,           den                     )
+	),
+
+	TP_fast_assign(
+		__assign_str(name, core->name);
+		__entry->num = duty->num;
+		__entry->den = duty->den;
+	),
+
+	TP_printk("%s %u/%u", __get_str(name), (unsigned int)__entry->num,
+		  (unsigned int)__entry->den)
+);
+
+DEFINE_EVENT(clk_duty_cycle, clk_set_duty_cycle,
+
+	TP_PROTO(struct clk_core *core, struct clk_duty *duty),
+
+	TP_ARGS(core, duty)
+);
+
+DEFINE_EVENT(clk_duty_cycle, clk_set_duty_cycle_complete,
+
+	TP_PROTO(struct clk_core *core, struct clk_duty *duty),
+
+	TP_ARGS(core, duty)
+);
+
 #endif /* _TRACE_CLK_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 19e0c58f6552638c86395f0717210326fdf14fd2 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 19 Jun 2018 15:10:56 -0700
Subject: iomap: generic inline data handling

Add generic inline data handling by adding a pointer to the inline data
region to struct iomap.  When handling a buffered IOMAP_INLINE write,
iomap_write_begin will copy the current inline data from the inline data
region into the page cache, and iomap_write_end will copy the changes in
the page cache back to the inline data region.

This doesn't cover inline data reads and direct I/O yet because so far,
we have no users.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
[hch: small cleanups to better fit in with other iomap work]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c            | 62 +++++++++++++++++++++++++++++++++++++++++++++------
 include/linux/iomap.h |  1 +
 2 files changed, 56 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/iomap.c b/fs/iomap.c
index 9c454459a1e9..4aecd7c5dbd8 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -103,6 +103,26 @@ iomap_sector(struct iomap *iomap, loff_t pos)
 	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
 }
 
+static void
+iomap_read_inline_data(struct inode *inode, struct page *page,
+		struct iomap *iomap)
+{
+	size_t size = i_size_read(inode);
+	void *addr;
+
+	if (PageUptodate(page))
+		return;
+
+	BUG_ON(page->index);
+	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
+
+	addr = kmap_atomic(page);
+	memcpy(addr, iomap->inline_data, size);
+	memset(addr + size, 0, PAGE_SIZE - size);
+	kunmap_atomic(addr);
+	SetPageUptodate(page);
+}
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -133,7 +153,11 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 	if (!page)
 		return -ENOMEM;
 
-	status = __block_write_begin_int(page, pos, len, NULL, iomap);
+	if (iomap->type == IOMAP_INLINE)
+		iomap_read_inline_data(inode, page, iomap);
+	else
+		status = __block_write_begin_int(page, pos, len, NULL, iomap);
+
 	if (unlikely(status)) {
 		unlock_page(page);
 		put_page(page);
@@ -146,14 +170,37 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 	return status;
 }
 
+static int
+iomap_write_end_inline(struct inode *inode, struct page *page,
+		struct iomap *iomap, loff_t pos, unsigned copied)
+{
+	void *addr;
+
+	WARN_ON_ONCE(!PageUptodate(page));
+	BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
+
+	addr = kmap_atomic(page);
+	memcpy(iomap->inline_data + pos, addr + pos, copied);
+	kunmap_atomic(addr);
+
+	mark_inode_dirty(inode);
+	__generic_write_end(inode, pos, copied, page);
+	return copied;
+}
+
 static int
 iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
-		unsigned copied, struct page *page)
+		unsigned copied, struct page *page, struct iomap *iomap)
 {
 	int ret;
 
-	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
-			copied, page, NULL);
+	if (iomap->type == IOMAP_INLINE) {
+		ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
+	} else {
+		ret = generic_write_end(NULL, inode->i_mapping, pos, len,
+				copied, page, NULL);
+	}
+
 	if (ret < len)
 		iomap_write_failed(inode, pos, len);
 	return ret;
@@ -208,7 +255,8 @@ again:
 
 		flush_dcache_page(page);
 
-		status = iomap_write_end(inode, pos, bytes, copied, page);
+		status = iomap_write_end(inode, pos, bytes, copied, page,
+				iomap);
 		if (unlikely(status < 0))
 			break;
 		copied = status;
@@ -302,7 +350,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 		WARN_ON_ONCE(!PageUptodate(page));
 
-		status = iomap_write_end(inode, pos, bytes, bytes, page);
+		status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
 		if (unlikely(status <= 0)) {
 			if (WARN_ON_ONCE(status == 0))
 				return -EIO;
@@ -354,7 +402,7 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
 	zero_user(page, offset, bytes);
 	mark_page_accessed(page);
 
-	return iomap_write_end(inode, pos, bytes, bytes, page);
+	return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
 }
 
 static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index a044a824da85..10d6cff7f69a 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -55,6 +55,7 @@ struct iomap {
 	u16			flags;	/* flags for mapping */
 	struct block_device	*bdev;	/* block device for I/O */
 	struct dax_device	*dax_dev; /* dax_dev for dax operations */
+	void			*inline_data;
 };
 
 /*
-- 
cgit v1.2.3


From 63899c6f8851c32214b19390254fa1ae90b582df Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Jun 2018 15:10:56 -0700
Subject: iomap: add a page_done callback

This will be used by gfs2 to attach data to transactions for the journaled
data mode.  But the concept is generic enough that we might be able to
use it for other purposes like encryption/integrity post-processing in the
future.

Based on a patch from Andreas Gruenbacher.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c            | 3 +++
 include/linux/iomap.h | 9 +++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/fs/iomap.c b/fs/iomap.c
index 4aecd7c5dbd8..a1f71e64ea49 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -201,6 +201,9 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 				copied, page, NULL);
 	}
 
+	if (iomap->page_done)
+		iomap->page_done(inode, pos, copied, page, iomap);
+
 	if (ret < len)
 		iomap_write_failed(inode, pos, len);
 	return ret;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 10d6cff7f69a..45f43865b0f0 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -9,6 +9,7 @@ struct fiemap_extent_info;
 struct inode;
 struct iov_iter;
 struct kiocb;
+struct page;
 struct vm_area_struct;
 struct vm_fault;
 
@@ -56,6 +57,14 @@ struct iomap {
 	struct block_device	*bdev;	/* block device for I/O */
 	struct dax_device	*dax_dev; /* dax_dev for dax operations */
 	void			*inline_data;
+
+	/*
+	 * Called when finished processing a page in the mapping returned in
+	 * this iomap.  At least for now this is only supported in the buffered
+	 * write path.
+	 */
+	void (*page_done)(struct inode *inode, loff_t pos, unsigned copied,
+			struct page *page, struct iomap *iomap);
 };
 
 /*
-- 
cgit v1.2.3


From e184fde6f3f5353040dd4b5637f823fc87436e55 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 19 Jun 2018 15:10:57 -0700
Subject: iomap: add private pointer to struct iomap

Add a private pointer to struct iomap to allow filesystems to pass data
from iomap_begin to iomap_end.  Will be used by gfs2 for passing on the
on-disk inode buffer head.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 include/linux/iomap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 45f43865b0f0..1f36523d448a 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -57,6 +57,7 @@ struct iomap {
 	struct block_device	*bdev;	/* block device for I/O */
 	struct dax_device	*dax_dev; /* dax_dev for dax operations */
 	void			*inline_data;
+	void			*private; /* filesystem private */
 
 	/*
 	 * Called when finished processing a page in the mapping returned in
-- 
cgit v1.2.3


From 72b4daa241295440f98e80ae21294a67b27ca091 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Jun 2018 15:10:57 -0700
Subject: iomap: add an iomap-based readpage and readpages implementation

Simply use iomap_apply to iterate over the file and a submit a bio for
each non-uptodate but mapped region and zero everything else.  Note that
as-is this can not be used for file systems with a blocksize smaller than
the page size, but that support will be added later.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c            | 214 +++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/iomap.h |   3 +
 2 files changed, 216 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/iomap.c b/fs/iomap.c
index a1f71e64ea49..4f10c6b1cf6d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016 Christoph Hellwig.
+ * Copyright (c) 2016-2018 Christoph Hellwig.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -18,6 +18,7 @@
 #include <linux/uaccess.h>
 #include <linux/gfp.h>
 #include <linux/mm.h>
+#include <linux/mm_inline.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
@@ -123,6 +124,217 @@ iomap_read_inline_data(struct inode *inode, struct page *page,
 	SetPageUptodate(page);
 }
 
+static void
+iomap_read_end_io(struct bio *bio)
+{
+	int error = blk_status_to_errno(bio->bi_status);
+	struct bio_vec *bvec;
+	int i;
+
+	bio_for_each_segment_all(bvec, bio, i)
+		page_endio(bvec->bv_page, false, error);
+	bio_put(bio);
+}
+
+struct iomap_readpage_ctx {
+	struct page		*cur_page;
+	bool			cur_page_in_bio;
+	bool			is_readahead;
+	struct bio		*bio;
+	struct list_head	*pages;
+};
+
+static loff_t
+iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iomap_readpage_ctx *ctx = data;
+	struct page *page = ctx->cur_page;
+	unsigned poff = pos & (PAGE_SIZE - 1);
+	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+	bool is_contig = false;
+	sector_t sector;
+
+	/* we don't support blocksize < PAGE_SIZE quite yet. */
+	WARN_ON_ONCE(pos != page_offset(page));
+	WARN_ON_ONCE(plen != PAGE_SIZE);
+
+	if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
+		zero_user(page, poff, plen);
+		SetPageUptodate(page);
+		goto done;
+	}
+
+	ctx->cur_page_in_bio = true;
+
+	/*
+	 * Try to merge into a previous segment if we can.
+	 */
+	sector = iomap_sector(iomap, pos);
+	if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
+		if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+			goto done;
+		is_contig = true;
+	}
+
+	if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
+		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+		int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+		if (ctx->bio)
+			submit_bio(ctx->bio);
+
+		if (ctx->is_readahead) /* same as readahead_gfp_mask */
+			gfp |= __GFP_NORETRY | __GFP_NOWARN;
+		ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
+		ctx->bio->bi_opf = REQ_OP_READ;
+		if (ctx->is_readahead)
+			ctx->bio->bi_opf |= REQ_RAHEAD;
+		ctx->bio->bi_iter.bi_sector = sector;
+		bio_set_dev(ctx->bio, iomap->bdev);
+		ctx->bio->bi_end_io = iomap_read_end_io;
+	}
+
+	__bio_add_page(ctx->bio, page, plen, poff);
+done:
+	return plen;
+}
+
+int
+iomap_readpage(struct page *page, const struct iomap_ops *ops)
+{
+	struct iomap_readpage_ctx ctx = { .cur_page = page };
+	struct inode *inode = page->mapping->host;
+	unsigned poff;
+	loff_t ret;
+
+	WARN_ON_ONCE(page_has_buffers(page));
+
+	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
+		ret = iomap_apply(inode, page_offset(page) + poff,
+				PAGE_SIZE - poff, 0, ops, &ctx,
+				iomap_readpage_actor);
+		if (ret <= 0) {
+			WARN_ON_ONCE(ret == 0);
+			SetPageError(page);
+			break;
+		}
+	}
+
+	if (ctx.bio) {
+		submit_bio(ctx.bio);
+		WARN_ON_ONCE(!ctx.cur_page_in_bio);
+	} else {
+		WARN_ON_ONCE(ctx.cur_page_in_bio);
+		unlock_page(page);
+	}
+
+	/*
+	 * Just like mpage_readpages and block_read_full_page we always
+	 * return 0 and just mark the page as PageError on errors.  This
+	 * should be cleaned up all through the stack eventually.
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_readpage);
+
+static struct page *
+iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
+		loff_t length, loff_t *done)
+{
+	while (!list_empty(pages)) {
+		struct page *page = lru_to_page(pages);
+
+		if (page_offset(page) >= (u64)pos + length)
+			break;
+
+		list_del(&page->lru);
+		if (!add_to_page_cache_lru(page, inode->i_mapping, page->index,
+				GFP_NOFS))
+			return page;
+
+		/*
+		 * If we already have a page in the page cache at index we are
+		 * done.  Upper layers don't care if it is uptodate after the
+		 * readpages call itself as every page gets checked again once
+		 * actually needed.
+		 */
+		*done += PAGE_SIZE;
+		put_page(page);
+	}
+
+	return NULL;
+}
+
+static loff_t
+iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	struct iomap_readpage_ctx *ctx = data;
+	loff_t done, ret;
+
+	for (done = 0; done < length; done += ret) {
+		if (ctx->cur_page && ((pos + done) & (PAGE_SIZE - 1)) == 0) {
+			if (!ctx->cur_page_in_bio)
+				unlock_page(ctx->cur_page);
+			put_page(ctx->cur_page);
+			ctx->cur_page = NULL;
+		}
+		if (!ctx->cur_page) {
+			ctx->cur_page = iomap_next_page(inode, ctx->pages,
+					pos, length, &done);
+			if (!ctx->cur_page)
+				break;
+			ctx->cur_page_in_bio = false;
+		}
+		ret = iomap_readpage_actor(inode, pos + done, length - done,
+				ctx, iomap);
+	}
+
+	return done;
+}
+
+int
+iomap_readpages(struct address_space *mapping, struct list_head *pages,
+		unsigned nr_pages, const struct iomap_ops *ops)
+{
+	struct iomap_readpage_ctx ctx = {
+		.pages		= pages,
+		.is_readahead	= true,
+	};
+	loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
+	loff_t last = page_offset(list_entry(pages->next, struct page, lru));
+	loff_t length = last - pos + PAGE_SIZE, ret = 0;
+
+	while (length > 0) {
+		ret = iomap_apply(mapping->host, pos, length, 0, ops,
+				&ctx, iomap_readpages_actor);
+		if (ret <= 0) {
+			WARN_ON_ONCE(ret == 0);
+			goto done;
+		}
+		pos += ret;
+		length -= ret;
+	}
+	ret = 0;
+done:
+	if (ctx.bio)
+		submit_bio(ctx.bio);
+	if (ctx.cur_page) {
+		if (!ctx.cur_page_in_bio)
+			unlock_page(ctx.cur_page);
+		put_page(ctx.cur_page);
+	}
+
+	/*
+	 * Check that we didn't lose a page due to the arcance calling
+	 * conventions..
+	 */
+	WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_readpages);
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 1f36523d448a..30d314407f66 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -99,6 +99,9 @@ struct iomap_ops {
 
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops);
+int iomap_readpage(struct page *page, const struct iomap_ops *ops);
+int iomap_readpages(struct address_space *mapping, struct list_head *pages,
+		unsigned nr_pages, const struct iomap_ops *ops);
 int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
-- 
cgit v1.2.3


From 693ba15c9202fe0283404abe4066e1b986e284eb Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Tue, 12 Jun 2018 12:05:45 -0700
Subject: scsi: Remove percpu_ida

With its one user gone, remove the library code.

Signed-off-by: Matthew Wilcox <willy@infradead.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/percpu_ida.h |  83 ----------
 lib/Makefile               |   2 +-
 lib/percpu_ida.c           | 370 ---------------------------------------------
 3 files changed, 1 insertion(+), 454 deletions(-)
 delete mode 100644 include/linux/percpu_ida.h
 delete mode 100644 lib/percpu_ida.c

(limited to 'include/linux')

diff --git a/include/linux/percpu_ida.h b/include/linux/percpu_ida.h
deleted file mode 100644
index 07d78e4653bc..000000000000
--- a/include/linux/percpu_ida.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __PERCPU_IDA_H__
-#define __PERCPU_IDA_H__
-
-#include <linux/types.h>
-#include <linux/bitops.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/spinlock_types.h>
-#include <linux/wait.h>
-#include <linux/cpumask.h>
-
-struct percpu_ida_cpu;
-
-struct percpu_ida {
-	/*
-	 * number of tags available to be allocated, as passed to
-	 * percpu_ida_init()
-	 */
-	unsigned			nr_tags;
-	unsigned			percpu_max_size;
-	unsigned			percpu_batch_size;
-
-	struct percpu_ida_cpu __percpu	*tag_cpu;
-
-	/*
-	 * Bitmap of cpus that (may) have tags on their percpu freelists:
-	 * steal_tags() uses this to decide when to steal tags, and which cpus
-	 * to try stealing from.
-	 *
-	 * It's ok for a freelist to be empty when its bit is set - steal_tags()
-	 * will just keep looking - but the bitmap _must_ be set whenever a
-	 * percpu freelist does have tags.
-	 */
-	cpumask_t			cpus_have_tags;
-
-	struct {
-		spinlock_t		lock;
-		/*
-		 * When we go to steal tags from another cpu (see steal_tags()),
-		 * we want to pick a cpu at random. Cycling through them every
-		 * time we steal is a bit easier and more or less equivalent:
-		 */
-		unsigned		cpu_last_stolen;
-
-		/* For sleeping on allocation failure */
-		wait_queue_head_t	wait;
-
-		/*
-		 * Global freelist - it's a stack where nr_free points to the
-		 * top
-		 */
-		unsigned		nr_free;
-		unsigned		*freelist;
-	} ____cacheline_aligned_in_smp;
-};
-
-/*
- * Number of tags we move between the percpu freelist and the global freelist at
- * a time
- */
-#define IDA_DEFAULT_PCPU_BATCH_MOVE	32U
-/* Max size of percpu freelist, */
-#define IDA_DEFAULT_PCPU_SIZE	((IDA_DEFAULT_PCPU_BATCH_MOVE * 3) / 2)
-
-int percpu_ida_alloc(struct percpu_ida *pool, int state);
-void percpu_ida_free(struct percpu_ida *pool, unsigned tag);
-
-void percpu_ida_destroy(struct percpu_ida *pool);
-int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
-	unsigned long max_size, unsigned long batch_size);
-static inline int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
-{
-	return __percpu_ida_init(pool, nr_tags, IDA_DEFAULT_PCPU_SIZE,
-		IDA_DEFAULT_PCPU_BATCH_MOVE);
-}
-
-typedef int (*percpu_ida_cb)(unsigned, void *);
-int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
-	void *data);
-
-unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu);
-#endif /* __PERCPU_IDA_H__ */
diff --git a/lib/Makefile b/lib/Makefile
index 956b320292fe..055420101965 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -40,7 +40,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
 	 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
-	 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
+	 percpu-refcount.o rhashtable.o reciprocal_div.o \
 	 once.o refcount.o usercopy.o errseq.o bucket_locks.o
 obj-$(CONFIG_STRING_SELFTEST) += test_string.o
 obj-y += string_helpers.o
diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
deleted file mode 100644
index 9bbd9c5d375a..000000000000
--- a/lib/percpu_ida.c
+++ /dev/null
@@ -1,370 +0,0 @@
-/*
- * Percpu IDA library
- *
- * Copyright (C) 2013 Datera, Inc. Kent Overstreet
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-#include <linux/mm.h>
-#include <linux/bitmap.h>
-#include <linux/bitops.h>
-#include <linux/bug.h>
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/sched/signal.h>
-#include <linux/string.h>
-#include <linux/spinlock.h>
-#include <linux/percpu_ida.h>
-
-struct percpu_ida_cpu {
-	/*
-	 * Even though this is percpu, we need a lock for tag stealing by remote
-	 * CPUs:
-	 */
-	spinlock_t			lock;
-
-	/* nr_free/freelist form a stack of free IDs */
-	unsigned			nr_free;
-	unsigned			freelist[];
-};
-
-static inline void move_tags(unsigned *dst, unsigned *dst_nr,
-			     unsigned *src, unsigned *src_nr,
-			     unsigned nr)
-{
-	*src_nr -= nr;
-	memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr);
-	*dst_nr += nr;
-}
-
-/*
- * Try to steal tags from a remote cpu's percpu freelist.
- *
- * We first check how many percpu freelists have tags
- *
- * Then we iterate through the cpus until we find some tags - we don't attempt
- * to find the "best" cpu to steal from, to keep cacheline bouncing to a
- * minimum.
- */
-static inline void steal_tags(struct percpu_ida *pool,
-			      struct percpu_ida_cpu *tags)
-{
-	unsigned cpus_have_tags, cpu = pool->cpu_last_stolen;
-	struct percpu_ida_cpu *remote;
-
-	for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);
-	     cpus_have_tags; cpus_have_tags--) {
-		cpu = cpumask_next(cpu, &pool->cpus_have_tags);
-
-		if (cpu >= nr_cpu_ids) {
-			cpu = cpumask_first(&pool->cpus_have_tags);
-			if (cpu >= nr_cpu_ids)
-				BUG();
-		}
-
-		pool->cpu_last_stolen = cpu;
-		remote = per_cpu_ptr(pool->tag_cpu, cpu);
-
-		cpumask_clear_cpu(cpu, &pool->cpus_have_tags);
-
-		if (remote == tags)
-			continue;
-
-		spin_lock(&remote->lock);
-
-		if (remote->nr_free) {
-			memcpy(tags->freelist,
-			       remote->freelist,
-			       sizeof(unsigned) * remote->nr_free);
-
-			tags->nr_free = remote->nr_free;
-			remote->nr_free = 0;
-		}
-
-		spin_unlock(&remote->lock);
-
-		if (tags->nr_free)
-			break;
-	}
-}
-
-/*
- * Pop up to IDA_PCPU_BATCH_MOVE IDs off the global freelist, and push them onto
- * our percpu freelist:
- */
-static inline void alloc_global_tags(struct percpu_ida *pool,
-				     struct percpu_ida_cpu *tags)
-{
-	move_tags(tags->freelist, &tags->nr_free,
-		  pool->freelist, &pool->nr_free,
-		  min(pool->nr_free, pool->percpu_batch_size));
-}
-
-/**
- * percpu_ida_alloc - allocate a tag
- * @pool: pool to allocate from
- * @state: task state for prepare_to_wait
- *
- * Returns a tag - an integer in the range [0..nr_tags) (passed to
- * tag_pool_init()), or otherwise -ENOSPC on allocation failure.
- *
- * Safe to be called from interrupt context (assuming it isn't passed
- * TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, of course).
- *
- * @gfp indicates whether or not to wait until a free id is available (it's not
- * used for internal memory allocations); thus if passed __GFP_RECLAIM we may sleep
- * however long it takes until another thread frees an id (same semantics as a
- * mempool).
- *
- * Will not fail if passed TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE.
- */
-int percpu_ida_alloc(struct percpu_ida *pool, int state)
-{
-	DEFINE_WAIT(wait);
-	struct percpu_ida_cpu *tags;
-	unsigned long flags;
-	int tag = -ENOSPC;
-
-	tags = raw_cpu_ptr(pool->tag_cpu);
-	spin_lock_irqsave(&tags->lock, flags);
-
-	/* Fastpath */
-	if (likely(tags->nr_free >= 0)) {
-		tag = tags->freelist[--tags->nr_free];
-		spin_unlock_irqrestore(&tags->lock, flags);
-		return tag;
-	}
-	spin_unlock_irqrestore(&tags->lock, flags);
-
-	while (1) {
-		spin_lock_irqsave(&pool->lock, flags);
-		tags = this_cpu_ptr(pool->tag_cpu);
-
-		/*
-		 * prepare_to_wait() must come before steal_tags(), in case
-		 * percpu_ida_free() on another cpu flips a bit in
-		 * cpus_have_tags
-		 *
-		 * global lock held and irqs disabled, don't need percpu lock
-		 */
-		if (state != TASK_RUNNING)
-			prepare_to_wait(&pool->wait, &wait, state);
-
-		if (!tags->nr_free)
-			alloc_global_tags(pool, tags);
-		if (!tags->nr_free)
-			steal_tags(pool, tags);
-
-		if (tags->nr_free) {
-			tag = tags->freelist[--tags->nr_free];
-			if (tags->nr_free)
-				cpumask_set_cpu(smp_processor_id(),
-						&pool->cpus_have_tags);
-		}
-
-		spin_unlock_irqrestore(&pool->lock, flags);
-
-		if (tag >= 0 || state == TASK_RUNNING)
-			break;
-
-		if (signal_pending_state(state, current)) {
-			tag = -ERESTARTSYS;
-			break;
-		}
-
-		schedule();
-	}
-	if (state != TASK_RUNNING)
-		finish_wait(&pool->wait, &wait);
-
-	return tag;
-}
-EXPORT_SYMBOL_GPL(percpu_ida_alloc);
-
-/**
- * percpu_ida_free - free a tag
- * @pool: pool @tag was allocated from
- * @tag: a tag previously allocated with percpu_ida_alloc()
- *
- * Safe to be called from interrupt context.
- */
-void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
-{
-	struct percpu_ida_cpu *tags;
-	unsigned long flags;
-	unsigned nr_free;
-
-	BUG_ON(tag >= pool->nr_tags);
-
-	tags = raw_cpu_ptr(pool->tag_cpu);
-
-	spin_lock_irqsave(&tags->lock, flags);
-	tags->freelist[tags->nr_free++] = tag;
-
-	nr_free = tags->nr_free;
-
-	if (nr_free == 1) {
-		cpumask_set_cpu(smp_processor_id(),
-				&pool->cpus_have_tags);
-		wake_up(&pool->wait);
-	}
-	spin_unlock_irqrestore(&tags->lock, flags);
-
-	if (nr_free == pool->percpu_max_size) {
-		spin_lock_irqsave(&pool->lock, flags);
-		spin_lock(&tags->lock);
-
-		if (tags->nr_free == pool->percpu_max_size) {
-			move_tags(pool->freelist, &pool->nr_free,
-				  tags->freelist, &tags->nr_free,
-				  pool->percpu_batch_size);
-
-			wake_up(&pool->wait);
-		}
-		spin_unlock(&tags->lock);
-		spin_unlock_irqrestore(&pool->lock, flags);
-	}
-}
-EXPORT_SYMBOL_GPL(percpu_ida_free);
-
-/**
- * percpu_ida_destroy - release a tag pool's resources
- * @pool: pool to free
- *
- * Frees the resources allocated by percpu_ida_init().
- */
-void percpu_ida_destroy(struct percpu_ida *pool)
-{
-	free_percpu(pool->tag_cpu);
-	free_pages((unsigned long) pool->freelist,
-		   get_order(pool->nr_tags * sizeof(unsigned)));
-}
-EXPORT_SYMBOL_GPL(percpu_ida_destroy);
-
-/**
- * percpu_ida_init - initialize a percpu tag pool
- * @pool: pool to initialize
- * @nr_tags: number of tags that will be available for allocation
- *
- * Initializes @pool so that it can be used to allocate tags - integers in the
- * range [0, nr_tags). Typically, they'll be used by driver code to refer to a
- * preallocated array of tag structures.
- *
- * Allocation is percpu, but sharding is limited by nr_tags - for best
- * performance, the workload should not span more cpus than nr_tags / 128.
- */
-int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
-	unsigned long max_size, unsigned long batch_size)
-{
-	unsigned i, cpu, order;
-
-	memset(pool, 0, sizeof(*pool));
-
-	init_waitqueue_head(&pool->wait);
-	spin_lock_init(&pool->lock);
-	pool->nr_tags = nr_tags;
-	pool->percpu_max_size = max_size;
-	pool->percpu_batch_size = batch_size;
-
-	/* Guard against overflow */
-	if (nr_tags > (unsigned) INT_MAX + 1) {
-		pr_err("percpu_ida_init(): nr_tags too large\n");
-		return -EINVAL;
-	}
-
-	order = get_order(nr_tags * sizeof(unsigned));
-	pool->freelist = (void *) __get_free_pages(GFP_KERNEL, order);
-	if (!pool->freelist)
-		return -ENOMEM;
-
-	for (i = 0; i < nr_tags; i++)
-		pool->freelist[i] = i;
-
-	pool->nr_free = nr_tags;
-
-	pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
-				       pool->percpu_max_size * sizeof(unsigned),
-				       sizeof(unsigned));
-	if (!pool->tag_cpu)
-		goto err;
-
-	for_each_possible_cpu(cpu)
-		spin_lock_init(&per_cpu_ptr(pool->tag_cpu, cpu)->lock);
-
-	return 0;
-err:
-	percpu_ida_destroy(pool);
-	return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(__percpu_ida_init);
-
-/**
- * percpu_ida_for_each_free - iterate free ids of a pool
- * @pool: pool to iterate
- * @fn: interate callback function
- * @data: parameter for @fn
- *
- * Note, this doesn't guarantee to iterate all free ids restrictly. Some free
- * ids might be missed, some might be iterated duplicated, and some might
- * be iterated and not free soon.
- */
-int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
-	void *data)
-{
-	unsigned long flags;
-	struct percpu_ida_cpu *remote;
-	unsigned cpu, i, err = 0;
-
-	for_each_possible_cpu(cpu) {
-		remote = per_cpu_ptr(pool->tag_cpu, cpu);
-		spin_lock_irqsave(&remote->lock, flags);
-		for (i = 0; i < remote->nr_free; i++) {
-			err = fn(remote->freelist[i], data);
-			if (err)
-				break;
-		}
-		spin_unlock_irqrestore(&remote->lock, flags);
-		if (err)
-			goto out;
-	}
-
-	spin_lock_irqsave(&pool->lock, flags);
-	for (i = 0; i < pool->nr_free; i++) {
-		err = fn(pool->freelist[i], data);
-		if (err)
-			break;
-	}
-	spin_unlock_irqrestore(&pool->lock, flags);
-out:
-	return err;
-}
-EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
-
-/**
- * percpu_ida_free_tags - return free tags number of a specific cpu or global pool
- * @pool: pool related
- * @cpu: specific cpu or global pool if @cpu == nr_cpu_ids
- *
- * Note: this just returns a snapshot of free tags number.
- */
-unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu)
-{
-	struct percpu_ida_cpu *remote;
-	if (cpu == nr_cpu_ids)
-		return pool->nr_free;
-	remote = per_cpu_ptr(pool->tag_cpu, cpu);
-	return remote->nr_free;
-}
-EXPORT_SYMBOL_GPL(percpu_ida_free_tags);
-- 
cgit v1.2.3


From 2fa4a32613c9182b00e46872755b0662374424a7 Mon Sep 17 00:00:00 2001
From: Jason Yan <yanaijie@huawei.com>
Date: Thu, 10 May 2018 11:05:16 +0800
Subject: scsi: libsas: dynamically allocate and free ata host

Commit 2623c7a5f2 ("libata: add refcounting to ata_host") v4.17+ introduced
refcounting to ata_host and will increase or decrease the refcount when
adding or deleting transport ATA port.

Now the ata host for libsas is embedded in domain_device, and the ->kref
member is not initialized. Afer we add ata transport class, ata_host_get()
will be called when adding transport ATA port and a warning will be
triggered as below:

refcount_t: increment on 0; use-after-free.
WARNING: CPU: 2 PID: 103 at
lib/refcount.c:153 refcount_inc+0x40/0x48 ......  Call trace:
 refcount_inc+0x40/0x48
 ata_host_get+0x10/0x18
 ata_tport_add+0x40/0x120
 ata_sas_tport_add+0xc/0x14
 sas_ata_init+0x7c/0xc8
 sas_discover_domain+0x380/0x53c
 process_one_work+0x12c/0x288
 worker_thread+0x58/0x3f0
 kthread+0xfc/0x128
 ret_from_fork+0x10/0x18

And also when removing transport ATA port ata_host_put() will be called and
another similar warning will be triggered. If the refcount decreased to
zero, the ata host will be freed. But this ata host is only part of
domain_device, it cannot be freed directly.

So we have to change this embedded static ata host to a dynamically
allocated ata host and initialize the ->kref member. To use ata_host_get()
and ata_host_put() in libsas, we need to move the declaration of these
functions to the public libata.h and export them.

Fixes: b6240a4df018 ("scsi: libsas: add transport class for ATA devices")
Signed-off-by: Jason Yan <yanaijie@huawei.com>
CC: John Garry <john.garry@huawei.com>
CC: Taras Kondratiuk <takondra@cisco.com>
CC: Tejun Heo <tj@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ata/libata-core.c          |  3 +++
 drivers/ata/libata.h               |  2 --
 drivers/scsi/libsas/sas_ata.c      | 40 +++++++++++++++++++++++++-------------
 drivers/scsi/libsas/sas_discover.c |  2 ++
 include/linux/libata.h             |  2 ++
 include/scsi/libsas.h              |  2 +-
 6 files changed, 34 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 27d15ed7fa3d..89cb4872a09c 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -6421,6 +6421,7 @@ void ata_host_init(struct ata_host *host, struct device *dev,
 	host->n_tags = ATA_MAX_QUEUE;
 	host->dev = dev;
 	host->ops = ops;
+	kref_init(&host->kref);
 }
 
 void __ata_port_probe(struct ata_port *ap)
@@ -7388,3 +7389,5 @@ EXPORT_SYMBOL_GPL(ata_cable_80wire);
 EXPORT_SYMBOL_GPL(ata_cable_unknown);
 EXPORT_SYMBOL_GPL(ata_cable_ignore);
 EXPORT_SYMBOL_GPL(ata_cable_sata);
+EXPORT_SYMBOL_GPL(ata_host_get);
+EXPORT_SYMBOL_GPL(ata_host_put);
\ No newline at end of file
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 9e21c49cf6be..f953cb4bb1ba 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -100,8 +100,6 @@ extern int ata_port_probe(struct ata_port *ap);
 extern void __ata_port_probe(struct ata_port *ap);
 extern unsigned int ata_read_log_page(struct ata_device *dev, u8 log,
 				      u8 page, void *buf, unsigned int sectors);
-extern void ata_host_get(struct ata_host *host);
-extern void ata_host_put(struct ata_host *host);
 
 #define to_ata_port(d) container_of(d, struct ata_port, tdev)
 
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 2ac7395112b4..64a958a99f6a 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -552,34 +552,46 @@ int sas_ata_init(struct domain_device *found_dev)
 {
 	struct sas_ha_struct *ha = found_dev->port->ha;
 	struct Scsi_Host *shost = ha->core.shost;
+	struct ata_host *ata_host;
 	struct ata_port *ap;
 	int rc;
 
-	ata_host_init(&found_dev->sata_dev.ata_host, ha->dev, &sas_sata_ops);
-	ap = ata_sas_port_alloc(&found_dev->sata_dev.ata_host,
-				&sata_port_info,
-				shost);
+	ata_host = kzalloc(sizeof(*ata_host), GFP_KERNEL);
+	if (!ata_host)	{
+		SAS_DPRINTK("ata host alloc failed.\n");
+		return -ENOMEM;
+	}
+
+	ata_host_init(ata_host, ha->dev, &sas_sata_ops);
+
+	ap = ata_sas_port_alloc(ata_host, &sata_port_info, shost);
 	if (!ap) {
 		SAS_DPRINTK("ata_sas_port_alloc failed.\n");
-		return -ENODEV;
+		rc = -ENODEV;
+		goto free_host;
 	}
 
 	ap->private_data = found_dev;
 	ap->cbl = ATA_CBL_SATA;
 	ap->scsi_host = shost;
 	rc = ata_sas_port_init(ap);
-	if (rc) {
-		ata_sas_port_destroy(ap);
-		return rc;
-	}
-	rc = ata_sas_tport_add(found_dev->sata_dev.ata_host.dev, ap);
-	if (rc) {
-		ata_sas_port_destroy(ap);
-		return rc;
-	}
+	if (rc)
+		goto destroy_port;
+
+	rc = ata_sas_tport_add(ata_host->dev, ap);
+	if (rc)
+		goto destroy_port;
+
+	found_dev->sata_dev.ata_host = ata_host;
 	found_dev->sata_dev.ap = ap;
 
 	return 0;
+
+destroy_port:
+	ata_sas_port_destroy(ap);
+free_host:
+	ata_host_put(ata_host);
+	return rc;
 }
 
 void sas_ata_task_abort(struct sas_task *task)
diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c
index 1ffca28fe6a8..0148ae62a52a 100644
--- a/drivers/scsi/libsas/sas_discover.c
+++ b/drivers/scsi/libsas/sas_discover.c
@@ -316,6 +316,8 @@ void sas_free_device(struct kref *kref)
 	if (dev_is_sata(dev) && dev->sata_dev.ap) {
 		ata_sas_tport_delete(dev->sata_dev.ap);
 		ata_sas_port_destroy(dev->sata_dev.ap);
+		ata_host_put(dev->sata_dev.ata_host);
+		dev->sata_dev.ata_host = NULL;
 		dev->sata_dev.ap = NULL;
 	}
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 8b8946dd63b9..33e9718397e2 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1110,6 +1110,8 @@ extern struct ata_host *ata_host_alloc(struct device *dev, int max_ports);
 extern struct ata_host *ata_host_alloc_pinfo(struct device *dev,
 			const struct ata_port_info * const * ppi, int n_ports);
 extern int ata_slave_link_init(struct ata_port *ap);
+extern void ata_host_get(struct ata_host *host);
+extern void ata_host_put(struct ata_host *host);
 extern int ata_host_start(struct ata_host *host);
 extern int ata_host_register(struct ata_host *host,
 			     struct scsi_host_template *sht);
diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index 225ab7783dfd..3de3b10da19a 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -161,7 +161,7 @@ struct sata_device {
 	u8     port_no;        /* port number, if this is a PM (Port) */
 
 	struct ata_port *ap;
-	struct ata_host ata_host;
+	struct ata_host *ata_host;
 	struct smp_resp rps_resp ____cacheline_aligned; /* report_phy_sata_resp */
 	u8     fis[ATA_RESP_FIS_SIZE];
 };
-- 
cgit v1.2.3


From 6519750210d9d3330e85d12e0128652edde448d2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 12 Jun 2018 10:34:50 +0200
Subject: sched/swait: Remove __prepare_to_swait

There is no public user of this API, remove it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: bigeasy@linutronix.de
Cc: oleg@redhat.com
Cc: paulmck@linux.vnet.ibm.com
Cc: pbonzini@redhat.com
Link: https://lkml.kernel.org/r/20180612083909.157076812@infradead.org
---
 include/linux/swait.h | 1 -
 kernel/sched/swait.c  | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swait.h b/include/linux/swait.h
index bf8cb0dee23c..d6a5e949e4ca 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -161,7 +161,6 @@ extern void swake_up(struct swait_queue_head *q);
 extern void swake_up_all(struct swait_queue_head *q);
 extern void swake_up_locked(struct swait_queue_head *q);
 
-extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
 extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
 extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
 
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index b6fb2c3b3ff7..e68bb1398b05 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -69,7 +69,7 @@ void swake_up_all(struct swait_queue_head *q)
 }
 EXPORT_SYMBOL(swake_up_all);
 
-void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
 {
 	wait->task = current;
 	if (list_empty(&wait->task_list))
-- 
cgit v1.2.3


From 0abf17bc7790dd0467ed0e38522242f23c5da1c4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 12 Jun 2018 10:34:51 +0200
Subject: sched/swait: Switch to full exclusive mode

Linus noted that swait basically implements exclusive mode -- because
swake_up() only wakes a single waiter. And because of that it should
take care to properly deal with the interruptible case.

In short, the problem is that swake_up() can race with a signal. In
this this case it is possible the swake_up() 'wakes' the waiter that
is already on the way out because it just got a signal and the wakeup
gets lost.

The normal wait code is very careful and avoids this situation, make
sure we do too.

Copy the exact exclusive semantics from wait.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: bigeasy@linutronix.de
Cc: oleg@redhat.com
Cc: paulmck@linux.vnet.ibm.com
Cc: pbonzini@redhat.com
Link: https://lkml.kernel.org/r/20180612083909.209762413@infradead.org
---
 include/linux/swait.h | 11 ++++++-----
 kernel/sched/swait.c  | 22 +++++++++++++++++-----
 2 files changed, 23 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swait.h b/include/linux/swait.h
index d6a5e949e4ca..dd032061112d 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -38,8 +38,8 @@
  *    all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
  *    sleeper state.
  *
- *  - the exclusive mode; because this requires preserving the list order
- *    and this is hard.
+ *  - the !exclusive mode; because that leads to O(n) wakeups, everything is
+ *    exclusive.
  *
  *  - custom wake callback functions; because you cannot give any guarantees
  *    about random code. This also allows swait to be used in RT, such that
@@ -167,9 +167,10 @@ extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queu
 extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
 extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
 
-/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
+/* as per ___wait_event() but for swait, therefore "exclusive == 1" */
 #define ___swait_event(wq, condition, state, ret, cmd)			\
 ({									\
+	__label__ __out;						\
 	struct swait_queue __wait;					\
 	long __ret = ret;						\
 									\
@@ -182,13 +183,13 @@ extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
 									\
 		if (___wait_is_interruptible(state) && __int) {		\
 			__ret = __int;					\
-			break;						\
+			goto __out;					\
 		}							\
 									\
 		cmd;							\
 	}								\
 	finish_swait(&wq, &__wait);					\
-	__ret;								\
+__out:	__ret;								\
 })
 
 #define __swait_event(wq, condition)					\
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index e68bb1398b05..66890de93ee5 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -73,7 +73,7 @@ static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *w
 {
 	wait->task = current;
 	if (list_empty(&wait->task_list))
-		list_add(&wait->task_list, &q->task_list);
+		list_add_tail(&wait->task_list, &q->task_list);
 }
 
 void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
@@ -89,12 +89,24 @@ EXPORT_SYMBOL(prepare_to_swait);
 
 long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
 {
-	if (signal_pending_state(state, current))
-		return -ERESTARTSYS;
+	unsigned long flags;
+	long ret = 0;
 
-	prepare_to_swait(q, wait, state);
+	raw_spin_lock_irqsave(&q->lock, flags);
+	if (unlikely(signal_pending_state(state, current))) {
+		/*
+		 * See prepare_to_wait_event(). TL;DR, subsequent swake_up()
+		 * must not see us.
+		 */
+		list_del_init(&wait->task_list);
+		ret = -ERESTARTSYS;
+	} else {
+		__prepare_to_swait(q, wait);
+		set_current_state(state);
+	}
+	raw_spin_unlock_irqrestore(&q->lock, flags);
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(prepare_to_swait_event);
 
-- 
cgit v1.2.3


From b3dae109fa89d67334bf3349babab3ad9b6f233f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 12 Jun 2018 10:34:52 +0200
Subject: sched/swait: Rename to exclusive

Since swait basically implemented exclusive waits only, make sure
the API reflects that.

  $ git grep -l -e "\<swake_up\>"
		-e "\<swait_event[^ (]*"
		-e "\<prepare_to_swait\>" | while read file;
    do
	sed -i -e 's/\<swake_up\>/&_one/g'
	       -e 's/\<swait_event[^ (]*/&_exclusive/g'
	       -e 's/\<prepare_to_swait\>/&_exclusive/g' $file;
    done

With a few manual touch-ups.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: bigeasy@linutronix.de
Cc: oleg@redhat.com
Cc: paulmck@linux.vnet.ibm.com
Cc: pbonzini@redhat.com
Link: https://lkml.kernel.org/r/20180612083909.261946548@infradead.org
---
 arch/mips/kvm/mips.c         |  4 ++--
 arch/powerpc/kvm/book3s_hv.c |  6 +++---
 arch/s390/kvm/interrupt.c    |  2 +-
 arch/x86/kernel/kvm.c        |  4 ++--
 arch/x86/kvm/lapic.c         |  2 +-
 include/linux/swait.h        | 24 ++++++++++++------------
 kernel/power/suspend.c       |  4 ++--
 kernel/rcu/srcutiny.c        |  4 ++--
 kernel/rcu/tree.c            |  8 ++++----
 kernel/rcu/tree_exp.h        |  4 ++--
 kernel/rcu/tree_plugin.h     | 12 ++++++------
 kernel/sched/swait.c         | 10 +++++-----
 virt/kvm/arm/arm.c           |  4 ++--
 virt/kvm/arm/psci.c          |  2 +-
 virt/kvm/async_pf.c          |  2 +-
 virt/kvm/kvm_main.c          |  4 ++--
 16 files changed, 48 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 7cd76f93a438..f7ea8e21656b 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -515,7 +515,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 	dvcpu->arch.wait = 0;
 
 	if (swq_has_sleeper(&dvcpu->wq))
-		swake_up(&dvcpu->wq);
+		swake_up_one(&dvcpu->wq);
 
 	return 0;
 }
@@ -1204,7 +1204,7 @@ static void kvm_mips_comparecount_func(unsigned long data)
 
 	vcpu->arch.wait = 0;
 	if (swq_has_sleeper(&vcpu->wq))
-		swake_up(&vcpu->wq);
+		swake_up_one(&vcpu->wq);
 }
 
 /* low level hrtimer wake routine */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index de686b340f4a..ee4a8854985e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -216,7 +216,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 
 	wqp = kvm_arch_vcpu_wq(vcpu);
 	if (swq_has_sleeper(wqp)) {
-		swake_up(wqp);
+		swake_up_one(wqp);
 		++vcpu->stat.halt_wakeup;
 	}
 
@@ -3188,7 +3188,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 		}
 	}
 
-	prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE);
 
 	if (kvmppc_vcore_check_block(vc)) {
 		finish_swait(&vc->wq, &wait);
@@ -3311,7 +3311,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			kvmppc_start_thread(vcpu, vc);
 			trace_kvm_guest_enter(vcpu);
 		} else if (vc->vcore_state == VCORE_SLEEPING) {
-			swake_up(&vc->wq);
+			swake_up_one(&vc->wq);
 		}
 
 	}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index daa09f89ca2d..fcb55b02990e 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1145,7 +1145,7 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
 		 * yield-candidate.
 		 */
 		vcpu->preempted = true;
-		swake_up(&vcpu->wq);
+		swake_up_one(&vcpu->wq);
 		vcpu->stat.halt_wakeup++;
 	}
 	/*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5b2300b818af..a37bda38d205 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
 
 	for (;;) {
 		if (!n.halted)
-			prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+			prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
 		if (hlist_unhashed(&n.link))
 			break;
 
@@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
 	if (n->halted)
 		smp_send_reschedule(n->cpu);
 	else if (swq_has_sleeper(&n->wq))
-		swake_up(&n->wq);
+		swake_up_one(&n->wq);
 }
 
 static void apf_task_wake_all(void)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b5cd8465d44f..d536d457517b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1379,7 +1379,7 @@ static void apic_timer_expired(struct kvm_lapic *apic)
 	 * using swait_active() is safe.
 	 */
 	if (swait_active(q))
-		swake_up(q);
+		swake_up_one(q);
 
 	if (apic_lvtt_tscdeadline(apic))
 		ktimer->expired_tscdeadline = ktimer->tscdeadline;
diff --git a/include/linux/swait.h b/include/linux/swait.h
index dd032061112d..73e06e9986d4 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -16,7 +16,7 @@
  * wait-queues, but the semantics are actually completely different, and
  * every single user we have ever had has been buggy (or pointless).
  *
- * A "swake_up()" only wakes up _one_ waiter, which is not at all what
+ * A "swake_up_one()" only wakes up _one_ waiter, which is not at all what
  * "wake_up()" does, and has led to problems. In other cases, it has
  * been fine, because there's only ever one waiter (kvm), but in that
  * case gthe whole "simple" wait-queue is just pointless to begin with,
@@ -115,7 +115,7 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name
  *      CPU0 - waker                    CPU1 - waiter
  *
  *                                      for (;;) {
- *      @cond = true;                     prepare_to_swait(&wq_head, &wait, state);
+ *      @cond = true;                     prepare_to_swait_exclusive(&wq_head, &wait, state);
  *      smp_mb();                         // smp_mb() from set_current_state()
  *      if (swait_active(wq_head))        if (@cond)
  *        wake_up(wq_head);                      break;
@@ -157,11 +157,11 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
 	return swait_active(wq);
 }
 
-extern void swake_up(struct swait_queue_head *q);
+extern void swake_up_one(struct swait_queue_head *q);
 extern void swake_up_all(struct swait_queue_head *q);
 extern void swake_up_locked(struct swait_queue_head *q);
 
-extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
+extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state);
 extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
 
 extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
@@ -196,7 +196,7 @@ __out:	__ret;								\
 	(void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0,	\
 			    schedule())
 
-#define swait_event(wq, condition)					\
+#define swait_event_exclusive(wq, condition)				\
 do {									\
 	if (condition)							\
 		break;							\
@@ -208,7 +208,7 @@ do {									\
 		      TASK_UNINTERRUPTIBLE, timeout,			\
 		      __ret = schedule_timeout(__ret))
 
-#define swait_event_timeout(wq, condition, timeout)			\
+#define swait_event_timeout_exclusive(wq, condition, timeout)		\
 ({									\
 	long __ret = timeout;						\
 	if (!___wait_cond_timeout(condition))				\
@@ -220,7 +220,7 @@ do {									\
 	___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0,		\
 		      schedule())
 
-#define swait_event_interruptible(wq, condition)			\
+#define swait_event_interruptible_exclusive(wq, condition)		\
 ({									\
 	int __ret = 0;							\
 	if (!(condition))						\
@@ -233,7 +233,7 @@ do {									\
 		      TASK_INTERRUPTIBLE, timeout,			\
 		      __ret = schedule_timeout(__ret))
 
-#define swait_event_interruptible_timeout(wq, condition, timeout)	\
+#define swait_event_interruptible_timeout_exclusive(wq, condition, timeout)\
 ({									\
 	long __ret = timeout;						\
 	if (!___wait_cond_timeout(condition))				\
@@ -246,7 +246,7 @@ do {									\
 	(void)___swait_event(wq, condition, TASK_IDLE, 0, schedule())
 
 /**
- * swait_event_idle - wait without system load contribution
+ * swait_event_idle_exclusive - wait without system load contribution
  * @wq: the waitqueue to wait on
  * @condition: a C expression for the event to wait for
  *
@@ -257,7 +257,7 @@ do {									\
  * condition and doesn't want to contribute to system load. Signals are
  * ignored.
  */
-#define swait_event_idle(wq, condition)					\
+#define swait_event_idle_exclusive(wq, condition)			\
 do {									\
 	if (condition)							\
 		break;							\
@@ -270,7 +270,7 @@ do {									\
 		       __ret = schedule_timeout(__ret))
 
 /**
- * swait_event_idle_timeout - wait up to timeout without load contribution
+ * swait_event_idle_timeout_exclusive - wait up to timeout without load contribution
  * @wq: the waitqueue to wait on
  * @condition: a C expression for the event to wait for
  * @timeout: timeout at which we'll give up in jiffies
@@ -288,7 +288,7 @@ do {									\
  * or the remaining jiffies (at least 1) if the @condition evaluated
  * to %true before the @timeout elapsed.
  */
-#define swait_event_idle_timeout(wq, condition, timeout)		\
+#define swait_event_idle_timeout_exclusive(wq, condition, timeout)	\
 ({									\
 	long __ret = timeout;						\
 	if (!___wait_cond_timeout(condition))				\
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 87331565e505..70178f6ffdc4 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -92,7 +92,7 @@ static void s2idle_enter(void)
 	/* Push all the CPUs into the idle loop. */
 	wake_up_all_idle_cpus();
 	/* Make the current CPU wait so it can enter the idle loop too. */
-	swait_event(s2idle_wait_head,
+	swait_event_exclusive(s2idle_wait_head,
 		    s2idle_state == S2IDLE_STATE_WAKE);
 
 	cpuidle_pause();
@@ -160,7 +160,7 @@ void s2idle_wake(void)
 	raw_spin_lock_irqsave(&s2idle_lock, flags);
 	if (s2idle_state > S2IDLE_STATE_NONE) {
 		s2idle_state = S2IDLE_STATE_WAKE;
-		swake_up(&s2idle_wait_head);
+		swake_up_one(&s2idle_wait_head);
 	}
 	raw_spin_unlock_irqrestore(&s2idle_lock, flags);
 }
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 622792abe41a..04fc2ed71af8 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -110,7 +110,7 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 
 	WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
 	if (!newval && READ_ONCE(sp->srcu_gp_waiting))
-		swake_up(&sp->srcu_wq);
+		swake_up_one(&sp->srcu_wq);
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
@@ -140,7 +140,7 @@ void srcu_drive_gp(struct work_struct *wp)
 	idx = sp->srcu_idx;
 	WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
 	WRITE_ONCE(sp->srcu_gp_waiting, true);  /* srcu_read_unlock() wakes! */
-	swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
+	swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
 	WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
 
 	/* Invoke the callbacks we removed above. */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index aa7cade1b9f3..91f888d3b23a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1727,7 +1727,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
 	    !READ_ONCE(rsp->gp_flags) ||
 	    !rsp->gp_kthread)
 		return;
-	swake_up(&rsp->gp_wq);
+	swake_up_one(&rsp->gp_wq);
 }
 
 /*
@@ -2002,7 +2002,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 }
 
 /*
- * Helper function for swait_event_idle() wakeup at force-quiescent-state
+ * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
  * time.
  */
 static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
@@ -2144,7 +2144,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 					       READ_ONCE(rsp->gpnum),
 					       TPS("reqwait"));
 			rsp->gp_state = RCU_GP_WAIT_GPS;
-			swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
+			swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
 						     RCU_GP_FLAG_INIT);
 			rsp->gp_state = RCU_GP_DONE_GPS;
 			/* Locking provides needed memory barrier. */
@@ -2176,7 +2176,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 					       READ_ONCE(rsp->gpnum),
 					       TPS("fqswait"));
 			rsp->gp_state = RCU_GP_WAIT_FQS;
-			ret = swait_event_idle_timeout(rsp->gp_wq,
+			ret = swait_event_idle_timeout_exclusive(rsp->gp_wq,
 					rcu_gp_fqs_check_wake(rsp, &gf), j);
 			rsp->gp_state = RCU_GP_DOING_FQS;
 			/* Locking provides needed memory barriers. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index d40708e8c5d6..d428cc1064c8 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 			if (wake) {
 				smp_mb(); /* EGP done before wake_up(). */
-				swake_up(&rsp->expedited_wq);
+				swake_up_one(&rsp->expedited_wq);
 			}
 			break;
 		}
@@ -518,7 +518,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 	jiffies_start = jiffies;
 
 	for (;;) {
-		ret = swait_event_timeout(
+		ret = swait_event_timeout_exclusive(
 				rsp->expedited_wq,
 				sync_rcu_preempt_exp_done_unlocked(rnp_root),
 				jiffies_stall);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7fd12039e512..ad53d133f709 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1854,8 +1854,8 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
 		WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
 		del_timer(&rdp->nocb_timer);
 		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
-		smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
-		swake_up(&rdp_leader->nocb_wq);
+		smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
+		swake_up_one(&rdp_leader->nocb_wq);
 	} else {
 		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 	}
@@ -2082,7 +2082,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 	 */
 	trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
 	for (;;) {
-		swait_event_interruptible(
+		swait_event_interruptible_exclusive(
 			rnp->nocb_gp_wq[c & 0x1],
 			(d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
 		if (likely(d))
@@ -2111,7 +2111,7 @@ wait_again:
 	/* Wait for callbacks to appear. */
 	if (!rcu_nocb_poll) {
 		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
-		swait_event_interruptible(my_rdp->nocb_wq,
+		swait_event_interruptible_exclusive(my_rdp->nocb_wq,
 				!READ_ONCE(my_rdp->nocb_leader_sleep));
 		raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
 		my_rdp->nocb_leader_sleep = true;
@@ -2176,7 +2176,7 @@ wait_again:
 		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 		if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
 			/* List was empty, so wake up the follower.  */
-			swake_up(&rdp->nocb_wq);
+			swake_up_one(&rdp->nocb_wq);
 		}
 	}
 
@@ -2193,7 +2193,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
 {
 	for (;;) {
 		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
-		swait_event_interruptible(rdp->nocb_wq,
+		swait_event_interruptible_exclusive(rdp->nocb_wq,
 					 READ_ONCE(rdp->nocb_follower_head));
 		if (smp_load_acquire(&rdp->nocb_follower_head)) {
 			/* ^^^ Ensure CB invocation follows _head test. */
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 66890de93ee5..66b59ac77c22 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -32,7 +32,7 @@ void swake_up_locked(struct swait_queue_head *q)
 }
 EXPORT_SYMBOL(swake_up_locked);
 
-void swake_up(struct swait_queue_head *q)
+void swake_up_one(struct swait_queue_head *q)
 {
 	unsigned long flags;
 
@@ -40,7 +40,7 @@ void swake_up(struct swait_queue_head *q)
 	swake_up_locked(q);
 	raw_spin_unlock_irqrestore(&q->lock, flags);
 }
-EXPORT_SYMBOL(swake_up);
+EXPORT_SYMBOL(swake_up_one);
 
 /*
  * Does not allow usage from IRQ disabled, since we must be able to
@@ -76,7 +76,7 @@ static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *w
 		list_add_tail(&wait->task_list, &q->task_list);
 }
 
-void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
+void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state)
 {
 	unsigned long flags;
 
@@ -85,7 +85,7 @@ void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int
 	set_current_state(state);
 	raw_spin_unlock_irqrestore(&q->lock, flags);
 }
-EXPORT_SYMBOL(prepare_to_swait);
+EXPORT_SYMBOL(prepare_to_swait_exclusive);
 
 long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
 {
@@ -95,7 +95,7 @@ long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait
 	raw_spin_lock_irqsave(&q->lock, flags);
 	if (unlikely(signal_pending_state(state, current))) {
 		/*
-		 * See prepare_to_wait_event(). TL;DR, subsequent swake_up()
+		 * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one()
 		 * must not see us.
 		 */
 		list_del_init(&wait->task_list);
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 04e554cae3a2..108250e4d376 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -604,7 +604,7 @@ void kvm_arm_resume_guest(struct kvm *kvm)
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		vcpu->arch.pause = false;
-		swake_up(kvm_arch_vcpu_wq(vcpu));
+		swake_up_one(kvm_arch_vcpu_wq(vcpu));
 	}
 }
 
@@ -612,7 +612,7 @@ static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
 {
 	struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
 
-	swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
+	swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
 				       (!vcpu->arch.pause)));
 
 	if (vcpu->arch.power_off || vcpu->arch.pause) {
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index c95ab4c5a475..9b73d3ad918a 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -155,7 +155,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 	smp_mb();		/* Make sure the above is visible */
 
 	wq = kvm_arch_vcpu_wq(vcpu);
-	swake_up(wq);
+	swake_up_one(wq);
 
 	return PSCI_RET_SUCCESS;
 }
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 57bcb27dcf30..23c2519c5b32 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -107,7 +107,7 @@ static void async_pf_execute(struct work_struct *work)
 	trace_kvm_async_pf_completed(addr, gva);
 
 	if (swq_has_sleeper(&vcpu->wq))
-		swake_up(&vcpu->wq);
+		swake_up_one(&vcpu->wq);
 
 	mmput(mm);
 	kvm_put_kvm(vcpu->kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ada21f47f22b..940a4aed5b2d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2167,7 +2167,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	kvm_arch_vcpu_blocking(vcpu);
 
 	for (;;) {
-		prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
 		if (kvm_vcpu_check_block(vcpu) < 0)
 			break;
@@ -2209,7 +2209,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
 
 	wqp = kvm_arch_vcpu_wq(vcpu);
 	if (swq_has_sleeper(wqp)) {
-		swake_up(wqp);
+		swake_up_one(wqp);
 		++vcpu->stat.halt_wakeup;
 		return true;
 	}
-- 
cgit v1.2.3


From a19741e5e5a9f1f02f8e3c037bde7d73d4bfae9c Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Mon, 28 May 2018 11:47:52 +0200
Subject: dma_buf: remove device parameter from attach callback v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The device parameter is completely unused because it is available in the
attachment structure as well.

v2: fix kerneldoc as well

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/226643/
---
 drivers/dma-buf/dma-buf.c                             |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c             |  3 +--
 drivers/gpu/drm/drm_prime.c                           |  3 +--
 drivers/gpu/drm/udl/udl_dmabuf.c                      |  1 -
 drivers/gpu/drm/vmwgfx/vmwgfx_prime.c                 |  1 -
 drivers/media/common/videobuf2/videobuf2-dma-contig.c |  2 +-
 drivers/media/common/videobuf2/videobuf2-dma-sg.c     |  2 +-
 drivers/media/common/videobuf2/videobuf2-vmalloc.c    |  2 +-
 include/drm/drm_prime.h                               |  2 +-
 include/linux/dma-buf.h                               | 13 ++++++-------
 10 files changed, 13 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 4c45e31258f0..50771063c617 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -568,7 +568,7 @@ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf,
 	mutex_lock(&dmabuf->lock);
 
 	if (dmabuf->ops->attach) {
-		ret = dmabuf->ops->attach(dmabuf, dev, attach);
+		ret = dmabuf->ops->attach(dmabuf, attach);
 		if (ret)
 			goto err_attach;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index 4b584cb75bf4..bbbb4f9578db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -127,7 +127,6 @@ error:
 }
 
 static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,
-				 struct device *target_dev,
 				 struct dma_buf_attachment *attach)
 {
 	struct drm_gem_object *obj = dma_buf->priv;
@@ -135,7 +134,7 @@ static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,
 	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
 	long r;
 
-	r = drm_gem_map_attach(dma_buf, target_dev, attach);
+	r = drm_gem_map_attach(dma_buf, attach);
 	if (r)
 		return r;
 
diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
index 397b46b33739..0055d919070c 100644
--- a/drivers/gpu/drm/drm_prime.c
+++ b/drivers/gpu/drm/drm_prime.c
@@ -186,7 +186,6 @@ static int drm_prime_lookup_buf_handle(struct drm_prime_file_private *prime_fpri
 /**
  * drm_gem_map_attach - dma_buf attach implementation for GEM
  * @dma_buf: buffer to attach device to
- * @target_dev: not used
  * @attach: buffer attachment data
  *
  * Allocates &drm_prime_attachment and calls &drm_driver.gem_prime_pin for
@@ -195,7 +194,7 @@ static int drm_prime_lookup_buf_handle(struct drm_prime_file_private *prime_fpri
  *
  * Returns 0 on success, negative error code on failure.
  */
-int drm_gem_map_attach(struct dma_buf *dma_buf, struct device *target_dev,
+int drm_gem_map_attach(struct dma_buf *dma_buf,
 		       struct dma_buf_attachment *attach)
 {
 	struct drm_prime_attachment *prime_attach;
diff --git a/drivers/gpu/drm/udl/udl_dmabuf.c b/drivers/gpu/drm/udl/udl_dmabuf.c
index 0a20695eb120..91ae60309d3e 100644
--- a/drivers/gpu/drm/udl/udl_dmabuf.c
+++ b/drivers/gpu/drm/udl/udl_dmabuf.c
@@ -29,7 +29,6 @@ struct udl_drm_dmabuf_attachment {
 };
 
 static int udl_attach_dma_buf(struct dma_buf *dmabuf,
-			      struct device *dev,
 			      struct dma_buf_attachment *attach)
 {
 	struct udl_drm_dmabuf_attachment *udl_attach;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c b/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c
index 0d42a46521fc..fbffb37ccf42 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c
@@ -40,7 +40,6 @@
  */
 
 static int vmw_prime_map_attach(struct dma_buf *dma_buf,
-				struct device *target_dev,
 				struct dma_buf_attachment *attach)
 {
 	return -ENOSYS;
diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c
index f1178f6f434d..12d0072c52c2 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c
@@ -222,7 +222,7 @@ struct vb2_dc_attachment {
 	enum dma_data_direction dma_dir;
 };
 
-static int vb2_dc_dmabuf_ops_attach(struct dma_buf *dbuf, struct device *dev,
+static int vb2_dc_dmabuf_ops_attach(struct dma_buf *dbuf,
 	struct dma_buf_attachment *dbuf_attach)
 {
 	struct vb2_dc_attachment *attach;
diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
index 753ed3138dcc..cf94765e593f 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
@@ -371,7 +371,7 @@ struct vb2_dma_sg_attachment {
 	enum dma_data_direction dma_dir;
 };
 
-static int vb2_dma_sg_dmabuf_ops_attach(struct dma_buf *dbuf, struct device *dev,
+static int vb2_dma_sg_dmabuf_ops_attach(struct dma_buf *dbuf,
 	struct dma_buf_attachment *dbuf_attach)
 {
 	struct vb2_dma_sg_attachment *attach;
diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c
index 359fb9804d16..cdec023a918d 100644
--- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c
+++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c
@@ -209,7 +209,7 @@ struct vb2_vmalloc_attachment {
 	enum dma_data_direction dma_dir;
 };
 
-static int vb2_vmalloc_dmabuf_ops_attach(struct dma_buf *dbuf, struct device *dev,
+static int vb2_vmalloc_dmabuf_ops_attach(struct dma_buf *dbuf,
 	struct dma_buf_attachment *dbuf_attach)
 {
 	struct vb2_vmalloc_attachment *attach;
diff --git a/include/drm/drm_prime.h b/include/drm/drm_prime.h
index 4d5f5d6cf6a6..ef338151cea8 100644
--- a/include/drm/drm_prime.h
+++ b/include/drm/drm_prime.h
@@ -82,7 +82,7 @@ int drm_gem_prime_fd_to_handle(struct drm_device *dev,
 struct dma_buf *drm_gem_dmabuf_export(struct drm_device *dev,
 				      struct dma_buf_export_info *exp_info);
 void drm_gem_dmabuf_release(struct dma_buf *dma_buf);
-int drm_gem_map_attach(struct dma_buf *dma_buf, struct device *target_dev,
+int drm_gem_map_attach(struct dma_buf *dma_buf,
 		       struct dma_buf_attachment *attach);
 void drm_gem_map_detach(struct dma_buf *dma_buf,
 			struct dma_buf_attachment *attach);
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 88917fa796e4..c0ad5bf61188 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -55,11 +55,11 @@ struct dma_buf_ops {
 	 * @attach:
 	 *
 	 * This is called from dma_buf_attach() to make sure that a given
-	 * &device can access the provided &dma_buf. Exporters which support
-	 * buffer objects in special locations like VRAM or device-specific
-	 * carveout areas should check whether the buffer could be move to
-	 * system memory (or directly accessed by the provided device), and
-	 * otherwise need to fail the attach operation.
+	 * &dma_buf_attachment.dev can access the provided &dma_buf. Exporters
+	 * which support buffer objects in special locations like VRAM or
+	 * device-specific carveout areas should check whether the buffer could
+	 * be move to system memory (or directly accessed by the provided
+	 * device), and otherwise need to fail the attach operation.
 	 *
 	 * The exporter should also in general check whether the current
 	 * allocation fullfills the DMA constraints of the new device. If this
@@ -77,8 +77,7 @@ struct dma_buf_ops {
 	 * to signal that backing storage is already allocated and incompatible
 	 * with the requirements of requesting device.
 	 */
-	int (*attach)(struct dma_buf *, struct device *,
-		      struct dma_buf_attachment *);
+	int (*attach)(struct dma_buf *, struct dma_buf_attachment *);
 
 	/**
 	 * @detach:
-- 
cgit v1.2.3


From f664a52695429b68afb4e130a0f69cd5fd1fec86 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Mon, 28 May 2018 13:34:01 +0200
Subject: dma-buf: remove kmap_atomic interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Neither used nor correctly implemented anywhere. Just completely remove
the interface.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Acked-by: Sumit Semwal <sumit.semwal@linaro.org>
Link: https://patchwork.freedesktop.org/patch/226645/
---
 drivers/dma-buf/dma-buf.c                          | 54 +---------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c          |  2 -
 drivers/gpu/drm/armada/armada_gem.c                |  2 -
 drivers/gpu/drm/drm_prime.c                        | 31 -------------
 drivers/gpu/drm/i915/i915_gem_dmabuf.c             | 11 -----
 drivers/gpu/drm/i915/selftests/mock_dmabuf.c       |  2 -
 drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c          |  2 -
 drivers/gpu/drm/tegra/gem.c                        | 14 ------
 drivers/gpu/drm/udl/udl_dmabuf.c                   | 17 -------
 drivers/gpu/drm/vmwgfx/vmwgfx_prime.c              | 13 ------
 .../media/common/videobuf2/videobuf2-dma-contig.c  |  1 -
 drivers/media/common/videobuf2/videobuf2-dma-sg.c  |  1 -
 drivers/media/common/videobuf2/videobuf2-vmalloc.c |  1 -
 drivers/staging/android/ion/ion.c                  |  2 -
 drivers/tee/tee_shm.c                              |  6 ---
 include/drm/drm_prime.h                            |  4 --
 include/linux/dma-buf.h                            |  4 --
 17 files changed, 2 insertions(+), 165 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 50771063c617..13884474d158 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -405,7 +405,6 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info)
 			  || !exp_info->ops->map_dma_buf
 			  || !exp_info->ops->unmap_dma_buf
 			  || !exp_info->ops->release
-			  || !exp_info->ops->map_atomic
 			  || !exp_info->ops->map
 			  || !exp_info->ops->mmap)) {
 		return ERR_PTR(-EINVAL);
@@ -687,26 +686,14 @@ EXPORT_SYMBOL_GPL(dma_buf_unmap_attachment);
  *      void \*dma_buf_kmap(struct dma_buf \*, unsigned long);
  *      void dma_buf_kunmap(struct dma_buf \*, unsigned long, void \*);
  *
- *   There are also atomic variants of these interfaces. Like for kmap they
- *   facilitate non-blocking fast-paths. Neither the importer nor the exporter
- *   (in the callback) is allowed to block when using these.
- *
- *   Interfaces::
- *      void \*dma_buf_kmap_atomic(struct dma_buf \*, unsigned long);
- *      void dma_buf_kunmap_atomic(struct dma_buf \*, unsigned long, void \*);
- *
- *   For importers all the restrictions of using kmap apply, like the limited
- *   supply of kmap_atomic slots. Hence an importer shall only hold onto at
- *   max 2 atomic dma_buf kmaps at the same time (in any given process context).
+ *   Implementing the functions is optional for exporters and for importers all
+ *   the restrictions of using kmap apply.
  *
  *   dma_buf kmap calls outside of the range specified in begin_cpu_access are
  *   undefined. If the range is not PAGE_SIZE aligned, kmap needs to succeed on
  *   the partial chunks at the beginning and end but may return stale or bogus
  *   data outside of the range (in these partial chunks).
  *
- *   Note that these calls need to always succeed. The exporter needs to
- *   complete any preparations that might fail in begin_cpu_access.
- *
  *   For some cases the overhead of kmap can be too high, a vmap interface
  *   is introduced. This interface should be used very carefully, as vmalloc
  *   space is a limited resources on many architectures.
@@ -859,43 +846,6 @@ int dma_buf_end_cpu_access(struct dma_buf *dmabuf,
 }
 EXPORT_SYMBOL_GPL(dma_buf_end_cpu_access);
 
-/**
- * dma_buf_kmap_atomic - Map a page of the buffer object into kernel address
- * space. The same restrictions as for kmap_atomic and friends apply.
- * @dmabuf:	[in]	buffer to map page from.
- * @page_num:	[in]	page in PAGE_SIZE units to map.
- *
- * This call must always succeed, any necessary preparations that might fail
- * need to be done in begin_cpu_access.
- */
-void *dma_buf_kmap_atomic(struct dma_buf *dmabuf, unsigned long page_num)
-{
-	WARN_ON(!dmabuf);
-
-	if (!dmabuf->ops->map_atomic)
-		return NULL;
-	return dmabuf->ops->map_atomic(dmabuf, page_num);
-}
-EXPORT_SYMBOL_GPL(dma_buf_kmap_atomic);
-
-/**
- * dma_buf_kunmap_atomic - Unmap a page obtained by dma_buf_kmap_atomic.
- * @dmabuf:	[in]	buffer to unmap page from.
- * @page_num:	[in]	page in PAGE_SIZE units to unmap.
- * @vaddr:	[in]	kernel space pointer obtained from dma_buf_kmap_atomic.
- *
- * This call must always succeed.
- */
-void dma_buf_kunmap_atomic(struct dma_buf *dmabuf, unsigned long page_num,
-			   void *vaddr)
-{
-	WARN_ON(!dmabuf);
-
-	if (dmabuf->ops->unmap_atomic)
-		dmabuf->ops->unmap_atomic(dmabuf, page_num, vaddr);
-}
-EXPORT_SYMBOL_GPL(dma_buf_kunmap_atomic);
-
 /**
  * dma_buf_kmap - Map a page of the buffer object into kernel address space. The
  * same restrictions as for kmap and friends apply.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index bbbb4f9578db..0558da049827 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -238,9 +238,7 @@ static const struct dma_buf_ops amdgpu_dmabuf_ops = {
 	.release = drm_gem_dmabuf_release,
 	.begin_cpu_access = amdgpu_gem_begin_cpu_access,
 	.map = drm_gem_dmabuf_kmap,
-	.map_atomic = drm_gem_dmabuf_kmap_atomic,
 	.unmap = drm_gem_dmabuf_kunmap,
-	.unmap_atomic = drm_gem_dmabuf_kunmap_atomic,
 	.mmap = drm_gem_dmabuf_mmap,
 	.vmap = drm_gem_dmabuf_vmap,
 	.vunmap = drm_gem_dmabuf_vunmap,
diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
index a97f509743a5..3fb37c75c065 100644
--- a/drivers/gpu/drm/armada/armada_gem.c
+++ b/drivers/gpu/drm/armada/armada_gem.c
@@ -490,8 +490,6 @@ static const struct dma_buf_ops armada_gem_prime_dmabuf_ops = {
 	.map_dma_buf	= armada_gem_prime_map_dma_buf,
 	.unmap_dma_buf	= armada_gem_prime_unmap_dma_buf,
 	.release	= drm_gem_dmabuf_release,
-	.map_atomic	= armada_gem_dmabuf_no_kmap,
-	.unmap_atomic	= armada_gem_dmabuf_no_kunmap,
 	.map		= armada_gem_dmabuf_no_kmap,
 	.unmap		= armada_gem_dmabuf_no_kunmap,
 	.mmap		= armada_gem_dmabuf_mmap,
diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
index 0055d919070c..186db2e4c57a 100644
--- a/drivers/gpu/drm/drm_prime.c
+++ b/drivers/gpu/drm/drm_prime.c
@@ -433,35 +433,6 @@ void drm_gem_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr)
 }
 EXPORT_SYMBOL(drm_gem_dmabuf_vunmap);
 
-/**
- * drm_gem_dmabuf_kmap_atomic - map_atomic implementation for GEM
- * @dma_buf: buffer to be mapped
- * @page_num: page number within the buffer
- *
- * Not implemented. This can be used as the &dma_buf_ops.map_atomic callback.
- */
-void *drm_gem_dmabuf_kmap_atomic(struct dma_buf *dma_buf,
-				 unsigned long page_num)
-{
-	return NULL;
-}
-EXPORT_SYMBOL(drm_gem_dmabuf_kmap_atomic);
-
-/**
- * drm_gem_dmabuf_kunmap_atomic - unmap_atomic implementation for GEM
- * @dma_buf: buffer to be unmapped
- * @page_num: page number within the buffer
- * @addr: virtual address of the buffer
- *
- * Not implemented. This can be used as the &dma_buf_ops.unmap_atomic callback.
- */
-void drm_gem_dmabuf_kunmap_atomic(struct dma_buf *dma_buf,
-				  unsigned long page_num, void *addr)
-{
-
-}
-EXPORT_SYMBOL(drm_gem_dmabuf_kunmap_atomic);
-
 /**
  * drm_gem_dmabuf_kmap - map implementation for GEM
  * @dma_buf: buffer to be mapped
@@ -519,9 +490,7 @@ static const struct dma_buf_ops drm_gem_prime_dmabuf_ops =  {
 	.unmap_dma_buf = drm_gem_unmap_dma_buf,
 	.release = drm_gem_dmabuf_release,
 	.map = drm_gem_dmabuf_kmap,
-	.map_atomic = drm_gem_dmabuf_kmap_atomic,
 	.unmap = drm_gem_dmabuf_kunmap,
-	.unmap_atomic = drm_gem_dmabuf_kunmap_atomic,
 	.mmap = drm_gem_dmabuf_mmap,
 	.vmap = drm_gem_dmabuf_vmap,
 	.vunmap = drm_gem_dmabuf_vunmap,
diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c
index 69a7aec49e84..82e2ca17a441 100644
--- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c
@@ -111,15 +111,6 @@ static void i915_gem_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr)
 	i915_gem_object_unpin_map(obj);
 }
 
-static void *i915_gem_dmabuf_kmap_atomic(struct dma_buf *dma_buf, unsigned long page_num)
-{
-	return NULL;
-}
-
-static void i915_gem_dmabuf_kunmap_atomic(struct dma_buf *dma_buf, unsigned long page_num, void *addr)
-{
-
-}
 static void *i915_gem_dmabuf_kmap(struct dma_buf *dma_buf, unsigned long page_num)
 {
 	struct drm_i915_gem_object *obj = dma_buf_to_obj(dma_buf);
@@ -225,9 +216,7 @@ static const struct dma_buf_ops i915_dmabuf_ops =  {
 	.unmap_dma_buf = i915_gem_unmap_dma_buf,
 	.release = drm_gem_dmabuf_release,
 	.map = i915_gem_dmabuf_kmap,
-	.map_atomic = i915_gem_dmabuf_kmap_atomic,
 	.unmap = i915_gem_dmabuf_kunmap,
-	.unmap_atomic = i915_gem_dmabuf_kunmap_atomic,
 	.mmap = i915_gem_dmabuf_mmap,
 	.vmap = i915_gem_dmabuf_vmap,
 	.vunmap = i915_gem_dmabuf_vunmap,
diff --git a/drivers/gpu/drm/i915/selftests/mock_dmabuf.c b/drivers/gpu/drm/i915/selftests/mock_dmabuf.c
index 302f7d103635..f81fda8ea45e 100644
--- a/drivers/gpu/drm/i915/selftests/mock_dmabuf.c
+++ b/drivers/gpu/drm/i915/selftests/mock_dmabuf.c
@@ -130,9 +130,7 @@ static const struct dma_buf_ops mock_dmabuf_ops =  {
 	.unmap_dma_buf = mock_unmap_dma_buf,
 	.release = mock_dmabuf_release,
 	.map = mock_dmabuf_kmap,
-	.map_atomic = mock_dmabuf_kmap_atomic,
 	.unmap = mock_dmabuf_kunmap,
-	.unmap_atomic = mock_dmabuf_kunmap_atomic,
 	.mmap = mock_dmabuf_mmap,
 	.vmap = mock_dmabuf_vmap,
 	.vunmap = mock_dmabuf_vunmap,
diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
index 8e41d649e248..1a073f9b2834 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c
@@ -148,8 +148,6 @@ static const struct dma_buf_ops omap_dmabuf_ops = {
 	.release = drm_gem_dmabuf_release,
 	.begin_cpu_access = omap_gem_dmabuf_begin_cpu_access,
 	.end_cpu_access = omap_gem_dmabuf_end_cpu_access,
-	.map_atomic = omap_gem_dmabuf_kmap_atomic,
-	.unmap_atomic = omap_gem_dmabuf_kunmap_atomic,
 	.map = omap_gem_dmabuf_kmap,
 	.unmap = omap_gem_dmabuf_kunmap,
 	.mmap = omap_gem_dmabuf_mmap,
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index 8b0b4ff64bb4..d7661702c11c 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -596,18 +596,6 @@ static int tegra_gem_prime_end_cpu_access(struct dma_buf *buf,
 	return 0;
 }
 
-static void *tegra_gem_prime_kmap_atomic(struct dma_buf *buf,
-					 unsigned long page)
-{
-	return NULL;
-}
-
-static void tegra_gem_prime_kunmap_atomic(struct dma_buf *buf,
-					  unsigned long page,
-					  void *addr)
-{
-}
-
 static void *tegra_gem_prime_kmap(struct dma_buf *buf, unsigned long page)
 {
 	return NULL;
@@ -648,8 +636,6 @@ static const struct dma_buf_ops tegra_gem_prime_dmabuf_ops = {
 	.release = tegra_gem_prime_release,
 	.begin_cpu_access = tegra_gem_prime_begin_cpu_access,
 	.end_cpu_access = tegra_gem_prime_end_cpu_access,
-	.map_atomic = tegra_gem_prime_kmap_atomic,
-	.unmap_atomic = tegra_gem_prime_kunmap_atomic,
 	.map = tegra_gem_prime_kmap,
 	.unmap = tegra_gem_prime_kunmap,
 	.mmap = tegra_gem_prime_mmap,
diff --git a/drivers/gpu/drm/udl/udl_dmabuf.c b/drivers/gpu/drm/udl/udl_dmabuf.c
index 91ae60309d3e..556f62662aa9 100644
--- a/drivers/gpu/drm/udl/udl_dmabuf.c
+++ b/drivers/gpu/drm/udl/udl_dmabuf.c
@@ -157,27 +157,12 @@ static void *udl_dmabuf_kmap(struct dma_buf *dma_buf, unsigned long page_num)
 	return NULL;
 }
 
-static void *udl_dmabuf_kmap_atomic(struct dma_buf *dma_buf,
-				    unsigned long page_num)
-{
-	/* TODO */
-
-	return NULL;
-}
-
 static void udl_dmabuf_kunmap(struct dma_buf *dma_buf,
 			      unsigned long page_num, void *addr)
 {
 	/* TODO */
 }
 
-static void udl_dmabuf_kunmap_atomic(struct dma_buf *dma_buf,
-				     unsigned long page_num,
-				     void *addr)
-{
-	/* TODO */
-}
-
 static int udl_dmabuf_mmap(struct dma_buf *dma_buf,
 			   struct vm_area_struct *vma)
 {
@@ -192,9 +177,7 @@ static const struct dma_buf_ops udl_dmabuf_ops = {
 	.map_dma_buf		= udl_map_dma_buf,
 	.unmap_dma_buf		= udl_unmap_dma_buf,
 	.map			= udl_dmabuf_kmap,
-	.map_atomic		= udl_dmabuf_kmap_atomic,
 	.unmap			= udl_dmabuf_kunmap,
-	.unmap_atomic		= udl_dmabuf_kunmap_atomic,
 	.mmap			= udl_dmabuf_mmap,
 	.release		= drm_gem_dmabuf_release,
 };
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c b/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c
index fbffb37ccf42..373bc6da2f84 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c
@@ -71,17 +71,6 @@ static void vmw_prime_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr)
 {
 }
 
-static void *vmw_prime_dmabuf_kmap_atomic(struct dma_buf *dma_buf,
-		unsigned long page_num)
-{
-	return NULL;
-}
-
-static void vmw_prime_dmabuf_kunmap_atomic(struct dma_buf *dma_buf,
-		unsigned long page_num, void *addr)
-{
-
-}
 static void *vmw_prime_dmabuf_kmap(struct dma_buf *dma_buf,
 		unsigned long page_num)
 {
@@ -108,9 +97,7 @@ const struct dma_buf_ops vmw_prime_dmabuf_ops =  {
 	.unmap_dma_buf = vmw_prime_unmap_dma_buf,
 	.release = NULL,
 	.map = vmw_prime_dmabuf_kmap,
-	.map_atomic = vmw_prime_dmabuf_kmap_atomic,
 	.unmap = vmw_prime_dmabuf_kunmap,
-	.unmap_atomic = vmw_prime_dmabuf_kunmap_atomic,
 	.mmap = vmw_prime_dmabuf_mmap,
 	.vmap = vmw_prime_dmabuf_vmap,
 	.vunmap = vmw_prime_dmabuf_vunmap,
diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c
index 12d0072c52c2..aff0ab7bf83d 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c
@@ -358,7 +358,6 @@ static const struct dma_buf_ops vb2_dc_dmabuf_ops = {
 	.map_dma_buf = vb2_dc_dmabuf_ops_map,
 	.unmap_dma_buf = vb2_dc_dmabuf_ops_unmap,
 	.map = vb2_dc_dmabuf_ops_kmap,
-	.map_atomic = vb2_dc_dmabuf_ops_kmap,
 	.vmap = vb2_dc_dmabuf_ops_vmap,
 	.mmap = vb2_dc_dmabuf_ops_mmap,
 	.release = vb2_dc_dmabuf_ops_release,
diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
index cf94765e593f..015e737095cd 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
@@ -507,7 +507,6 @@ static const struct dma_buf_ops vb2_dma_sg_dmabuf_ops = {
 	.map_dma_buf = vb2_dma_sg_dmabuf_ops_map,
 	.unmap_dma_buf = vb2_dma_sg_dmabuf_ops_unmap,
 	.map = vb2_dma_sg_dmabuf_ops_kmap,
-	.map_atomic = vb2_dma_sg_dmabuf_ops_kmap,
 	.vmap = vb2_dma_sg_dmabuf_ops_vmap,
 	.mmap = vb2_dma_sg_dmabuf_ops_mmap,
 	.release = vb2_dma_sg_dmabuf_ops_release,
diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c
index cdec023a918d..6dfbd5b05907 100644
--- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c
+++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c
@@ -346,7 +346,6 @@ static const struct dma_buf_ops vb2_vmalloc_dmabuf_ops = {
 	.map_dma_buf = vb2_vmalloc_dmabuf_ops_map,
 	.unmap_dma_buf = vb2_vmalloc_dmabuf_ops_unmap,
 	.map = vb2_vmalloc_dmabuf_ops_kmap,
-	.map_atomic = vb2_vmalloc_dmabuf_ops_kmap,
 	.vmap = vb2_vmalloc_dmabuf_ops_vmap,
 	.mmap = vb2_vmalloc_dmabuf_ops_mmap,
 	.release = vb2_vmalloc_dmabuf_ops_release,
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index e74db7902549..a5220445b5e8 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -369,8 +369,6 @@ static const struct dma_buf_ops dma_buf_ops = {
 	.detach = ion_dma_buf_detatch,
 	.begin_cpu_access = ion_dma_buf_begin_cpu_access,
 	.end_cpu_access = ion_dma_buf_end_cpu_access,
-	.map_atomic = ion_dma_buf_kmap,
-	.unmap_atomic = ion_dma_buf_kunmap,
 	.map = ion_dma_buf_kmap,
 	.unmap = ion_dma_buf_kunmap,
 };
diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 556960a1bab3..df4a1553b78b 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -80,11 +80,6 @@ static void tee_shm_op_release(struct dma_buf *dmabuf)
 	tee_shm_release(shm);
 }
 
-static void *tee_shm_op_map_atomic(struct dma_buf *dmabuf, unsigned long pgnum)
-{
-	return NULL;
-}
-
 static void *tee_shm_op_map(struct dma_buf *dmabuf, unsigned long pgnum)
 {
 	return NULL;
@@ -107,7 +102,6 @@ static const struct dma_buf_ops tee_shm_dma_buf_ops = {
 	.map_dma_buf = tee_shm_op_map_dma_buf,
 	.unmap_dma_buf = tee_shm_op_unmap_dma_buf,
 	.release = tee_shm_op_release,
-	.map_atomic = tee_shm_op_map_atomic,
 	.map = tee_shm_op_map,
 	.mmap = tee_shm_op_mmap,
 };
diff --git a/include/drm/drm_prime.h b/include/drm/drm_prime.h
index ef338151cea8..d716d653b096 100644
--- a/include/drm/drm_prime.h
+++ b/include/drm/drm_prime.h
@@ -93,10 +93,6 @@ void drm_gem_unmap_dma_buf(struct dma_buf_attachment *attach,
 			   enum dma_data_direction dir);
 void *drm_gem_dmabuf_vmap(struct dma_buf *dma_buf);
 void drm_gem_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr);
-void *drm_gem_dmabuf_kmap_atomic(struct dma_buf *dma_buf,
-				 unsigned long page_num);
-void drm_gem_dmabuf_kunmap_atomic(struct dma_buf *dma_buf,
-				  unsigned long page_num, void *addr);
 void *drm_gem_dmabuf_kmap(struct dma_buf *dma_buf, unsigned long page_num);
 void drm_gem_dmabuf_kunmap(struct dma_buf *dma_buf, unsigned long page_num,
 			   void *addr);
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index c0ad5bf61188..58725f890b5b 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -205,8 +205,6 @@ struct dma_buf_ops {
 	 * to be restarted.
 	 */
 	int (*end_cpu_access)(struct dma_buf *, enum dma_data_direction);
-	void *(*map_atomic)(struct dma_buf *, unsigned long);
-	void (*unmap_atomic)(struct dma_buf *, unsigned long, void *);
 	void *(*map)(struct dma_buf *, unsigned long);
 	void (*unmap)(struct dma_buf *, unsigned long, void *);
 
@@ -394,8 +392,6 @@ int dma_buf_begin_cpu_access(struct dma_buf *dma_buf,
 			     enum dma_data_direction dir);
 int dma_buf_end_cpu_access(struct dma_buf *dma_buf,
 			   enum dma_data_direction dir);
-void *dma_buf_kmap_atomic(struct dma_buf *, unsigned long);
-void dma_buf_kunmap_atomic(struct dma_buf *, unsigned long, void *);
 void *dma_buf_kmap(struct dma_buf *, unsigned long);
 void dma_buf_kunmap(struct dma_buf *, unsigned long, void *);
 
-- 
cgit v1.2.3


From c03cea42149de56fbae2301d7123daaa2cfe80e2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 Jun 2018 15:10:58 -0700
Subject: iomap: add initial support for writes without buffer heads

For now just limited to blocksize == PAGE_SIZE, where we can simply read
in the full page in write begin, and just set the whole page dirty after
copying data into it.  This code is enabled by default and XFS will now
be feed pages without buffer heads in ->writepage and ->writepages.

If a file system sets the IOMAP_F_BUFFER_HEAD flag on the iomap the old
path will still be used, this both helps the transition in XFS and
prepares for the gfs2 migration to the iomap infrastructure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c            | 115 +++++++++++++++++++++++++++++++++++++++++++++++---
 fs/xfs/xfs_iomap.c    |   6 ++-
 include/linux/iomap.h |   2 +
 3 files changed, 114 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/iomap.c b/fs/iomap.c
index 4f10c6b1cf6d..2ebff76039b5 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -348,6 +348,48 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
 }
 
+static int
+iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
+		unsigned poff, unsigned plen, unsigned from, unsigned to,
+		struct iomap *iomap)
+{
+	struct bio_vec bvec;
+	struct bio bio;
+
+	if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
+		zero_user_segments(page, poff, from, to, poff + plen);
+		return 0;
+	}
+
+	bio_init(&bio, &bvec, 1);
+	bio.bi_opf = REQ_OP_READ;
+	bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
+	bio_set_dev(&bio, iomap->bdev);
+	__bio_add_page(&bio, page, plen, poff);
+	return submit_bio_wait(&bio);
+}
+
+static int
+__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
+		struct page *page, struct iomap *iomap)
+{
+	loff_t block_size = i_blocksize(inode);
+	loff_t block_start = pos & ~(block_size - 1);
+	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
+	unsigned poff = block_start & (PAGE_SIZE - 1);
+	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start);
+	unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
+
+	WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+
+	if (PageUptodate(page))
+		return 0;
+	if (from <= poff && to >= poff + plen)
+		return 0;
+	return iomap_read_page_sync(inode, block_start, page,
+			poff, plen, from, to, iomap);
+}
+
 static int
 iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, struct iomap *iomap)
@@ -367,9 +409,10 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 
 	if (iomap->type == IOMAP_INLINE)
 		iomap_read_inline_data(inode, page, iomap);
-	else
+	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
 		status = __block_write_begin_int(page, pos, len, NULL, iomap);
-
+	else
+		status = __iomap_write_begin(inode, pos, len, page, iomap);
 	if (unlikely(status)) {
 		unlock_page(page);
 		put_page(page);
@@ -382,6 +425,57 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 	return status;
 }
 
+int
+iomap_set_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	int newly_dirty;
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	/*
+	 * Lock out page->mem_cgroup migration to keep PageDirty
+	 * synchronized with per-memcg dirty page counters.
+	 */
+	lock_page_memcg(page);
+	newly_dirty = !TestSetPageDirty(page);
+	if (newly_dirty)
+		__set_page_dirty(page, mapping, 0);
+	unlock_page_memcg(page);
+
+	if (newly_dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	return newly_dirty;
+}
+EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
+
+static int
+__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+		unsigned copied, struct page *page, struct iomap *iomap)
+{
+	flush_dcache_page(page);
+
+	/*
+	 * The blocks that were entirely written will now be uptodate, so we
+	 * don't have to worry about a readpage reading them and overwriting a
+	 * partial write.  However if we have encountered a short write and only
+	 * partially written into a block, it will not be marked uptodate, so a
+	 * readpage might come in and destroy our partial write.
+	 *
+	 * Do the simplest thing, and just treat any short write to a non
+	 * uptodate page as a zero-length write, and force the caller to redo
+	 * the whole thing.
+	 */
+	if (unlikely(copied < len && !PageUptodate(page))) {
+		copied = 0;
+	} else {
+		SetPageUptodate(page);
+		iomap_set_page_dirty(page);
+	}
+	return __generic_write_end(inode, pos, copied, page);
+}
+
 static int
 iomap_write_end_inline(struct inode *inode, struct page *page,
 		struct iomap *iomap, loff_t pos, unsigned copied)
@@ -408,9 +502,11 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 
 	if (iomap->type == IOMAP_INLINE) {
 		ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
-	} else {
+	} else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
 		ret = generic_write_end(NULL, inode->i_mapping, pos, len,
 				copied, page, NULL);
+	} else {
+		ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
 	}
 
 	if (iomap->page_done)
@@ -703,11 +799,16 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 	struct page *page = data;
 	int ret;
 
-	ret = __block_write_begin_int(page, pos, length, NULL, iomap);
-	if (ret)
-		return ret;
+	if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
+		ret = __block_write_begin_int(page, pos, length, NULL, iomap);
+		if (ret)
+			return ret;
+		block_commit_write(page, 0, length);
+	} else {
+		WARN_ON_ONCE(!PageUptodate(page));
+		WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+	}
 
-	block_commit_write(page, 0, length);
 	return length;
 }
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 49f5492eed3b..8a3613d576af 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -626,7 +626,7 @@ retry:
 	 * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
 	 * them out if the write happens to fail.
 	 */
-	iomap->flags = IOMAP_F_NEW;
+	iomap->flags |= IOMAP_F_NEW;
 	trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
 done:
 	if (isnullstartblock(got.br_startblock))
@@ -1019,6 +1019,8 @@ xfs_file_iomap_begin(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
+	iomap->flags |= IOMAP_F_BUFFER_HEAD;
+
 	if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
 			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
 		/* Reserve delalloc blocks for regular writeback. */
@@ -1119,7 +1121,7 @@ xfs_file_iomap_begin(
 	if (error)
 		return error;
 
-	iomap->flags = IOMAP_F_NEW;
+	iomap->flags |= IOMAP_F_NEW;
 	trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
 
 out_finish:
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 30d314407f66..5eb9ca8d7ce5 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -30,6 +30,7 @@ struct vm_fault;
  */
 #define IOMAP_F_NEW		0x01	/* blocks have been newly allocated */
 #define IOMAP_F_DIRTY		0x02	/* uncommitted metadata */
+#define IOMAP_F_BUFFER_HEAD	0x04	/* file system requires buffer heads */
 
 /*
  * Flags that only need to be reported for IOMAP_REPORT requests:
@@ -102,6 +103,7 @@ ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 int iomap_readpage(struct page *page, const struct iomap_ops *ops);
 int iomap_readpages(struct address_space *mapping, struct list_head *pages,
 		unsigned nr_pages, const struct iomap_ops *ops);
+int iomap_set_page_dirty(struct page *page);
 int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
-- 
cgit v1.2.3


From 17dbca119312b4e8173d4e25ff64262119fcef38 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 13 Jun 2018 15:48:26 -0700
Subject: x86/speculation/l1tf: Add sysfs reporting for l1tf

L1TF core kernel workarounds are cheap and normally always enabled, However
they still should be reported in sysfs if the system is vulnerable or
mitigated. Add the necessary CPU feature/bug bits.

- Extend the existing checks for Meltdowns to determine if the system is
  vulnerable. All CPUs which are not vulnerable to Meltdown are also not
  vulnerable to L1TF

- Check for 32bit non PAE and emit a warning as there is no practical way
  for mitigation due to the limited physical address bits

- If the system has more than MAX_PA/2 physical memory the invert page
  workarounds don't protect the system against the L1TF attack anymore,
  because an inverted physical address will also point to valid
  memory. Print a warning in this case and report that the system is
  vulnerable.

Add a function which returns the PFN limit for the L1TF mitigation, which
will be used in follow up patches for sanity and range checks.

[ tglx: Renamed the CPU feature bit to L1TF_PTEINV ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
---
 arch/x86/include/asm/cpufeatures.h |  2 ++
 arch/x86/include/asm/processor.h   |  5 +++++
 arch/x86/kernel/cpu/bugs.c         | 40 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/common.c       | 20 +++++++++++++++++++
 drivers/base/cpu.c                 |  8 ++++++++
 include/linux/cpu.h                |  2 ++
 6 files changed, 77 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 5701f5cecd31..f41cf9df4a83 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -219,6 +219,7 @@
 #define X86_FEATURE_IBPB		( 7*32+26) /* Indirect Branch Prediction Barrier */
 #define X86_FEATURE_STIBP		( 7*32+27) /* Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_ZEN			( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
+#define X86_FEATURE_L1TF_PTEINV		( 7*32+29) /* "" L1TF workaround PTE inversion */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW		( 8*32+ 0) /* Intel TPR Shadow */
@@ -373,5 +374,6 @@
 #define X86_BUG_SPECTRE_V1		X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
 #define X86_BUG_SPECTRE_V2		X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
 #define X86_BUG_SPEC_STORE_BYPASS	X86_BUG(17) /* CPU is affected by speculative store bypass attack */
+#define X86_BUG_L1TF			X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cfd29ee8c3da..7e3ac5eedcd6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -181,6 +181,11 @@ extern const struct seq_operations cpuinfo_op;
 
 extern void cpu_detect(struct cpuinfo_x86 *c);
 
+static inline unsigned long l1tf_pfn_limit(void)
+{
+	return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1;
+}
+
 extern void early_cpu_init(void);
 extern void identify_boot_cpu(void);
 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index cd0fda1fff6d..3ce9674e4887 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -27,9 +27,11 @@
 #include <asm/pgtable.h>
 #include <asm/set_memory.h>
 #include <asm/intel-family.h>
+#include <asm/e820/api.h>
 
 static void __init spectre_v2_select_mitigation(void);
 static void __init ssb_select_mitigation(void);
+static void __init l1tf_select_mitigation(void);
 
 /*
  * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
@@ -81,6 +83,8 @@ void __init check_bugs(void)
 	 */
 	ssb_select_mitigation();
 
+	l1tf_select_mitigation();
+
 #ifdef CONFIG_X86_32
 	/*
 	 * Check whether we are able to run this kernel safely on SMP.
@@ -205,6 +209,32 @@ static void x86_amd_ssb_disable(void)
 		wrmsrl(MSR_AMD64_LS_CFG, msrval);
 }
 
+static void __init l1tf_select_mitigation(void)
+{
+	u64 half_pa;
+
+	if (!boot_cpu_has_bug(X86_BUG_L1TF))
+		return;
+
+#if CONFIG_PGTABLE_LEVELS == 2
+	pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
+	return;
+#endif
+
+	/*
+	 * This is extremely unlikely to happen because almost all
+	 * systems have far more MAX_PA/2 than RAM can be fit into
+	 * DIMM slots.
+	 */
+	half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
+	if (e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) {
+		pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
+		return;
+	}
+
+	setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
+}
+
 #ifdef RETPOLINE
 static bool spectre_v2_bad_module;
 
@@ -678,6 +708,11 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
 	case X86_BUG_SPEC_STORE_BYPASS:
 		return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
 
+	case X86_BUG_L1TF:
+		if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
+			return sprintf(buf, "Mitigation: Page Table Inversion\n");
+		break;
+
 	default:
 		break;
 	}
@@ -704,4 +739,9 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *
 {
 	return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
 }
+
+ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
+}
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0df7151cfef4..9baf684e2b55 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -984,6 +984,21 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
 	{}
 };
 
+static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
+	/* in addition to cpu_no_speculation */
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT1	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT2	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_AIRMONT		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_MERRIFIELD	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_MOOREFIELD	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_GOLDMONT	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_DENVERTON	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_GEMINI_LAKE	},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNL		},
+	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNM		},
+	{}
+};
+
 static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 {
 	u64 ia32_cap = 0;
@@ -1010,6 +1025,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 		return;
 
 	setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
+
+	if (x86_match_cpu(cpu_no_l1tf))
+		return;
+
+	setup_force_cpu_bug(X86_BUG_L1TF);
 }
 
 /*
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 30cc9c877ebb..eb9443d5bae1 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -540,16 +540,24 @@ ssize_t __weak cpu_show_spec_store_bypass(struct device *dev,
 	return sprintf(buf, "Not affected\n");
 }
 
+ssize_t __weak cpu_show_l1tf(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "Not affected\n");
+}
+
 static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
 static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
 static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
 static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
+static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
 
 static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_meltdown.attr,
 	&dev_attr_spectre_v1.attr,
 	&dev_attr_spectre_v2.attr,
 	&dev_attr_spec_store_bypass.attr,
+	&dev_attr_l1tf.attr,
 	NULL
 };
 
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index a97a63eef59f..d3da5184aa1c 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -55,6 +55,8 @@ extern ssize_t cpu_show_spectre_v2(struct device *dev,
 				   struct device_attribute *attr, char *buf);
 extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
 					  struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_l1tf(struct device *dev,
+			     struct device_attribute *attr, char *buf);
 
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
-- 
cgit v1.2.3


From 377eeaa8e11fe815b1d07c81c4a0e2843a8c15eb Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 13 Jun 2018 15:48:28 -0700
Subject: x86/speculation/l1tf: Limit swap file size to MAX_PA/2

For the L1TF workaround its necessary to limit the swap file size to below
MAX_PA/2, so that the higher bits of the swap offset inverted never point
to valid memory.

Add a mechanism for the architecture to override the swap file size check
in swapfile.c and add a x86 specific max swapfile check function that
enforces that limit.

The check is only enabled if the CPU is vulnerable to L1TF.

In VMs with 42bit MAX_PA the typical limit is 2TB now, on a native system
with 46bit PA it is 32TB. The limit is only per individual swap file, so
it's always possible to exceed these limits with multiple swap files or
partitions.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
---
 arch/x86/mm/init.c       | 15 +++++++++++++++
 include/linux/swapfile.h |  2 ++
 mm/swapfile.c            | 46 ++++++++++++++++++++++++++++++----------------
 3 files changed, 47 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cee58a972cb2..1b530197d114 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -4,6 +4,8 @@
 #include <linux/swap.h>
 #include <linux/memblock.h>
 #include <linux/bootmem.h>	/* for max_low_pfn */
+#include <linux/swapfile.h>
+#include <linux/swapops.h>
 
 #include <asm/set_memory.h>
 #include <asm/e820/api.h>
@@ -880,3 +882,16 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
 	__cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
 	__pte2cachemode_tbl[entry] = cache;
 }
+
+unsigned long max_swapfile_size(void)
+{
+	unsigned long pages;
+
+	pages = generic_max_swapfile_size();
+
+	if (boot_cpu_has_bug(X86_BUG_L1TF)) {
+		/* Limit the swap file size to MAX_PA/2 for L1TF workaround */
+		pages = min_t(unsigned long, l1tf_pfn_limit() + 1, pages);
+	}
+	return pages;
+}
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 06bd7b096167..e06febf62978 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -10,5 +10,7 @@ extern spinlock_t swap_lock;
 extern struct plist_head swap_active_head;
 extern struct swap_info_struct *swap_info[];
 extern int try_to_unuse(unsigned int, bool, unsigned long);
+extern unsigned long generic_max_swapfile_size(void);
+extern unsigned long max_swapfile_size(void);
 
 #endif /* _LINUX_SWAPFILE_H */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2cc2972eedaf..18185ae4f223 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2909,6 +2909,35 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
 	return 0;
 }
 
+
+/*
+ * Find out how many pages are allowed for a single swap device. There
+ * are two limiting factors:
+ * 1) the number of bits for the swap offset in the swp_entry_t type, and
+ * 2) the number of bits in the swap pte, as defined by the different
+ * architectures.
+ *
+ * In order to find the largest possible bit mask, a swap entry with
+ * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
+ * decoded to a swp_entry_t again, and finally the swap offset is
+ * extracted.
+ *
+ * This will mask all the bits from the initial ~0UL mask that can't
+ * be encoded in either the swp_entry_t or the architecture definition
+ * of a swap pte.
+ */
+unsigned long generic_max_swapfile_size(void)
+{
+	return swp_offset(pte_to_swp_entry(
+			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+}
+
+/* Can be overridden by an architecture for additional checks. */
+__weak unsigned long max_swapfile_size(void)
+{
+	return generic_max_swapfile_size();
+}
+
 static unsigned long read_swap_header(struct swap_info_struct *p,
 					union swap_header *swap_header,
 					struct inode *inode)
@@ -2944,22 +2973,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 	p->cluster_next = 1;
 	p->cluster_nr = 0;
 
-	/*
-	 * Find out how many pages are allowed for a single swap
-	 * device. There are two limiting factors: 1) the number
-	 * of bits for the swap offset in the swp_entry_t type, and
-	 * 2) the number of bits in the swap pte as defined by the
-	 * different architectures. In order to find the
-	 * largest possible bit mask, a swap entry with swap type 0
-	 * and swap offset ~0UL is created, encoded to a swap pte,
-	 * decoded to a swp_entry_t again, and finally the swap
-	 * offset is extracted. This will mask all the bits from
-	 * the initial ~0UL mask that can't be encoded in either
-	 * the swp_entry_t or the architecture definition of a
-	 * swap pte.
-	 */
-	maxpages = swp_offset(pte_to_swp_entry(
-			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+	maxpages = max_swapfile_size();
 	last_page = swap_header->info.last_page;
 	if (!last_page) {
 		pr_warn("Empty swap-file\n");
-- 
cgit v1.2.3


From 5a6cf77f5e35e7af35d36a1e7dc21a42f6412e4f Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 20 Jun 2018 01:05:07 +0900
Subject: kprobes: Remove jprobe API implementation

Remove functionally empty jprobe API implementations and test cases.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/lkml/152942430705.15209.2307050500995264322.stgit@devbox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kprobes.h |  3 --
 kernel/kprobes.c        | 78 +---------------------------------------
 kernel/test_kprobes.c   | 94 -------------------------------------------------
 lib/Kconfig.debug       |  2 +-
 4 files changed, 2 insertions(+), 175 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 9440a2fc8893..b520baa65682 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -389,9 +389,6 @@ int register_kprobe(struct kprobe *p);
 void unregister_kprobe(struct kprobe *p);
 int register_kprobes(struct kprobe **kps, int num);
 void unregister_kprobes(struct kprobe **kps, int num);
-int setjmp_pre_handler(struct kprobe *, struct pt_regs *);
-int longjmp_break_handler(struct kprobe *, struct pt_regs *);
-void jprobe_return(void);
 unsigned long arch_deref_entry_point(void *);
 
 int register_kretprobe(struct kretprobe *rp);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ea619021d901..69de130595f7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1272,7 +1272,7 @@ NOKPROBE_SYMBOL(cleanup_rp_inst);
 
 /*
 * Add the new probe to ap->list. Fail if this is the
-* second jprobe at the address - two jprobes can't coexist
+* second break_handler at the address
 */
 static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
 {
@@ -1812,77 +1812,6 @@ unsigned long __weak arch_deref_entry_point(void *entry)
 	return (unsigned long)entry;
 }
 
-#if 0
-int register_jprobes(struct jprobe **jps, int num)
-{
-	int ret = 0, i;
-
-	if (num <= 0)
-		return -EINVAL;
-
-	for (i = 0; i < num; i++) {
-		ret = register_jprobe(jps[i]);
-
-		if (ret < 0) {
-			if (i > 0)
-				unregister_jprobes(jps, i);
-			break;
-		}
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(register_jprobes);
-
-int register_jprobe(struct jprobe *jp)
-{
-	unsigned long addr, offset;
-	struct kprobe *kp = &jp->kp;
-
-	/*
-	 * Verify probepoint as well as the jprobe handler are
-	 * valid function entry points.
-	 */
-	addr = arch_deref_entry_point(jp->entry);
-
-	if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 &&
-	    kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) {
-		kp->pre_handler = setjmp_pre_handler;
-		kp->break_handler = longjmp_break_handler;
-		return register_kprobe(kp);
-	}
-
-	return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(register_jprobe);
-
-void unregister_jprobe(struct jprobe *jp)
-{
-	unregister_jprobes(&jp, 1);
-}
-EXPORT_SYMBOL_GPL(unregister_jprobe);
-
-void unregister_jprobes(struct jprobe **jps, int num)
-{
-	int i;
-
-	if (num <= 0)
-		return;
-	mutex_lock(&kprobe_mutex);
-	for (i = 0; i < num; i++)
-		if (__unregister_kprobe_top(&jps[i]->kp) < 0)
-			jps[i]->kp.addr = NULL;
-	mutex_unlock(&kprobe_mutex);
-
-	synchronize_sched();
-	for (i = 0; i < num; i++) {
-		if (jps[i]->kp.addr)
-			__unregister_kprobe_bottom(&jps[i]->kp);
-	}
-}
-EXPORT_SYMBOL_GPL(unregister_jprobes);
-#endif
-
 #ifdef CONFIG_KRETPROBES
 /*
  * This kprobe pre_handler is registered with every kretprobe. When probe
@@ -2329,8 +2258,6 @@ static void report_probe(struct seq_file *pi, struct kprobe *p,
 
 	if (p->pre_handler == pre_handler_kretprobe)
 		kprobe_type = "r";
-	else if (p->pre_handler == setjmp_pre_handler)
-		kprobe_type = "j";
 	else
 		kprobe_type = "k";
 
@@ -2637,6 +2564,3 @@ late_initcall(debugfs_kprobe_init);
 #endif /* CONFIG_DEBUG_FS */
 
 module_init(init_kprobes);
-
-/* defined in arch/.../kernel/kprobes.c */
-EXPORT_SYMBOL_GPL(jprobe_return);
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index dd53e354f630..7bca480151b0 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -162,90 +162,6 @@ static int test_kprobes(void)
 
 }
 
-#if 0
-static u32 jph_val;
-
-static u32 j_kprobe_target(u32 value)
-{
-	if (preemptible()) {
-		handler_errors++;
-		pr_err("jprobe-handler is preemptible\n");
-	}
-	if (value != rand1) {
-		handler_errors++;
-		pr_err("incorrect value in jprobe handler\n");
-	}
-
-	jph_val = rand1;
-	jprobe_return();
-	return 0;
-}
-
-static struct jprobe jp = {
-	.entry		= j_kprobe_target,
-	.kp.symbol_name = "kprobe_target"
-};
-
-static int test_jprobe(void)
-{
-	int ret;
-
-	ret = register_jprobe(&jp);
-	if (ret < 0) {
-		pr_err("register_jprobe returned %d\n", ret);
-		return ret;
-	}
-
-	ret = target(rand1);
-	unregister_jprobe(&jp);
-	if (jph_val == 0) {
-		pr_err("jprobe handler not called\n");
-		handler_errors++;
-	}
-
-	return 0;
-}
-
-static struct jprobe jp2 = {
-	.entry          = j_kprobe_target,
-	.kp.symbol_name = "kprobe_target2"
-};
-
-static int test_jprobes(void)
-{
-	int ret;
-	struct jprobe *jps[2] = {&jp, &jp2};
-
-	/* addr and flags should be cleard for reusing kprobe. */
-	jp.kp.addr = NULL;
-	jp.kp.flags = 0;
-	ret = register_jprobes(jps, 2);
-	if (ret < 0) {
-		pr_err("register_jprobes returned %d\n", ret);
-		return ret;
-	}
-
-	jph_val = 0;
-	ret = target(rand1);
-	if (jph_val == 0) {
-		pr_err("jprobe handler not called\n");
-		handler_errors++;
-	}
-
-	jph_val = 0;
-	ret = target2(rand1);
-	if (jph_val == 0) {
-		pr_err("jprobe handler2 not called\n");
-		handler_errors++;
-	}
-	unregister_jprobes(jps, 2);
-
-	return 0;
-}
-#else
-#define test_jprobe() (0)
-#define test_jprobes() (0)
-#endif
 #ifdef CONFIG_KRETPROBES
 static u32 krph_val;
 
@@ -383,16 +299,6 @@ int init_test_probes(void)
 	if (ret < 0)
 		errors++;
 
-	num_tests++;
-	ret = test_jprobe();
-	if (ret < 0)
-		errors++;
-
-	num_tests++;
-	ret = test_jprobes();
-	if (ret < 0)
-		errors++;
-
 #ifdef CONFIG_KRETPROBES
 	num_tests++;
 	ret = test_kretprobe();
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8838d1158d19..0b066b3c9284 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1718,7 +1718,7 @@ config KPROBES_SANITY_TEST
 	default n
 	help
 	  This option provides for testing basic kprobes functionality on
-	  boot. A sample kprobe, jprobe and kretprobe are inserted and
+	  boot. Samples of kprobe and kretprobe are inserted and
 	  verified for functionality.
 
 	  Say N if you are unsure.
-- 
cgit v1.2.3


From 4de58696de076d9bd2745d1cbe0930635c3f5ac9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 20 Jun 2018 01:17:15 +0900
Subject: kprobes: Remove jprobe stub API

Remove jprobe stub APIs from linux/kprobes.h since
the jprobe implementation was completely gone.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/lkml/152942503572.15209.1652552217914694917.stgit@devbox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kprobes.h | 50 -------------------------------------------------
 1 file changed, 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index b520baa65682..e909413e4e38 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -63,7 +63,6 @@ struct pt_regs;
 struct kretprobe;
 struct kretprobe_instance;
 typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *);
-typedef int (*kprobe_break_handler_t) (struct kprobe *, struct pt_regs *);
 typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *,
 				       unsigned long flags);
 typedef int (*kprobe_fault_handler_t) (struct kprobe *, struct pt_regs *,
@@ -101,12 +100,6 @@ struct kprobe {
 	 */
 	kprobe_fault_handler_t fault_handler;
 
-	/*
-	 * ... called if breakpoint trap occurs in probe handler.
-	 * Return 1 if it handled break, otherwise kernel will see it.
-	 */
-	kprobe_break_handler_t break_handler;
-
 	/* Saved opcode (which has been replaced with breakpoint) */
 	kprobe_opcode_t opcode;
 
@@ -154,24 +147,6 @@ static inline int kprobe_ftrace(struct kprobe *p)
 	return p->flags & KPROBE_FLAG_FTRACE;
 }
 
-/*
- * Special probe type that uses setjmp-longjmp type tricks to resume
- * execution at a specified entry with a matching prototype corresponding
- * to the probed function - a trick to enable arguments to become
- * accessible seamlessly by probe handling logic.
- * Note:
- * Because of the way compilers allocate stack space for local variables
- * etc upfront, regardless of sub-scopes within a function, this mirroring
- * principle currently works only for probes placed on function entry points.
- */
-struct jprobe {
-	struct kprobe kp;
-	void *entry;	/* probe handling code to jump to */
-};
-
-/* For backward compatibility with old code using JPROBE_ENTRY() */
-#define JPROBE_ENTRY(handler)	(handler)
-
 /*
  * Function-return probe -
  * Note:
@@ -436,9 +411,6 @@ static inline void unregister_kprobe(struct kprobe *p)
 static inline void unregister_kprobes(struct kprobe **kps, int num)
 {
 }
-static inline void jprobe_return(void)
-{
-}
 static inline int register_kretprobe(struct kretprobe *rp)
 {
 	return -ENOSYS;
@@ -465,20 +437,6 @@ static inline int enable_kprobe(struct kprobe *kp)
 	return -ENOSYS;
 }
 #endif /* CONFIG_KPROBES */
-static inline int register_jprobe(struct jprobe *p)
-{
-	return -ENOSYS;
-}
-static inline int register_jprobes(struct jprobe **jps, int num)
-{
-	return -ENOSYS;
-}
-static inline void unregister_jprobe(struct jprobe *p)
-{
-}
-static inline void unregister_jprobes(struct jprobe **jps, int num)
-{
-}
 static inline int disable_kretprobe(struct kretprobe *rp)
 {
 	return disable_kprobe(&rp->kp);
@@ -487,14 +445,6 @@ static inline int enable_kretprobe(struct kretprobe *rp)
 {
 	return enable_kprobe(&rp->kp);
 }
-static inline int disable_jprobe(struct jprobe *jp)
-{
-	return -ENOSYS;
-}
-static inline int enable_jprobe(struct jprobe *jp)
-{
-	return -ENOSYS;
-}
 
 #ifndef CONFIG_KPROBES
 static inline bool is_kprobe_insn_slot(unsigned long addr)
-- 
cgit v1.2.3


From 8bd9cb51daac89337295b6f037b0486911e1b408 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 19 Jun 2018 13:53:08 +0100
Subject: locking/atomics, asm-generic: Move some macros from <linux/bitops.h>
 to a new <linux/bits.h> file

In preparation for implementing the asm-generic atomic bitops in terms
of atomic_long_*(), we need to prevent <asm/atomic.h> implementations from
pulling in <linux/bitops.h>. A common reason for this include is for the
BITS_PER_BYTE definition, so move this and some other BIT() and masking
macros into a new header file, <linux/bits.h>.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arm-kernel@lists.infradead.org
Cc: yamada.masahiro@socionext.com
Link: https://lore.kernel.org/lkml/1529412794-17720-4-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/bitops.h | 22 +---------------------
 include/linux/bits.h   | 26 ++++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 21 deletions(-)
 create mode 100644 include/linux/bits.h

(limited to 'include/linux')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 4cac4e1a72ff..af419012d77d 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -2,29 +2,9 @@
 #ifndef _LINUX_BITOPS_H
 #define _LINUX_BITOPS_H
 #include <asm/types.h>
+#include <linux/bits.h>
 
-#ifdef	__KERNEL__
-#define BIT(nr)			(1UL << (nr))
-#define BIT_ULL(nr)		(1ULL << (nr))
-#define BIT_MASK(nr)		(1UL << ((nr) % BITS_PER_LONG))
-#define BIT_WORD(nr)		((nr) / BITS_PER_LONG)
-#define BIT_ULL_MASK(nr)	(1ULL << ((nr) % BITS_PER_LONG_LONG))
-#define BIT_ULL_WORD(nr)	((nr) / BITS_PER_LONG_LONG)
-#define BITS_PER_BYTE		8
 #define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
-#endif
-
-/*
- * Create a contiguous bitmask starting at bit position @l and ending at
- * position @h. For example
- * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
- */
-#define GENMASK(h, l) \
-	(((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
-
-#define GENMASK_ULL(h, l) \
-	(((~0ULL) - (1ULL << (l)) + 1) & \
-	 (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
 
 extern unsigned int __sw_hweight8(unsigned int w);
 extern unsigned int __sw_hweight16(unsigned int w);
diff --git a/include/linux/bits.h b/include/linux/bits.h
new file mode 100644
index 000000000000..2b7b532c1d51
--- /dev/null
+++ b/include/linux/bits.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_BITS_H
+#define __LINUX_BITS_H
+#include <asm/bitsperlong.h>
+
+#define BIT(nr)			(1UL << (nr))
+#define BIT_ULL(nr)		(1ULL << (nr))
+#define BIT_MASK(nr)		(1UL << ((nr) % BITS_PER_LONG))
+#define BIT_WORD(nr)		((nr) / BITS_PER_LONG)
+#define BIT_ULL_MASK(nr)	(1ULL << ((nr) % BITS_PER_LONG_LONG))
+#define BIT_ULL_WORD(nr)	((nr) / BITS_PER_LONG_LONG)
+#define BITS_PER_BYTE		8
+
+/*
+ * Create a contiguous bitmask starting at bit position @l and ending at
+ * position @h. For example
+ * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
+ */
+#define GENMASK(h, l) \
+	(((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
+
+#define GENMASK_ULL(h, l) \
+	(((~0ULL) - (1ULL << (l)) + 1) & \
+	 (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
+
+#endif	/* __LINUX_BITS_H */
-- 
cgit v1.2.3


From 05736e4ac13c08a4a9b1ef2de26dd31a32cbee57 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 29 May 2018 17:48:27 +0200
Subject: cpu/hotplug: Provide knobs to control SMT

Provide a command line and a sysfs knob to control SMT.

The command line options are:

 'nosmt':	Enumerate secondary threads, but do not online them

 'nosmt=force': Ignore secondary threads completely during enumeration
 		via MP table and ACPI/MADT.

The sysfs control file has the following states (read/write):

 'on':		 SMT is enabled. Secondary threads can be freely onlined
 'off':		 SMT is disabled. Secondary threads, even if enumerated
 		 cannot be onlined
 'forceoff':	 SMT is permanentely disabled. Writes to the control
 		 file are rejected.
 'notsupported': SMT is not supported by the CPU

The command line option 'nosmt' sets the sysfs control to 'off'. This
can be changed to 'on' to reenable SMT during runtime.

The command line option 'nosmt=force' sets the sysfs control to
'forceoff'. This cannot be changed during runtime.

When SMT is 'on' and the control file is changed to 'off' then all online
secondary threads are offlined and attempts to online a secondary thread
later on are rejected.

When SMT is 'off' and the control file is changed to 'on' then secondary
threads can be onlined again. The 'off' -> 'on' transition does not
automatically online the secondary threads.

When the control file is set to 'forceoff', the behaviour is the same as
setting it to 'off', but the operation is irreversible and later writes to
the control file are rejected.

When the control status is 'notsupported' then writes to the control file
are rejected.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/ABI/testing/sysfs-devices-system-cpu |  20 +++
 Documentation/admin-guide/kernel-parameters.txt    |   8 +
 arch/Kconfig                                       |   3 +
 arch/x86/Kconfig                                   |   1 +
 include/linux/cpu.h                                |  13 ++
 kernel/cpu.c                                       | 170 +++++++++++++++++++++
 6 files changed, 215 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 9c5e7732d249..65d9b844ecfd 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -487,3 +487,23 @@ Description:	Information about CPU vulnerabilities
 		"Not affected"	  CPU is not affected by the vulnerability
 		"Vulnerable"	  CPU is affected and no mitigation in effect
 		"Mitigation: $M"  CPU is affected and mitigation $M is in effect
+
+What:		/sys/devices/system/cpu/smt
+		/sys/devices/system/cpu/smt/active
+		/sys/devices/system/cpu/smt/control
+Date:		June 2018
+Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:	Control Symetric Multi Threading (SMT)
+
+		active:  Tells whether SMT is active (enabled and siblings online)
+
+		control: Read/write interface to control SMT. Possible
+			 values:
+
+			 "on"		SMT is enabled
+			 "off"		SMT is disabled
+			 "forceoff"	SMT is force disabled. Cannot be changed.
+			 "notsupported" SMT is not supported by the CPU
+
+			 If control status is "forceoff" or "notsupported" writes
+			 are rejected.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index efc7aa7a0670..8e29c4b6756f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2687,6 +2687,14 @@
 	nosmt		[KNL,S390] Disable symmetric multithreading (SMT).
 			Equivalent to smt=1.
 
+			[KNL,x86] Disable symmetric multithreading (SMT).
+			nosmt=force: Force disable SMT, similar to disabling
+				     it in the BIOS except that some of the
+				     resource partitioning effects which are
+				     caused by having SMT enabled in the BIOS
+				     cannot be undone. Depending on the CPU
+				     type this might have a performance impact.
+
 	nospectre_v2	[X86] Disable all mitigations for the Spectre variant 2
 			(indirect branch prediction) vulnerability. System may
 			allow data leaks with this option, which is equivalent
diff --git a/arch/Kconfig b/arch/Kconfig
index 1aa59063f1fd..d1f2ed462ac8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -13,6 +13,9 @@ config KEXEC_CORE
 config HAVE_IMA_KEXEC
 	bool
 
+config HOTPLUG_SMT
+	bool
+
 config OPROFILE
 	tristate "OProfile system profiling"
 	depends on PROFILING
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f1dbb4ee19d7..7a34fdf8daf0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -187,6 +187,7 @@ config X86
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_USER_RETURN_NOTIFIER
+	select HOTPLUG_SMT			if SMP
 	select IRQ_FORCED_THREADING
 	select NEED_SG_DMA_LENGTH
 	select PCI_LOCKLESS_CONFIG
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d3da5184aa1c..7532cbf27b1d 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -168,4 +168,17 @@ void cpuhp_report_idle_dead(void);
 static inline void cpuhp_report_idle_dead(void) { }
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 
+enum cpuhp_smt_control {
+	CPU_SMT_ENABLED,
+	CPU_SMT_DISABLED,
+	CPU_SMT_FORCE_DISABLED,
+	CPU_SMT_NOT_SUPPORTED,
+};
+
+#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
+extern enum cpuhp_smt_control cpu_smt_control;
+#else
+# define cpu_smt_control		(CPU_SMT_ENABLED)
+#endif
+
 #endif /* _LINUX_CPU_H_ */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e266cb529c1a..d29fdd7e57bb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -933,6 +933,29 @@ EXPORT_SYMBOL(cpu_down);
 #define takedown_cpu		NULL
 #endif /*CONFIG_HOTPLUG_CPU*/
 
+#ifdef CONFIG_HOTPLUG_SMT
+enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
+
+static int __init smt_cmdline_disable(char *str)
+{
+	cpu_smt_control = CPU_SMT_DISABLED;
+	if (str && !strcmp(str, "force")) {
+		pr_info("SMT: Force disabled\n");
+		cpu_smt_control = CPU_SMT_FORCE_DISABLED;
+	}
+	return 0;
+}
+early_param("nosmt", smt_cmdline_disable);
+
+static inline bool cpu_smt_allowed(unsigned int cpu)
+{
+	return cpu_smt_control == CPU_SMT_ENABLED ||
+		topology_is_primary_thread(cpu);
+}
+#else
+static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
+#endif
+
 /**
  * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
  * @cpu: cpu that just started
@@ -1056,6 +1079,10 @@ static int do_cpu_up(unsigned int cpu, enum cpuhp_state target)
 		err = -EBUSY;
 		goto out;
 	}
+	if (!cpu_smt_allowed(cpu)) {
+		err = -EPERM;
+		goto out;
+	}
 
 	err = _cpu_up(cpu, 0, target);
 out:
@@ -1904,10 +1931,153 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
 	NULL
 };
 
+#ifdef CONFIG_HOTPLUG_SMT
+
+static const char *smt_states[] = {
+	[CPU_SMT_ENABLED]		= "on",
+	[CPU_SMT_DISABLED]		= "off",
+	[CPU_SMT_FORCE_DISABLED]	= "forceoff",
+	[CPU_SMT_NOT_SUPPORTED]		= "notsupported",
+};
+
+static ssize_t
+show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]);
+}
+
+static void cpuhp_offline_cpu_device(unsigned int cpu)
+{
+	struct device *dev = get_cpu_device(cpu);
+
+	dev->offline = true;
+	/* Tell user space about the state change */
+	kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+}
+
+static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+{
+	int cpu, ret = 0;
+
+	cpu_maps_update_begin();
+	for_each_online_cpu(cpu) {
+		if (topology_is_primary_thread(cpu))
+			continue;
+		ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+		if (ret)
+			break;
+		/*
+		 * As this needs to hold the cpu maps lock it's impossible
+		 * to call device_offline() because that ends up calling
+		 * cpu_down() which takes cpu maps lock. cpu maps lock
+		 * needs to be held as this might race against in kernel
+		 * abusers of the hotplug machinery (thermal management).
+		 *
+		 * So nothing would update device:offline state. That would
+		 * leave the sysfs entry stale and prevent onlining after
+		 * smt control has been changed to 'off' again. This is
+		 * called under the sysfs hotplug lock, so it is properly
+		 * serialized against the regular offline usage.
+		 */
+		cpuhp_offline_cpu_device(cpu);
+	}
+	if (!ret)
+		cpu_smt_control = ctrlval;
+	cpu_maps_update_done();
+	return ret;
+}
+
+static void cpuhp_smt_enable(void)
+{
+	cpu_maps_update_begin();
+	cpu_smt_control = CPU_SMT_ENABLED;
+	cpu_maps_update_done();
+}
+
+static ssize_t
+store_smt_control(struct device *dev, struct device_attribute *attr,
+		  const char *buf, size_t count)
+{
+	int ctrlval, ret;
+
+	if (sysfs_streq(buf, "on"))
+		ctrlval = CPU_SMT_ENABLED;
+	else if (sysfs_streq(buf, "off"))
+		ctrlval = CPU_SMT_DISABLED;
+	else if (sysfs_streq(buf, "forceoff"))
+		ctrlval = CPU_SMT_FORCE_DISABLED;
+	else
+		return -EINVAL;
+
+	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
+		return -EPERM;
+
+	if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+		return -ENODEV;
+
+	ret = lock_device_hotplug_sysfs();
+	if (ret)
+		return ret;
+
+	if (ctrlval != cpu_smt_control) {
+		switch (ctrlval) {
+		case CPU_SMT_ENABLED:
+			cpuhp_smt_enable();
+			break;
+		case CPU_SMT_DISABLED:
+		case CPU_SMT_FORCE_DISABLED:
+			ret = cpuhp_smt_disable(ctrlval);
+			break;
+		}
+	}
+
+	unlock_device_hotplug();
+	return ret ? ret : count;
+}
+static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
+
+static ssize_t
+show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	bool active = topology_max_smt_threads() > 1;
+
+	return snprintf(buf, PAGE_SIZE - 2, "%d\n", active);
+}
+static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
+
+static struct attribute *cpuhp_smt_attrs[] = {
+	&dev_attr_control.attr,
+	&dev_attr_active.attr,
+	NULL
+};
+
+static const struct attribute_group cpuhp_smt_attr_group = {
+	.attrs = cpuhp_smt_attrs,
+	.name = "smt",
+	NULL
+};
+
+static int __init cpu_smt_state_init(void)
+{
+	if (!topology_smt_supported())
+		cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+
+	return sysfs_create_group(&cpu_subsys.dev_root->kobj,
+				  &cpuhp_smt_attr_group);
+}
+
+#else
+static inline int cpu_smt_state_init(void) { return 0; }
+#endif
+
 static int __init cpuhp_sysfs_init(void)
 {
 	int cpu, ret;
 
+	ret = cpu_smt_state_init();
+	if (ret)
+		return ret;
+
 	ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
 				 &cpuhp_cpu_root_attr_group);
 	if (ret)
-- 
cgit v1.2.3


From bfc18e389c7a09fbbbed6bf4032396685b14246e Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 21 Jun 2018 13:13:04 +0100
Subject: atomics/treewide: Rename __atomic_add_unless() =>
 atomic_fetch_add_unless()

While __atomic_add_unless() was originally intended as a building-block
for atomic_add_unless(), it's now used in a number of places around the
kernel. It's the only common atomic operation named __atomic*(), rather
than atomic_*(), and for consistency it would be better named
atomic_fetch_add_unless().

This lack of consistency is slightly confusing, and gets in the way of
scripting atomics. Given that, let's clean things up and promote it to
an official part of the atomics API, in the form of
atomic_fetch_add_unless().

This patch converts definitions and invocations over to the new name,
including the instrumented version, using the following script:

  ----
  git grep -w __atomic_add_unless | while read line; do
  sed -i '{s/\<__atomic_add_unless\>/atomic_fetch_add_unless/}' "${line%%:*}";
  done
  git grep -w __arch_atomic_add_unless | while read line; do
  sed -i '{s/\<__arch_atomic_add_unless\>/arch_atomic_fetch_add_unless/}' "${line%%:*}";
  done
  ----

Note that we do not have atomic{64,_long}_fetch_add_unless(), which will
be introduced by later patches.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Palmer Dabbelt <palmer@sifive.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/lkml/20180621121321.4761-2-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/include/asm/atomic.h           | 4 ++--
 arch/arc/include/asm/atomic.h             | 4 ++--
 arch/arm/include/asm/atomic.h             | 4 ++--
 arch/arm64/include/asm/atomic.h           | 2 +-
 arch/h8300/include/asm/atomic.h           | 2 +-
 arch/hexagon/include/asm/atomic.h         | 4 ++--
 arch/ia64/include/asm/atomic.h            | 2 +-
 arch/m68k/include/asm/atomic.h            | 2 +-
 arch/mips/include/asm/atomic.h            | 4 ++--
 arch/openrisc/include/asm/atomic.h        | 4 ++--
 arch/parisc/include/asm/atomic.h          | 4 ++--
 arch/powerpc/include/asm/atomic.h         | 8 ++++----
 arch/riscv/include/asm/atomic.h           | 4 ++--
 arch/s390/include/asm/atomic.h            | 2 +-
 arch/sh/include/asm/atomic.h              | 4 ++--
 arch/sparc/include/asm/atomic_32.h        | 2 +-
 arch/sparc/include/asm/atomic_64.h        | 2 +-
 arch/sparc/lib/atomic32.c                 | 4 ++--
 arch/x86/include/asm/atomic.h             | 4 ++--
 arch/xtensa/include/asm/atomic.h          | 4 ++--
 drivers/block/rbd.c                       | 2 +-
 drivers/infiniband/core/rdma_core.c       | 2 +-
 fs/afs/rxrpc.c                            | 2 +-
 include/asm-generic/atomic-instrumented.h | 4 ++--
 include/asm-generic/atomic.h              | 4 ++--
 include/linux/atomic.h                    | 2 +-
 kernel/bpf/syscall.c                      | 4 ++--
 net/rxrpc/call_object.c                   | 2 +-
 net/rxrpc/conn_object.c                   | 4 ++--
 net/rxrpc/local_object.c                  | 2 +-
 net/rxrpc/peer_object.c                   | 2 +-
 31 files changed, 50 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index 767bfdd42992..392b15a4dd4f 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -206,7 +206,7 @@ ATOMIC_OPS(xor, xor)
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 /**
- * __atomic_add_unless - add unless the number is a given value
+ * atomic_fetch_add_unless - add unless the number is a given value
  * @v: pointer of type atomic_t
  * @a: the amount to add to v...
  * @u: ...unless v is equal to u.
@@ -214,7 +214,7 @@ ATOMIC_OPS(xor, xor)
  * Atomically adds @a to @v, so long as it was not @u.
  * Returns the old value of @v.
  */
-static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
+static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int c, new, old;
 	smp_mb();
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 11859287c52a..67121b5ff3a3 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -309,7 +309,7 @@ ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3)
 #undef ATOMIC_OP
 
 /**
- * __atomic_add_unless - add unless the number is a given value
+ * atomic_fetch_add_unless - add unless the number is a given value
  * @v: pointer of type atomic_t
  * @a: the amount to add to v...
  * @u: ...unless v is equal to u.
@@ -317,7 +317,7 @@ ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3)
  * Atomically adds @a to @v, so long as it was not @u.
  * Returns the old value of @v
  */
-#define __atomic_add_unless(v, a, u)					\
+#define atomic_fetch_add_unless(v, a, u)					\
 ({									\
 	int c, old;							\
 									\
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 66d0e215a773..9d56d0727c9b 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -130,7 +130,7 @@ static inline int atomic_cmpxchg_relaxed(atomic_t *ptr, int old, int new)
 }
 #define atomic_cmpxchg_relaxed		atomic_cmpxchg_relaxed
 
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int oldval, newval;
 	unsigned long tmp;
@@ -215,7 +215,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return ret;
 }
 
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index c0235e0ff849..264d20339f74 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -125,7 +125,7 @@
 #define atomic_dec_and_test(v)		(atomic_dec_return(v) == 0)
 #define atomic_sub_and_test(i, v)	(atomic_sub_return((i), (v)) == 0)
 #define atomic_add_negative(i, v)	(atomic_add_return((i), (v)) < 0)
-#define __atomic_add_unless(v, a, u)	___atomic_add_unless(v, a, u,)
+#define atomic_fetch_add_unless(v, a, u)	___atomic_add_unless(v, a, u,)
 #define atomic_andnot			atomic_andnot
 
 /*
diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h
index b174dec099bf..5c856887fdf2 100644
--- a/arch/h8300/include/asm/atomic.h
+++ b/arch/h8300/include/asm/atomic.h
@@ -94,7 +94,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return ret;
 }
 
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int ret;
 	h8300flags flags;
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index fb3dfb2a667e..287aa9f394f3 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -164,7 +164,7 @@ ATOMIC_OPS(xor)
 #undef ATOMIC_OP
 
 /**
- * __atomic_add_unless - add unless the number is a given value
+ * atomic_fetch_add_unless - add unless the number is a given value
  * @v: pointer to value
  * @a: amount to add
  * @u: unless value is equal to u
@@ -173,7 +173,7 @@ ATOMIC_OPS(xor)
  *
  */
 
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int __oldval;
 	register int tmp;
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index 2524fb60fbc2..9d2ddde5f9d5 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -215,7 +215,7 @@ ATOMIC64_FETCH_OP(xor, ^)
 	(cmpxchg(&((v)->counter), old, new))
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
 
-static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
+static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h
index e993e2860ee1..8022d9ea1213 100644
--- a/arch/m68k/include/asm/atomic.h
+++ b/arch/m68k/include/asm/atomic.h
@@ -211,7 +211,7 @@ static inline int atomic_add_negative(int i, atomic_t *v)
 	return c != 0;
 }
 
-static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
+static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index 0ab176bdb8e8..02fc1553cf9b 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -275,7 +275,7 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v)
 #define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
 
 /**
- * __atomic_add_unless - add unless the number is a given value
+ * atomic_fetch_add_unless - add unless the number is a given value
  * @v: pointer of type atomic_t
  * @a: the amount to add to v...
  * @u: ...unless v is equal to u.
@@ -283,7 +283,7 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v)
  * Atomically adds @a to @v, so long as it was not @u.
  * Returns the old value of @v.
  */
-static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
+static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
diff --git a/arch/openrisc/include/asm/atomic.h b/arch/openrisc/include/asm/atomic.h
index 146e1660f00e..b589fac39b92 100644
--- a/arch/openrisc/include/asm/atomic.h
+++ b/arch/openrisc/include/asm/atomic.h
@@ -100,7 +100,7 @@ ATOMIC_OP(xor)
  *
  * This is often used through atomic_inc_not_zero()
  */
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int old, tmp;
 
@@ -119,7 +119,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 
 	return old;
 }
-#define __atomic_add_unless	__atomic_add_unless
+#define atomic_fetch_add_unless	atomic_fetch_add_unless
 
 #include <asm-generic/atomic.h>
 
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 88bae6676c9b..7748abced766 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -78,7 +78,7 @@ static __inline__ int atomic_read(const atomic_t *v)
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 /**
- * __atomic_add_unless - add unless the number is a given value
+ * atomic_fetch_add_unless - add unless the number is a given value
  * @v: pointer of type atomic_t
  * @a: the amount to add to v...
  * @u: ...unless v is equal to u.
@@ -86,7 +86,7 @@ static __inline__ int atomic_read(const atomic_t *v)
  * Atomically adds @a to @v, so long as it was not @u.
  * Returns the old value of @v.
  */
-static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
+static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 682b3e6a1e21..1483261080a1 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -218,7 +218,7 @@ static __inline__ int atomic_dec_return_relaxed(atomic_t *v)
 #define atomic_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new))
 
 /**
- * __atomic_add_unless - add unless the number is a given value
+ * atomic_fetch_add_unless - add unless the number is a given value
  * @v: pointer of type atomic_t
  * @a: the amount to add to v...
  * @u: ...unless v is equal to u.
@@ -226,13 +226,13 @@ static __inline__ int atomic_dec_return_relaxed(atomic_t *v)
  * Atomically adds @a to @v, so long as it was not @u.
  * Returns the old value of @v.
  */
-static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
+static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int t;
 
 	__asm__ __volatile__ (
 	PPC_ATOMIC_ENTRY_BARRIER
-"1:	lwarx	%0,0,%1		# __atomic_add_unless\n\
+"1:	lwarx	%0,0,%1		# atomic_fetch_add_unless\n\
 	cmpw	0,%0,%3 \n\
 	beq	2f \n\
 	add	%0,%2,%0 \n"
@@ -538,7 +538,7 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u)
 
 	__asm__ __volatile__ (
 	PPC_ATOMIC_ENTRY_BARRIER
-"1:	ldarx	%0,0,%1		# __atomic_add_unless\n\
+"1:	ldarx	%0,0,%1		# atomic_fetch_add_unless\n\
 	cmpd	0,%0,%3 \n\
 	beq	2f \n\
 	add	%0,%2,%0 \n"
diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index 855115ace98c..739e810c857e 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -332,7 +332,7 @@ ATOMIC_OP(dec_and_test, dec, ==, 0, 64)
 #undef ATOMIC_OP
 
 /* This is required to provide a full barrier on success. */
-static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
        int prev, rc;
 
@@ -381,7 +381,7 @@ static __always_inline int atomic64_add_unless(atomic64_t *v, long a, long u)
  */
 static __always_inline int atomic_inc_not_zero(atomic_t *v)
 {
-        return __atomic_add_unless(v, 1, 0);
+        return atomic_fetch_add_unless(v, 1, 0);
 }
 
 #ifndef CONFIG_GENERIC_ATOMIC64
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index 4b55532f15c4..c2858cdd8c29 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -90,7 +90,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return __atomic_cmpxchg(&v->counter, old, new);
 }
 
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h
index 0fd0099f43cc..ef45931ebac5 100644
--- a/arch/sh/include/asm/atomic.h
+++ b/arch/sh/include/asm/atomic.h
@@ -46,7 +46,7 @@
 #define atomic_cmpxchg(v, o, n)		(cmpxchg(&((v)->counter), (o), (n)))
 
 /**
- * __atomic_add_unless - add unless the number is a given value
+ * atomic_fetch_add_unless - add unless the number is a given value
  * @v: pointer of type atomic_t
  * @a: the amount to add to v...
  * @u: ...unless v is equal to u.
@@ -54,7 +54,7 @@
  * Atomically adds @a to @v, so long as it was not @u.
  * Returns the old value of @v.
  */
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index d13ce517f4b9..a58f4b43bcc7 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -27,7 +27,7 @@ int atomic_fetch_or(int, atomic_t *);
 int atomic_fetch_xor(int, atomic_t *);
 int atomic_cmpxchg(atomic_t *, int, int);
 int atomic_xchg(atomic_t *, int);
-int __atomic_add_unless(atomic_t *, int, int);
+int atomic_fetch_add_unless(atomic_t *, int, int);
 void atomic_set(atomic_t *, int);
 
 #define atomic_set_release(v, i)	atomic_set((v), (i))
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index 28db058d471b..f416fd3d2708 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -89,7 +89,7 @@ static inline int atomic_xchg(atomic_t *v, int new)
 	return xchg(&v->counter, new);
 }
 
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index 465a901a0ada..281fa634bb1a 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -95,7 +95,7 @@ int atomic_cmpxchg(atomic_t *v, int old, int new)
 }
 EXPORT_SYMBOL(atomic_cmpxchg);
 
-int __atomic_add_unless(atomic_t *v, int a, int u)
+int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int ret;
 	unsigned long flags;
@@ -107,7 +107,7 @@ int __atomic_add_unless(atomic_t *v, int a, int u)
 	spin_unlock_irqrestore(ATOMIC_HASH(v), flags);
 	return ret;
 }
-EXPORT_SYMBOL(__atomic_add_unless);
+EXPORT_SYMBOL(atomic_fetch_add_unless);
 
 /* Atomic operations are already serializing */
 void atomic_set(atomic_t *v, int i)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 0db6bec95489..84ed0bd76aef 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -254,7 +254,7 @@ static inline int arch_atomic_fetch_xor(int i, atomic_t *v)
 }
 
 /**
- * __arch_atomic_add_unless - add unless the number is already a given value
+ * arch_atomic_fetch_add_unless - add unless the number is already a given value
  * @v: pointer of type atomic_t
  * @a: the amount to add to v...
  * @u: ...unless v is equal to u.
@@ -262,7 +262,7 @@ static inline int arch_atomic_fetch_xor(int i, atomic_t *v)
  * Atomically adds @a to @v, so long as @v was not already @u.
  * Returns the old value of @v.
  */
-static __always_inline int __arch_atomic_add_unless(atomic_t *v, int a, int u)
+static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int c = arch_atomic_read(v);
 
diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h
index e7a23f2a519a..4188e56c06c9 100644
--- a/arch/xtensa/include/asm/atomic.h
+++ b/arch/xtensa/include/asm/atomic.h
@@ -275,7 +275,7 @@ ATOMIC_OPS(xor)
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 /**
- * __atomic_add_unless - add unless the number is a given value
+ * atomic_fetch_add_unless - add unless the number is a given value
  * @v: pointer of type atomic_t
  * @a: the amount to add to v...
  * @u: ...unless v is equal to u.
@@ -283,7 +283,7 @@ ATOMIC_OPS(xor)
  * Atomically adds @a to @v, so long as it was not @u.
  * Returns the old value of @v.
  */
-static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
+static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index fa0729c1e776..d81c653b9bf6 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -61,7 +61,7 @@ static int atomic_inc_return_safe(atomic_t *v)
 {
 	unsigned int counter;
 
-	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
+	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
 	if (counter <= (unsigned int)INT_MAX)
 		return (int)counter;
 
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index a6e904973ba8..475910ffbcb6 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -121,7 +121,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj, bool exclusive)
 	 * this lock.
 	 */
 	if (!exclusive)
-		return __atomic_add_unless(&uobj->usecnt, 1, -1) == -1 ?
+		return atomic_fetch_add_unless(&uobj->usecnt, 1, -1) == -1 ?
 			-EBUSY : 0;
 
 	/* lock is either WRITE or DESTROY - should be exclusive */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index a1b18082991b..183cc5418722 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -648,7 +648,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
 	trace_afs_notify_call(rxcall, call);
 	call->need_attention = true;
 
-	u = __atomic_add_unless(&call->usage, 1, 0);
+	u = atomic_fetch_add_unless(&call->usage, 1, 0);
 	if (u != 0) {
 		trace_afs_call(call, afs_call_trace_wake, u,
 			       atomic_read(&call->net->nr_outstanding_calls),
diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h
index ec07f23678ea..b8b14cc2df6c 100644
--- a/include/asm-generic/atomic-instrumented.h
+++ b/include/asm-generic/atomic-instrumented.h
@@ -84,10 +84,10 @@ static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 ne
 }
 #endif
 
-static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	kasan_check_write(v, sizeof(*v));
-	return __arch_atomic_add_unless(v, a, u);
+	return arch_atomic_fetch_add_unless(v, a, u);
 }
 
 
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index abe6dd9ca2a8..10051ed6d088 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -221,8 +221,8 @@ static inline void atomic_dec(atomic_t *v)
 #define atomic_xchg(ptr, v)		(xchg(&(ptr)->counter, (v)))
 #define atomic_cmpxchg(v, old, new)	(cmpxchg(&((v)->counter), (old), (new)))
 
-#ifndef __atomic_add_unless
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+#ifndef atomic_fetch_add_unless
+static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 01ce3997cb42..9cc982936675 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -530,7 +530,7 @@
  */
 static inline int atomic_add_unless(atomic_t *v, int a, int u)
 {
-	return __atomic_add_unless(v, a, u) != u;
+	return atomic_fetch_add_unless(v, a, u) != u;
 }
 
 /**
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35dc466641f2..f12db70d3bf3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -575,7 +575,7 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
 {
 	int refold;
 
-	refold = __atomic_add_unless(&map->refcnt, 1, 0);
+	refold = atomic_fetch_add_unless(&map->refcnt, 1, 0);
 
 	if (refold >= BPF_MAX_REFCNT) {
 		__bpf_map_put(map, false);
@@ -1142,7 +1142,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 {
 	int refold;
 
-	refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0);
+	refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0);
 
 	if (refold >= BPF_MAX_REFCNT) {
 		__bpf_prog_put(prog, false);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index f6734d8cb01a..9486293fef5c 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -415,7 +415,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx,
 bool rxrpc_queue_call(struct rxrpc_call *call)
 {
 	const void *here = __builtin_return_address(0);
-	int n = __atomic_add_unless(&call->usage, 1, 0);
+	int n = atomic_fetch_add_unless(&call->usage, 1, 0);
 	if (n == 0)
 		return false;
 	if (rxrpc_queue_work(&call->processor))
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 4c77a78a252a..77440a356b14 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -266,7 +266,7 @@ void rxrpc_kill_connection(struct rxrpc_connection *conn)
 bool rxrpc_queue_conn(struct rxrpc_connection *conn)
 {
 	const void *here = __builtin_return_address(0);
-	int n = __atomic_add_unless(&conn->usage, 1, 0);
+	int n = atomic_fetch_add_unless(&conn->usage, 1, 0);
 	if (n == 0)
 		return false;
 	if (rxrpc_queue_work(&conn->processor))
@@ -309,7 +309,7 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
 	const void *here = __builtin_return_address(0);
 
 	if (conn) {
-		int n = __atomic_add_unless(&conn->usage, 1, 0);
+		int n = atomic_fetch_add_unless(&conn->usage, 1, 0);
 		if (n > 0)
 			trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here);
 		else
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index b493e6b62740..777c3ed4cfc0 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -305,7 +305,7 @@ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local)
 	const void *here = __builtin_return_address(0);
 
 	if (local) {
-		int n = __atomic_add_unless(&local->usage, 1, 0);
+		int n = atomic_fetch_add_unless(&local->usage, 1, 0);
 		if (n > 0)
 			trace_rxrpc_local(local, rxrpc_local_got, n + 1, here);
 		else
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 1b7e8107b3ae..1cf3b408017a 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -406,7 +406,7 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer)
 	const void *here = __builtin_return_address(0);
 
 	if (peer) {
-		int n = __atomic_add_unless(&peer->usage, 1, 0);
+		int n = atomic_fetch_add_unless(&peer->usage, 1, 0);
 		if (n > 0)
 			trace_rxrpc_peer(peer, rxrpc_peer_got, n + 1, here);
 		else
-- 
cgit v1.2.3


From f74445b6dd6b8f33ed34e005d19ecbb49171dabf Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 21 Jun 2018 13:13:06 +0100
Subject: atomics/treewide: Remove atomic_inc_not_zero_hint()

While documentation suggests atomic_inc_not_zero_hint() will perform better
than atomic_inc_not_zero(), this is unlikely to be the case. No architectures
implement atomic_inc_not_zero_hint() directly, and thus it either falls back to
atomic_inc_not_zero(), or a loop using atomic_cmpxchg().

Whenever the hint does not match the value in memory, the repeated use of
atomic_cmpxchg() will be more expensive than the read that
atomic_inc_not_zero_hint() attempts to avoid. For architectures with LL/SC
atomics, a read cannot be avoided, and it would always be better to use
atomic_inc_not_zero() directly. For other architectures, their own
atomic_inc_not_zero() is likely to be more optimal than an atomic_cmpxchg()
loop regardless.

Generally, atomic_inc_not_zero_hint() is liable to perform worse than
atomic_inc_not_zero(). Further, atomic_inc_not_zero_hint() only exists
for atomic_t, and not atomic64_t or atomic_long_t, and there is only one
user in the kernel tree.

Given all this, let's remove atomic_inc_not_zero_hint(), and migrate the
existing user over to atomic_inc_not_zero().

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/lkml/20180621121321.4761-4-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/atomic.h | 32 --------------------------------
 net/atm/pppoatm.c      |  2 +-
 2 files changed, 1 insertion(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 9cc982936675..5c5620ae5a35 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -571,38 +571,6 @@ static inline int atomic_fetch_andnot_release(int i, atomic_t *v)
 }
 #endif
 
-/**
- * atomic_inc_not_zero_hint - increment if not null
- * @v: pointer of type atomic_t
- * @hint: probable value of the atomic before the increment
- *
- * This version of atomic_inc_not_zero() gives a hint of probable
- * value of the atomic. This helps processor to not read the memory
- * before doing the atomic read/modify/write cycle, lowering
- * number of bus transactions on some arches.
- *
- * Returns: 0 if increment was not done, 1 otherwise.
- */
-#ifndef atomic_inc_not_zero_hint
-static inline int atomic_inc_not_zero_hint(atomic_t *v, int hint)
-{
-	int val, c = hint;
-
-	/* sanity test, should be removed by compiler if hint is a constant */
-	if (!hint)
-		return atomic_inc_not_zero(v);
-
-	do {
-		val = atomic_cmpxchg(v, c, c + 1);
-		if (val == c)
-			return 1;
-		c = val;
-	} while (c);
-
-	return 0;
-}
-#endif
-
 #ifndef atomic_inc_unless_negative
 static inline int atomic_inc_unless_negative(atomic_t *p)
 {
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
index af8c4b38b746..d84227d75717 100644
--- a/net/atm/pppoatm.c
+++ b/net/atm/pppoatm.c
@@ -244,7 +244,7 @@ static int pppoatm_may_send(struct pppoatm_vcc *pvcc, int size)
 	 * the packet count limit, so...
 	 */
 	if (atm_may_send(pvcc->atmvcc, size) &&
-	    atomic_inc_not_zero_hint(&pvcc->inflight, NONE_INFLIGHT))
+	    atomic_inc_not_zero(&pvcc->inflight))
 		return 1;
 
 	/*
-- 
cgit v1.2.3


From ade5ef9280c33993099199c51e2e27c2c4013afd Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 21 Jun 2018 13:13:07 +0100
Subject: atomics: Make conditional ops return 'bool'

Some of the atomics return a status value, which is a boolean value
describing whether the operation was performed. To make it clear that
this is a boolean value, let's update the common fallbacks to return
bool, fixing up the return values and comments likewise.

At the same time, let's simplify the description of the operations in
their respective comments.

The instrumented atomics and generic atomic64 implementation are updated
accordingly.

Note that atomic64_dec_if_positive() doesn't follow the usual test op
pattern, and returns the would-be decremented value. This is not
changed.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/lkml/20180621121321.4761-5-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/atomic-instrumented.h |  2 +-
 include/asm-generic/atomic64.h            |  3 ++-
 include/linux/atomic.h                    | 24 +++++++++++++-----------
 lib/atomic64.c                            |  6 +++---
 4 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h
index b8b14cc2df6c..497faa4a05e3 100644
--- a/include/asm-generic/atomic-instrumented.h
+++ b/include/asm-generic/atomic-instrumented.h
@@ -205,7 +205,7 @@ static __always_inline s64 atomic64_dec_return(atomic64_t *v)
 	return arch_atomic64_dec_return(v);
 }
 
-static __always_inline s64 atomic64_inc_not_zero(atomic64_t *v)
+static __always_inline bool atomic64_inc_not_zero(atomic64_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic64_inc_not_zero(v);
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index 8d28eb010d0d..a951a721e1bb 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -11,6 +11,7 @@
  */
 #ifndef _ASM_GENERIC_ATOMIC64_H
 #define _ASM_GENERIC_ATOMIC64_H
+#include <linux/types.h>
 
 typedef struct {
 	long long counter;
@@ -52,7 +53,7 @@ ATOMIC64_OPS(xor)
 extern long long atomic64_dec_if_positive(atomic64_t *v);
 extern long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n);
 extern long long atomic64_xchg(atomic64_t *v, long long new);
-extern int	 atomic64_add_unless(atomic64_t *v, long long a, long long u);
+extern bool	 atomic64_add_unless(atomic64_t *v, long long a, long long u);
 
 #define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
 #define atomic64_inc(v)			atomic64_add(1LL, (v))
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 5c5620ae5a35..307a7f6d619a 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -2,6 +2,8 @@
 /* Atomic operations usable in machine independent code */
 #ifndef _LINUX_ATOMIC_H
 #define _LINUX_ATOMIC_H
+#include <linux/types.h>
+
 #include <asm/atomic.h>
 #include <asm/barrier.h>
 
@@ -525,10 +527,10 @@
  * @a: the amount to add to v...
  * @u: ...unless v is equal to u.
  *
- * Atomically adds @a to @v, so long as @v was not already @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
+ * Atomically adds @a to @v, if @v was not already @u.
+ * Returns true if the addition was done.
  */
-static inline int atomic_add_unless(atomic_t *v, int a, int u)
+static inline bool atomic_add_unless(atomic_t *v, int a, int u)
 {
 	return atomic_fetch_add_unless(v, a, u) != u;
 }
@@ -537,8 +539,8 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
  * atomic_inc_not_zero - increment unless the number is zero
  * @v: pointer of type atomic_t
  *
- * Atomically increments @v by 1, so long as @v is non-zero.
- * Returns non-zero if @v was non-zero, and zero otherwise.
+ * Atomically increments @v by 1, if @v is non-zero.
+ * Returns true if the increment was done.
  */
 #ifndef atomic_inc_not_zero
 #define atomic_inc_not_zero(v)		atomic_add_unless((v), 1, 0)
@@ -572,28 +574,28 @@ static inline int atomic_fetch_andnot_release(int i, atomic_t *v)
 #endif
 
 #ifndef atomic_inc_unless_negative
-static inline int atomic_inc_unless_negative(atomic_t *p)
+static inline bool atomic_inc_unless_negative(atomic_t *p)
 {
 	int v, v1;
 	for (v = 0; v >= 0; v = v1) {
 		v1 = atomic_cmpxchg(p, v, v + 1);
 		if (likely(v1 == v))
-			return 1;
+			return true;
 	}
-	return 0;
+	return false;
 }
 #endif
 
 #ifndef atomic_dec_unless_positive
-static inline int atomic_dec_unless_positive(atomic_t *p)
+static inline bool atomic_dec_unless_positive(atomic_t *p)
 {
 	int v, v1;
 	for (v = 0; v <= 0; v = v1) {
 		v1 = atomic_cmpxchg(p, v, v - 1);
 		if (likely(v1 == v))
-			return 1;
+			return true;
 	}
-	return 0;
+	return false;
 }
 #endif
 
diff --git a/lib/atomic64.c b/lib/atomic64.c
index 53c2d5edc826..4230f4b8906c 100644
--- a/lib/atomic64.c
+++ b/lib/atomic64.c
@@ -178,16 +178,16 @@ long long atomic64_xchg(atomic64_t *v, long long new)
 }
 EXPORT_SYMBOL(atomic64_xchg);
 
-int atomic64_add_unless(atomic64_t *v, long long a, long long u)
+bool atomic64_add_unless(atomic64_t *v, long long a, long long u)
 {
 	unsigned long flags;
 	raw_spinlock_t *lock = lock_addr(v);
-	int ret = 0;
+	bool ret = false;
 
 	raw_spin_lock_irqsave(lock, flags);
 	if (v->counter != u) {
 		v->counter += a;
-		ret = 1;
+		ret = true;
 	}
 	raw_spin_unlock_irqrestore(lock, flags);
 	return ret;
-- 
cgit v1.2.3


From bef828204a1bc7a0fd3a24551c4265e9c2ab95ed Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 21 Jun 2018 13:13:08 +0100
Subject: atomics/treewide: Make atomic64_inc_not_zero() optional

We define a trivial fallback for atomic_inc_not_zero(), but don't do
the same for atomic64_inc_not_zero(), leading most architectures to
define the same boilerplate.

Let's add a fallback in <linux/atomic.h>, and remove the redundant
implementations. Note that atomic64_add_unless() is always defined in
<linux/atomic.h>, and promotes its arguments to the requisite types, so
we need not do this explicitly.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Palmer Dabbelt <palmer@sifive.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/lkml/20180621121321.4761-6-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/include/asm/atomic.h           |  2 --
 arch/arc/include/asm/atomic.h             |  1 -
 arch/arm/include/asm/atomic.h             |  1 -
 arch/arm64/include/asm/atomic.h           |  2 --
 arch/ia64/include/asm/atomic.h            |  2 --
 arch/mips/include/asm/atomic.h            |  2 --
 arch/parisc/include/asm/atomic.h          |  2 --
 arch/powerpc/include/asm/atomic.h         |  1 +
 arch/riscv/include/asm/atomic.h           |  7 -------
 arch/s390/include/asm/atomic.h            |  1 -
 arch/sparc/include/asm/atomic_64.h        |  2 --
 arch/x86/include/asm/atomic64_32.h        |  2 +-
 arch/x86/include/asm/atomic64_64.h        |  2 --
 include/asm-generic/atomic-instrumented.h |  3 +++
 include/asm-generic/atomic64.h            |  1 -
 include/linux/atomic.h                    | 11 +++++++++++
 16 files changed, 16 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index 392b15a4dd4f..eb0f25e4c5dd 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -296,8 +296,6 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
 	return old - 1;
 }
 
-#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
-
 #define atomic_add_negative(a, v) (atomic_add_return((a), (v)) < 0)
 #define atomic64_add_negative(a, v) (atomic64_add_return((a), (v)) < 0)
 
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index cecdf3403caf..1406825b5e7d 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -603,7 +603,6 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
 #define atomic64_dec(v)			atomic64_sub(1LL, (v))
 #define atomic64_dec_return(v)		atomic64_sub_return(1LL, (v))
 #define atomic64_dec_and_test(v)	(atomic64_dec_return((v)) == 0)
-#define atomic64_inc_not_zero(v)	atomic64_add_unless((v), 1LL, 0LL)
 
 #endif	/* !CONFIG_GENERIC_ATOMIC64 */
 
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 9d56d0727c9b..02f3894faa48 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -534,7 +534,6 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
 #define atomic64_dec(v)			atomic64_sub(1LL, (v))
 #define atomic64_dec_return_relaxed(v)	atomic64_sub_return_relaxed(1LL, (v))
 #define atomic64_dec_and_test(v)	(atomic64_dec_return((v)) == 0)
-#define atomic64_inc_not_zero(v)	atomic64_add_unless((v), 1LL, 0LL)
 
 #endif /* !CONFIG_GENERIC_ATOMIC64 */
 #endif
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 264d20339f74..ad50412889c5 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -204,7 +204,5 @@
 #define atomic64_add_unless(v, a, u)	(___atomic_add_unless(v, a, u, 64) != u)
 #define atomic64_andnot			atomic64_andnot
 
-#define atomic64_inc_not_zero(v)	atomic64_add_unless((v), 1, 0)
-
 #endif
 #endif
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index 9d2ddde5f9d5..93d48b823220 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -246,8 +246,6 @@ static __inline__ long atomic64_add_unless(atomic64_t *v, long a, long u)
 	return c != (u);
 }
 
-#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
-
 static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
 {
 	long c, old, dec;
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index 02fc1553cf9b..502e691c6393 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -644,8 +644,6 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u)
 	return c != (u);
 }
 
-#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
-
 #define atomic64_dec_return(v) atomic64_sub_return(1, (v))
 #define atomic64_inc_return(v) atomic64_add_return(1, (v))
 
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 7748abced766..3fd0243bf405 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -305,8 +305,6 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u)
 	return c != (u);
 }
 
-#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
-
 /*
  * atomic64_dec_if_positive - decrement by 1 if old value positive
  * @v: pointer of type atomic_t
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 1483261080a1..e59620ee4f6b 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -582,6 +582,7 @@ static __inline__ int atomic64_inc_not_zero(atomic64_t *v)
 
 	return t1 != 0;
 }
+#define atomic64_inc_not_zero(v) atomic64_inc_not_zero((v))
 
 #endif /* __powerpc64__ */
 
diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index 0e27e050ba14..18259e90f57e 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -375,13 +375,6 @@ static __always_inline int atomic64_add_unless(atomic64_t *v, long a, long u)
 }
 #endif
 
-#ifndef CONFIG_GENERIC_ATOMIC64
-static __always_inline long atomic64_inc_not_zero(atomic64_t *v)
-{
-        return atomic64_add_unless(v, 1, 0);
-}
-#endif
-
 /*
  * atomic_{cmp,}xchg is required to have exactly the same ordering semantics as
  * {cmp,}xchg and the operations that return, so they need a full barrier.
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index c2858cdd8c29..66dac30a4fe1 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -212,6 +212,5 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
 #define atomic64_dec(_v)		atomic64_sub(1, _v)
 #define atomic64_dec_return(_v)		atomic64_sub_return(1, _v)
 #define atomic64_dec_and_test(_v)	(atomic64_sub_return(1, _v) == 0)
-#define atomic64_inc_not_zero(v)	atomic64_add_unless((v), 1, 0)
 
 #endif /* __ARCH_S390_ATOMIC__  */
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index f416fd3d2708..07830a316464 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -123,8 +123,6 @@ static inline long atomic64_add_unless(atomic64_t *v, long a, long u)
 	return c != (u);
 }
 
-#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
-
 long atomic64_dec_if_positive(atomic64_t *v);
 
 #endif /* !(__ARCH_SPARC64_ATOMIC__) */
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 92212bf0484f..2a33cc17801b 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -295,7 +295,7 @@ static inline int arch_atomic64_add_unless(atomic64_t *v, long long a,
 	return (int)a;
 }
 
-
+#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
 static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
 {
 	int r;
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 6106b59d3260..6f95023894b7 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -207,8 +207,6 @@ static inline bool arch_atomic64_add_unless(atomic64_t *v, long a, long u)
 	return true;
 }
 
-#define arch_atomic64_inc_not_zero(v) arch_atomic64_add_unless((v), 1, 0)
-
 /*
  * arch_atomic64_dec_if_positive - decrement by 1 if old value positive
  * @v: pointer of type atomic_t
diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h
index 497faa4a05e3..83bb88d791c4 100644
--- a/include/asm-generic/atomic-instrumented.h
+++ b/include/asm-generic/atomic-instrumented.h
@@ -205,11 +205,14 @@ static __always_inline s64 atomic64_dec_return(atomic64_t *v)
 	return arch_atomic64_dec_return(v);
 }
 
+#ifdef arch_atomic64_inc_not_zero
+#define atomic64_inc_not_zero atomic64_inc_not_zero
 static __always_inline bool atomic64_inc_not_zero(atomic64_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic64_inc_not_zero(v);
 }
+#endif
 
 static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v)
 {
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index a951a721e1bb..5105275ac825 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -63,6 +63,5 @@ extern bool	 atomic64_add_unless(atomic64_t *v, long long a, long long u);
 #define atomic64_dec(v)			atomic64_sub(1LL, (v))
 #define atomic64_dec_return(v)		atomic64_sub_return(1LL, (v))
 #define atomic64_dec_and_test(v)	(atomic64_dec_return((v)) == 0)
-#define atomic64_inc_not_zero(v) 	atomic64_add_unless((v), 1LL, 0LL)
 
 #endif  /*  _ASM_GENERIC_ATOMIC64_H  */
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 307a7f6d619a..ae3f30923d05 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -1019,6 +1019,17 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 #define atomic64_try_cmpxchg_release	atomic64_try_cmpxchg
 #endif /* atomic64_try_cmpxchg */
 
+/**
+ * atomic64_inc_not_zero - increment unless the number is zero
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically increments @v by 1, if @v is non-zero.
+ * Returns true if the increment was done.
+ */
+#ifndef atomic64_inc_not_zero
+#define atomic64_inc_not_zero(v)	atomic64_add_unless((v), 1, 0)
+#endif
+
 #ifndef atomic64_andnot
 static inline void atomic64_andnot(long long i, atomic64_t *v)
 {
-- 
cgit v1.2.3


From eccc2da8c03f316bba202e15af2be4615f461900 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 21 Jun 2018 13:13:09 +0100
Subject: atomics/treewide: Make atomic_fetch_add_unless() optional

Several architectures these have a near-identical implementation based
on atomic_read() and atomic_cmpxchg() which we can instead define in
<linux/atomic.h>, so let's do so, using something close to the existing
x86 implementation with try_cmpxchg().

Where an architecture provides its own atomic_fetch_add_unless(), it
must define a preprocessor symbol for it. The instrumented atomics are
updated accordingly.

Note that arch/arc's existing atomic_fetch_add_unless() had redundant
barriers, as these are already present in its atomic_cmpxchg()
implementation.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Palmer Dabbelt <palmer@sifive.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vineet Gupta <vgupta@synopsys.com>
Link: https://lore.kernel.org/lkml/20180621121321.4761-7-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/include/asm/atomic.h           |  2 +-
 arch/arc/include/asm/atomic.h             | 28 ----------------------------
 arch/arm/include/asm/atomic.h             | 11 +----------
 arch/arm64/include/asm/atomic.h           |  1 -
 arch/h8300/include/asm/atomic.h           |  1 +
 arch/hexagon/include/asm/atomic.h         |  1 +
 arch/ia64/include/asm/atomic.h            | 16 ----------------
 arch/m68k/include/asm/atomic.h            | 15 ---------------
 arch/mips/include/asm/atomic.h            | 24 ------------------------
 arch/parisc/include/asm/atomic.h          | 24 ------------------------
 arch/powerpc/include/asm/atomic.h         |  1 +
 arch/riscv/include/asm/atomic.h           |  1 +
 arch/s390/include/asm/atomic.h            | 15 ---------------
 arch/sh/include/asm/atomic.h              | 25 -------------------------
 arch/sparc/include/asm/atomic_32.h        |  2 ++
 arch/sparc/include/asm/atomic_64.h        | 15 ---------------
 arch/x86/include/asm/atomic.h             | 21 ---------------------
 arch/xtensa/include/asm/atomic.h          | 24 ------------------------
 include/asm-generic/atomic-instrumented.h |  4 +++-
 include/asm-generic/atomic.h              | 11 -----------
 include/linux/atomic.h                    | 23 +++++++++++++++++++++++
 21 files changed, 34 insertions(+), 231 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index eb0f25e4c5dd..4a800a3424a3 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -235,7 +235,7 @@ static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 	smp_mb();
 	return old;
 }
-
+#define atomic_fetch_add_unless atomic_fetch_add_unless
 
 /**
  * atomic64_add_unless - add unless the number is a given value
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 1406825b5e7d..60da80481c5d 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -308,34 +308,6 @@ ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-/**
- * atomic_fetch_add_unless - add unless the number is a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns the old value of @v
- */
-#define atomic_fetch_add_unless(v, a, u)					\
-({									\
-	int c, old;							\
-									\
-	/*								\
-	 * Explicit full memory barrier needed before/after as		\
-	 * LLOCK/SCOND thmeselves don't provide any such semantics	\
-	 */								\
-	smp_mb();							\
-									\
-	c = atomic_read(v);						\
-	while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c)\
-		c = old;						\
-									\
-	smp_mb();							\
-									\
-	c;								\
-})
-
 #define atomic_inc(v)			atomic_add(1, v)
 #define atomic_dec(v)			atomic_sub(1, v)
 
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 02f3894faa48..74460aa00fa0 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -156,6 +156,7 @@ static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 
 	return oldval;
 }
+#define atomic_fetch_add_unless		atomic_fetch_add_unless
 
 #else /* ARM_ARCH_6 */
 
@@ -215,16 +216,6 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return ret;
 }
 
-static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
-{
-	int c, old;
-
-	c = atomic_read(v);
-	while (c != u && (old = atomic_cmpxchg((v), c, c + a)) != c)
-		c = old;
-	return c;
-}
-
 #endif /* __LINUX_ARM_ARCH__ */
 
 #define ATOMIC_OPS(op, c_op, asm_op)					\
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index ad50412889c5..22c8c43d6689 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -125,7 +125,6 @@
 #define atomic_dec_and_test(v)		(atomic_dec_return(v) == 0)
 #define atomic_sub_and_test(i, v)	(atomic_sub_return((i), (v)) == 0)
 #define atomic_add_negative(i, v)	(atomic_add_return((i), (v)) < 0)
-#define atomic_fetch_add_unless(v, a, u)	___atomic_add_unless(v, a, u,)
 #define atomic_andnot			atomic_andnot
 
 /*
diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h
index 5c856887fdf2..710364946308 100644
--- a/arch/h8300/include/asm/atomic.h
+++ b/arch/h8300/include/asm/atomic.h
@@ -106,5 +106,6 @@ static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 	arch_local_irq_restore(flags);
 	return ret;
 }
+#define atomic_fetch_add_unless		atomic_fetch_add_unless
 
 #endif /* __ARCH_H8300_ATOMIC __ */
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index d2feeba93c44..86c67e9adbfa 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -196,6 +196,7 @@ static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 	);
 	return __oldval;
 }
+#define atomic_fetch_add_unless atomic_fetch_add_unless
 
 #define atomic_inc(v) atomic_add(1, (v))
 #define atomic_dec(v) atomic_sub(1, (v))
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index 93d48b823220..cfe44086338e 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -215,22 +215,6 @@ ATOMIC64_FETCH_OP(xor, ^)
 	(cmpxchg(&((v)->counter), old, new))
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
 
-static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
-{
-	int c, old;
-	c = atomic_read(v);
-	for (;;) {
-		if (unlikely(c == (u)))
-			break;
-		old = atomic_cmpxchg((v), c, c + (a));
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return c;
-}
-
-
 static __inline__ long atomic64_add_unless(atomic64_t *v, long a, long u)
 {
 	long c, old;
diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h
index 8022d9ea1213..596882cda224 100644
--- a/arch/m68k/include/asm/atomic.h
+++ b/arch/m68k/include/asm/atomic.h
@@ -211,19 +211,4 @@ static inline int atomic_add_negative(int i, atomic_t *v)
 	return c != 0;
 }
 
-static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
-{
-	int c, old;
-	c = atomic_read(v);
-	for (;;) {
-		if (unlikely(c == (u)))
-			break;
-		old = atomic_cmpxchg((v), c, c + (a));
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return c;
-}
-
 #endif /* __ARCH_M68K_ATOMIC __ */
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index 502e691c6393..794734e730d9 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -274,30 +274,6 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v)
 #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
 #define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
 
-/**
- * atomic_fetch_add_unless - add unless the number is a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns the old value of @v.
- */
-static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
-{
-	int c, old;
-	c = atomic_read(v);
-	for (;;) {
-		if (unlikely(c == (u)))
-			break;
-		old = atomic_cmpxchg((v), c, c + (a));
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return c;
-}
-
 #define atomic_dec_return(v) atomic_sub_return(1, (v))
 #define atomic_inc_return(v) atomic_add_return(1, (v))
 
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 3fd0243bf405..b2b6261d05e7 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -77,30 +77,6 @@ static __inline__ int atomic_read(const atomic_t *v)
 #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
-/**
- * atomic_fetch_add_unless - add unless the number is a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns the old value of @v.
- */
-static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
-{
-	int c, old;
-	c = atomic_read(v);
-	for (;;) {
-		if (unlikely(c == (u)))
-			break;
-		old = atomic_cmpxchg((v), c, c + (a));
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return c;
-}
-
 #define ATOMIC_OP(op, c_op)						\
 static __inline__ void atomic_##op(int i, atomic_t *v)			\
 {									\
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index e59620ee4f6b..b5646c079c16 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -248,6 +248,7 @@ static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 
 	return t;
 }
+#define atomic_fetch_add_unless atomic_fetch_add_unless
 
 /**
  * atomic_inc_not_zero - increment unless the number is zero
diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index 18259e90f57e..5f161daefcd2 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -349,6 +349,7 @@ static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 		: "memory");
 	return prev;
 }
+#define atomic_fetch_add_unless atomic_fetch_add_unless
 
 #ifndef CONFIG_GENERIC_ATOMIC64
 static __always_inline long __atomic64_add_unless(atomic64_t *v, long a, long u)
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index 66dac30a4fe1..26c6b713a7a3 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -90,21 +90,6 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return __atomic_cmpxchg(&v->counter, old, new);
 }
 
-static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
-{
-	int c, old;
-	c = atomic_read(v);
-	for (;;) {
-		if (unlikely(c == u))
-			break;
-		old = atomic_cmpxchg(v, c, c + a);
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return c;
-}
-
 #define ATOMIC64_INIT(i)  { (i) }
 
 static inline long atomic64_read(const atomic64_t *v)
diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h
index ef45931ebac5..422fac764ca1 100644
--- a/arch/sh/include/asm/atomic.h
+++ b/arch/sh/include/asm/atomic.h
@@ -45,31 +45,6 @@
 #define atomic_xchg(v, new)		(xchg(&((v)->counter), new))
 #define atomic_cmpxchg(v, o, n)		(cmpxchg(&((v)->counter), (o), (n)))
 
-/**
- * atomic_fetch_add_unless - add unless the number is a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns the old value of @v.
- */
-static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
-{
-	int c, old;
-	c = atomic_read(v);
-	for (;;) {
-		if (unlikely(c == (u)))
-			break;
-		old = atomic_cmpxchg((v), c, c + (a));
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-
-	return c;
-}
-
 #endif /* CONFIG_CPU_J2 */
 
 #endif /* __ASM_SH_ATOMIC_H */
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index a58f4b43bcc7..9d7a15acc0c5 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -30,6 +30,8 @@ int atomic_xchg(atomic_t *, int);
 int atomic_fetch_add_unless(atomic_t *, int, int);
 void atomic_set(atomic_t *, int);
 
+#define atomic_fetch_add_unless	atomic_fetch_add_unless
+
 #define atomic_set_release(v, i)	atomic_set((v), (i))
 
 #define atomic_read(v)          READ_ONCE((v)->counter)
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index 07830a316464..e4f1c93db31f 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -89,21 +89,6 @@ static inline int atomic_xchg(atomic_t *v, int new)
 	return xchg(&v->counter, new);
 }
 
-static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
-{
-	int c, old;
-	c = atomic_read(v);
-	for (;;) {
-		if (unlikely(c == (u)))
-			break;
-		old = atomic_cmpxchg((v), c, c + (a));
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return c;
-}
-
 #define atomic64_cmpxchg(v, o, n) \
 	((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n)))
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 84ed0bd76aef..616327ac9d39 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -253,27 +253,6 @@ static inline int arch_atomic_fetch_xor(int i, atomic_t *v)
 	return val;
 }
 
-/**
- * arch_atomic_fetch_add_unless - add unless the number is already a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as @v was not already @u.
- * Returns the old value of @v.
- */
-static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u)
-{
-	int c = arch_atomic_read(v);
-
-	do {
-		if (unlikely(c == u))
-			break;
-	} while (!arch_atomic_try_cmpxchg(v, &c, c + a));
-
-	return c;
-}
-
 #ifdef CONFIG_X86_32
 # include <asm/atomic64_32.h>
 #else
diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h
index 4188e56c06c9..f4c9f82c40c6 100644
--- a/arch/xtensa/include/asm/atomic.h
+++ b/arch/xtensa/include/asm/atomic.h
@@ -274,30 +274,6 @@ ATOMIC_OPS(xor)
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
-/**
- * atomic_fetch_add_unless - add unless the number is a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns the old value of @v.
- */
-static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u)
-{
-	int c, old;
-	c = atomic_read(v);
-	for (;;) {
-		if (unlikely(c == (u)))
-			break;
-		old = atomic_cmpxchg((v), c, c + (a));
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return c;
-}
-
 #endif /* __KERNEL__ */
 
 #endif /* _XTENSA_ATOMIC_H */
diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h
index 83bb88d791c4..1f9b2a767d3c 100644
--- a/include/asm-generic/atomic-instrumented.h
+++ b/include/asm-generic/atomic-instrumented.h
@@ -84,12 +84,14 @@ static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 ne
 }
 #endif
 
+#ifdef arch_atomic_fetch_add_unless
+#define atomic_fetch_add_unless atomic_fetch_add_unless
 static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic_fetch_add_unless(v, a, u);
 }
-
+#endif
 
 static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
 {
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 10051ed6d088..757e45821220 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -221,15 +221,4 @@ static inline void atomic_dec(atomic_t *v)
 #define atomic_xchg(ptr, v)		(xchg(&(ptr)->counter, (v)))
 #define atomic_cmpxchg(v, old, new)	(cmpxchg(&((v)->counter), (old), (new)))
 
-#ifndef atomic_fetch_add_unless
-static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
-{
-	int c, old;
-	c = atomic_read(v);
-	while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c)
-		c = old;
-	return c;
-}
-#endif
-
 #endif /* __ASM_GENERIC_ATOMIC_H */
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index ae3f30923d05..b89ba36cab94 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -521,6 +521,29 @@
 #endif
 #endif /* xchg_relaxed */
 
+/**
+ * atomic_fetch_add_unless - add unless the number is already a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, if @v was not already @u.
+ * Returns the original value of @v.
+ */
+#ifndef atomic_fetch_add_unless
+static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
+{
+	int c = atomic_read(v);
+
+	do {
+		if (unlikely(c == u))
+			break;
+	} while (!atomic_try_cmpxchg(v, &c, c + a));
+
+	return c;
+}
+#endif
+
 /**
  * atomic_add_unless - add unless the number is already a given value
  * @v: pointer of type atomic_t
-- 
cgit v1.2.3


From 0ae1d994020d75ac065fd42ac4cbf5ac6ce9b255 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 21 Jun 2018 13:13:10 +0100
Subject: atomics: Prepare for atomic64_fetch_add_unless()

Currently all architectures must implement atomic_fetch_add_unless(),
with common code providing atomic_add_unless(). Architectures must also
implement atomic64_add_unless() directly, with no corresponding
atomic64_fetch_add_unless().

This divergence is unfortunate, and means that the APIs for atomic_t,
atomic64_t, and atomic_long_t differ.

In preparation for unifying things, with architectures providing
atomic64_fetch_add_unless, this patch adds a generic
atomic64_add_unless() which will use atomic64_fetch_add_unless(). The
instrumented atomics are updated to take this case into account.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Albert Ou <albert@sifive.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@sifive.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vineet Gupta <vgupta@synopsys.com>
Link: https://lore.kernel.org/lkml/20180621121321.4761-8-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/atomic-instrumented.h |  9 +++++++++
 include/linux/atomic.h                    | 16 ++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h
index 1f9b2a767d3c..444bf2f9d54d 100644
--- a/include/asm-generic/atomic-instrumented.h
+++ b/include/asm-generic/atomic-instrumented.h
@@ -93,11 +93,20 @@ static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 }
 #endif
 
+#ifdef arch_atomic64_fetch_add_unless
+#define atomic64_fetch_add_unless atomic64_fetch_add_unless
+static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
+{
+	kasan_check_write(v, sizeof(*v));
+	return arch_atomic64_fetch_add_unless(v, a, u);
+}
+#else
 static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic64_add_unless(v, a, u);
 }
+#endif
 
 static __always_inline void atomic_inc(atomic_t *v)
 {
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index b89ba36cab94..3c03de648007 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -1042,6 +1042,22 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 #define atomic64_try_cmpxchg_release	atomic64_try_cmpxchg
 #endif /* atomic64_try_cmpxchg */
 
+/**
+ * atomic64_add_unless - add unless the number is already a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, if @v was not already @u.
+ * Returns true if the addition was done.
+ */
+#ifdef atomic64_fetch_add_unless
+static inline bool atomic64_add_unless(atomic64_t *v, long long a, long long u)
+{
+	return atomic64_fetch_add_unless(v, a, u) != u;
+}
+#endif
+
 /**
  * atomic64_inc_not_zero - increment unless the number is zero
  * @v: pointer of type atomic64_t
-- 
cgit v1.2.3


From 356701329fb391184618eda7b7fb68cb35271506 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 21 Jun 2018 13:13:17 +0100
Subject: atomics/treewide: Make atomic64_fetch_add_unless() optional

Architectures with atomic64_fetch_add_unless() provide a preprocessor
symbol if they do so, and all other architectures have trivial C
implementations of atomic64_add_unless() which are near-identical.

Let's unify the trivial definitions of atomic64_fetch_add_unless() in
<linux/atomic.h>, so that we always have both
atomic64_fetch_add_unless() and atomic64_add_unless() with less
boilerplate code.

This means that atomic64_add_unless() is always implemented in core
code, and the instrumented atomics are updated accordingly.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/lkml/20180621121321.4761-15-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm64/include/asm/atomic.h           | 12 ------------
 arch/ia64/include/asm/atomic.h            | 15 ---------------
 arch/mips/include/asm/atomic.h            | 24 ------------------------
 arch/parisc/include/asm/atomic.h          | 24 ------------------------
 arch/s390/include/asm/atomic.h            | 16 ----------------
 arch/sparc/include/asm/atomic_64.h        | 15 ---------------
 arch/x86/include/asm/atomic64_64.h        | 19 -------------------
 include/asm-generic/atomic-instrumented.h |  6 ------
 include/linux/atomic.h                    | 26 ++++++++++++++++++++++++--
 9 files changed, 24 insertions(+), 133 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 22c8c43d6689..82db0e4febd4 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -40,17 +40,6 @@
 
 #include <asm/cmpxchg.h>
 
-#define ___atomic_add_unless(v, a, u, sfx)				\
-({									\
-	typeof((v)->counter) c, old;					\
-									\
-	c = atomic##sfx##_read(v);					\
-	while (c != (u) &&						\
-	      (old = atomic##sfx##_cmpxchg((v), c, c + (a))) != c)	\
-		c = old;						\
-	c;								\
- })
-
 #define ATOMIC_INIT(i)	{ (i) }
 
 #define atomic_read(v)			READ_ONCE((v)->counter)
@@ -200,7 +189,6 @@
 #define atomic64_dec_and_test(v)	(atomic64_dec_return(v) == 0)
 #define atomic64_sub_and_test(i, v)	(atomic64_sub_return((i), (v)) == 0)
 #define atomic64_add_negative(i, v)	(atomic64_add_return((i), (v)) < 0)
-#define atomic64_add_unless(v, a, u)	(___atomic_add_unless(v, a, u, 64) != u)
 #define atomic64_andnot			atomic64_andnot
 
 #endif
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index cfe44086338e..0f80a3eafaba 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -215,21 +215,6 @@ ATOMIC64_FETCH_OP(xor, ^)
 	(cmpxchg(&((v)->counter), old, new))
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
 
-static __inline__ long atomic64_add_unless(atomic64_t *v, long a, long u)
-{
-	long c, old;
-	c = atomic64_read(v);
-	for (;;) {
-		if (unlikely(c == (u)))
-			break;
-		old = atomic64_cmpxchg((v), c, c + (a));
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return c != (u);
-}
-
 static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
 {
 	long c, old, dec;
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index 794734e730d9..d42b27df1548 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -596,30 +596,6 @@ static __inline__ long atomic64_sub_if_positive(long i, atomic64_t * v)
 	((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n)))
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), (new)))
 
-/**
- * atomic64_add_unless - add unless the number is a given value
- * @v: pointer of type atomic64_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns true iff @v was not @u.
- */
-static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u)
-{
-	long c, old;
-	c = atomic64_read(v);
-	for (;;) {
-		if (unlikely(c == (u)))
-			break;
-		old = atomic64_cmpxchg((v), c, c + (a));
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return c != (u);
-}
-
 #define atomic64_dec_return(v) atomic64_sub_return(1, (v))
 #define atomic64_inc_return(v) atomic64_add_return(1, (v))
 
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index b2b6261d05e7..f53ba2d6ff67 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -257,30 +257,6 @@ atomic64_read(const atomic64_t *v)
 	((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n)))
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
 
-/**
- * atomic64_add_unless - add unless the number is a given value
- * @v: pointer of type atomic64_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns the old value of @v.
- */
-static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u)
-{
-	long c, old;
-	c = atomic64_read(v);
-	for (;;) {
-		if (unlikely(c == (u)))
-			break;
-		old = atomic64_cmpxchg((v), c, c + (a));
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return c != (u);
-}
-
 /*
  * atomic64_dec_if_positive - decrement by 1 if old value positive
  * @v: pointer of type atomic_t
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index 26c6b713a7a3..eb9329741bad 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -153,22 +153,6 @@ ATOMIC64_OPS(xor)
 
 #undef ATOMIC64_OPS
 
-static inline int atomic64_add_unless(atomic64_t *v, long i, long u)
-{
-	long c, old;
-
-	c = atomic64_read(v);
-	for (;;) {
-		if (unlikely(c == u))
-			break;
-		old = atomic64_cmpxchg(v, c, c + i);
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return c != u;
-}
-
 static inline long atomic64_dec_if_positive(atomic64_t *v)
 {
 	long c, old, dec;
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index e4f1c93db31f..458783e99997 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -93,21 +93,6 @@ static inline int atomic_xchg(atomic_t *v, int new)
 	((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n)))
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
 
-static inline long atomic64_add_unless(atomic64_t *v, long a, long u)
-{
-	long c, old;
-	c = atomic64_read(v);
-	for (;;) {
-		if (unlikely(c == (u)))
-			break;
-		old = atomic64_cmpxchg((v), c, c + (a));
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return c != (u);
-}
-
 long atomic64_dec_if_positive(atomic64_t *v);
 
 #endif /* !(__ARCH_SPARC64_ATOMIC__) */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 6f95023894b7..7e04b294e6eb 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -188,25 +188,6 @@ static inline long arch_atomic64_xchg(atomic64_t *v, long new)
 	return xchg(&v->counter, new);
 }
 
-/**
- * arch_atomic64_add_unless - add unless the number is a given value
- * @v: pointer of type atomic64_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns the old value of @v.
- */
-static inline bool arch_atomic64_add_unless(atomic64_t *v, long a, long u)
-{
-	s64 c = arch_atomic64_read(v);
-	do {
-		if (unlikely(c == u))
-			return false;
-	} while (!arch_atomic64_try_cmpxchg(v, &c, c + a));
-	return true;
-}
-
 /*
  * arch_atomic64_dec_if_positive - decrement by 1 if old value positive
  * @v: pointer of type atomic_t
diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h
index 444bf2f9d54d..2b487f28ef35 100644
--- a/include/asm-generic/atomic-instrumented.h
+++ b/include/asm-generic/atomic-instrumented.h
@@ -100,12 +100,6 @@ static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic64_fetch_add_unless(v, a, u);
 }
-#else
-static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
-{
-	kasan_check_write(v, sizeof(*v));
-	return arch_atomic64_add_unless(v, a, u);
-}
 #endif
 
 static __always_inline void atomic_inc(atomic_t *v)
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 3c03de648007..530562ac7909 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -1042,6 +1042,30 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 #define atomic64_try_cmpxchg_release	atomic64_try_cmpxchg
 #endif /* atomic64_try_cmpxchg */
 
+/**
+ * atomic64_fetch_add_unless - add unless the number is already a given value
+ * @v: pointer of type atomic64_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, if @v was not already @u.
+ * Returns the original value of @v.
+ */
+#ifndef atomic64_fetch_add_unless
+static inline long long atomic64_fetch_add_unless(atomic64_t *v, long long a,
+						  long long u)
+{
+	long long c = atomic64_read(v);
+
+	do {
+		if (unlikely(c == u))
+			break;
+	} while (!atomic64_try_cmpxchg(v, &c, c + a));
+
+	return c;
+}
+#endif
+
 /**
  * atomic64_add_unless - add unless the number is already a given value
  * @v: pointer of type atomic_t
@@ -1051,12 +1075,10 @@ static inline int atomic_dec_if_positive(atomic_t *v)
  * Atomically adds @a to @v, if @v was not already @u.
  * Returns true if the addition was done.
  */
-#ifdef atomic64_fetch_add_unless
 static inline bool atomic64_add_unless(atomic64_t *v, long long a, long long u)
 {
 	return atomic64_fetch_add_unless(v, a, u) != u;
 }
-#endif
 
 /**
  * atomic64_inc_not_zero - increment unless the number is zero
-- 
cgit v1.2.3


From 18cc1814d4e7560412c9c8c6d28f9d6782c8b402 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 21 Jun 2018 13:13:18 +0100
Subject: atomics/treewide: Make test ops optional

Some of the atomics return the result of a test applied after the atomic
operation, and almost all architectures implement these as trivial
wrappers around the underlying atomic. Specifically:

 * <atomic>_inc_and_test(v)    is (<atomic>_inc_return(v)    == 0)
 * <atomic>_dec_and_test(v)    is (<atomic>_dec_return(v)    == 0)
 * <atomic>_sub_and_test(i, v) is (<atomic>_sub_return(i, v) == 0)
 * <atomic>_add_negative(i, v) is (<atomic>_add_return(i, v)  < 0)

Rather than have these definitions duplicated in all architectures, with
minor inconsistencies in formatting and documentation, let's make these
operations optional, with default fallbacks as above. Implementations
must now provide a preprocessor symbol.

The instrumented atomics are updated accordingly.

Both x86 and m68k have custom implementations, which are left as-is,
given preprocessor symbols to avoid being overridden.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Palmer Dabbelt <palmer@sifive.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/lkml/20180621121321.4761-16-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/include/asm/atomic.h           |  12 ---
 arch/arc/include/asm/atomic.h             |  10 ---
 arch/arm/include/asm/atomic.h             |   9 ---
 arch/arm64/include/asm/atomic.h           |   8 --
 arch/h8300/include/asm/atomic.h           |   5 --
 arch/hexagon/include/asm/atomic.h         |   5 --
 arch/ia64/include/asm/atomic.h            |  23 ------
 arch/m68k/include/asm/atomic.h            |   4 +
 arch/mips/include/asm/atomic.h            |  84 --------------------
 arch/parisc/include/asm/atomic.h          |  22 ------
 arch/powerpc/include/asm/atomic.h         |  30 --------
 arch/riscv/include/asm/atomic.h           |  46 -----------
 arch/s390/include/asm/atomic.h            |   8 --
 arch/sh/include/asm/atomic.h              |   4 -
 arch/sparc/include/asm/atomic_32.h        |  15 ----
 arch/sparc/include/asm/atomic_64.h        |  20 -----
 arch/x86/include/asm/atomic.h             |   4 +
 arch/x86/include/asm/atomic64_32.h        |  54 -------------
 arch/x86/include/asm/atomic64_64.h        |   4 +
 arch/xtensa/include/asm/atomic.h          |  42 ----------
 include/asm-generic/atomic-instrumented.h |  24 ++++++
 include/asm-generic/atomic.h              |   9 ---
 include/asm-generic/atomic64.h            |   4 -
 include/linux/atomic.h                    | 124 ++++++++++++++++++++++++++++++
 24 files changed, 160 insertions(+), 410 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index cc486dbb3837..25f8693c5a42 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -297,24 +297,12 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
 	return old - 1;
 }
 
-#define atomic_add_negative(a, v) (atomic_add_return((a), (v)) < 0)
-#define atomic64_add_negative(a, v) (atomic64_add_return((a), (v)) < 0)
-
 #define atomic_dec_return(v) atomic_sub_return(1,(v))
 #define atomic64_dec_return(v) atomic64_sub_return(1,(v))
 
 #define atomic_inc_return(v) atomic_add_return(1,(v))
 #define atomic64_inc_return(v) atomic64_add_return(1,(v))
 
-#define atomic_sub_and_test(i,v) (atomic_sub_return((i), (v)) == 0)
-#define atomic64_sub_and_test(i,v) (atomic64_sub_return((i), (v)) == 0)
-
-#define atomic_inc_and_test(v) (atomic_add_return(1, (v)) == 0)
-#define atomic64_inc_and_test(v) (atomic64_add_return(1, (v)) == 0)
-
-#define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0)
-#define atomic64_dec_and_test(v) (atomic64_sub_return(1, (v)) == 0)
-
 #define atomic_inc(v) atomic_add(1,(v))
 #define atomic64_inc(v) atomic64_add(1,(v))
 
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 4917ffa61579..4222e726f84c 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -311,14 +311,8 @@ ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3)
 #define atomic_inc(v)			atomic_add(1, v)
 #define atomic_dec(v)			atomic_sub(1, v)
 
-#define atomic_inc_and_test(v)		(atomic_add_return(1, v) == 0)
-#define atomic_dec_and_test(v)		(atomic_sub_return(1, v) == 0)
 #define atomic_inc_return(v)		atomic_add_return(1, (v))
 #define atomic_dec_return(v)		atomic_sub_return(1, (v))
-#define atomic_sub_and_test(i, v)	(atomic_sub_return(i, v) == 0)
-
-#define atomic_add_negative(i, v)	(atomic_add_return(i, v) < 0)
-
 
 #ifdef CONFIG_GENERIC_ATOMIC64
 
@@ -566,14 +560,10 @@ static inline long long atomic64_fetch_add_unless(atomic64_t *v, long long a,
 }
 #define atomic64_fetch_add_unless atomic64_fetch_add_unless
 
-#define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
 #define atomic64_inc(v)			atomic64_add(1LL, (v))
 #define atomic64_inc_return(v)		atomic64_add_return(1LL, (v))
-#define atomic64_inc_and_test(v)	(atomic64_inc_return(v) == 0)
-#define atomic64_sub_and_test(a, v)	(atomic64_sub_return((a), (v)) == 0)
 #define atomic64_dec(v)			atomic64_sub(1LL, (v))
 #define atomic64_dec_return(v)		atomic64_sub_return(1LL, (v))
-#define atomic64_dec_and_test(v)	(atomic64_dec_return((v)) == 0)
 
 #endif	/* !CONFIG_GENERIC_ATOMIC64 */
 
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 852e1fee72b0..35fb7f504daa 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -248,13 +248,8 @@ ATOMIC_OPS(xor, ^=, eor)
 #define atomic_inc(v)		atomic_add(1, v)
 #define atomic_dec(v)		atomic_sub(1, v)
 
-#define atomic_inc_and_test(v)	(atomic_add_return(1, v) == 0)
-#define atomic_dec_and_test(v)	(atomic_sub_return(1, v) == 0)
 #define atomic_inc_return_relaxed(v)    (atomic_add_return_relaxed(1, v))
 #define atomic_dec_return_relaxed(v)    (atomic_sub_return_relaxed(1, v))
-#define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0)
-
-#define atomic_add_negative(i,v) (atomic_add_return(i, v) < 0)
 
 #ifndef CONFIG_GENERIC_ATOMIC64
 typedef struct {
@@ -517,14 +512,10 @@ static inline long long atomic64_fetch_add_unless(atomic64_t *v, long long a,
 }
 #define atomic64_fetch_add_unless atomic64_fetch_add_unless
 
-#define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
 #define atomic64_inc(v)			atomic64_add(1LL, (v))
 #define atomic64_inc_return_relaxed(v)	atomic64_add_return_relaxed(1LL, (v))
-#define atomic64_inc_and_test(v)	(atomic64_inc_return(v) == 0)
-#define atomic64_sub_and_test(a, v)	(atomic64_sub_return((a), (v)) == 0)
 #define atomic64_dec(v)			atomic64_sub(1LL, (v))
 #define atomic64_dec_return_relaxed(v)	atomic64_sub_return_relaxed(1LL, (v))
-#define atomic64_dec_and_test(v)	(atomic64_dec_return((v)) == 0)
 
 #endif /* !CONFIG_GENERIC_ATOMIC64 */
 #endif
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 82db0e4febd4..edbe53fa3106 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -110,10 +110,6 @@
 
 #define atomic_inc(v)			atomic_add(1, (v))
 #define atomic_dec(v)			atomic_sub(1, (v))
-#define atomic_inc_and_test(v)		(atomic_inc_return(v) == 0)
-#define atomic_dec_and_test(v)		(atomic_dec_return(v) == 0)
-#define atomic_sub_and_test(i, v)	(atomic_sub_return((i), (v)) == 0)
-#define atomic_add_negative(i, v)	(atomic_add_return((i), (v)) < 0)
 #define atomic_andnot			atomic_andnot
 
 /*
@@ -185,10 +181,6 @@
 
 #define atomic64_inc(v)			atomic64_add(1, (v))
 #define atomic64_dec(v)			atomic64_sub(1, (v))
-#define atomic64_inc_and_test(v)	(atomic64_inc_return(v) == 0)
-#define atomic64_dec_and_test(v)	(atomic64_dec_return(v) == 0)
-#define atomic64_sub_and_test(i, v)	(atomic64_sub_return((i), (v)) == 0)
-#define atomic64_add_negative(i, v)	(atomic64_add_return((i), (v)) < 0)
 #define atomic64_andnot			atomic64_andnot
 
 #endif
diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h
index 710364946308..8977b5157c8f 100644
--- a/arch/h8300/include/asm/atomic.h
+++ b/arch/h8300/include/asm/atomic.h
@@ -69,17 +69,12 @@ ATOMIC_OPS(sub, -=)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-#define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
-#define atomic_sub_and_test(i, v)	(atomic_sub_return(i, v) == 0)
-
 #define atomic_inc_return(v)		atomic_add_return(1, v)
 #define atomic_dec_return(v)		atomic_sub_return(1, v)
 
 #define atomic_inc(v)			(void)atomic_inc_return(v)
-#define atomic_inc_and_test(v)		(atomic_inc_return(v) == 0)
 
 #define atomic_dec(v)			(void)atomic_dec_return(v)
-#define atomic_dec_and_test(v)		(atomic_dec_return(v) == 0)
 
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index 86c67e9adbfa..31638f511674 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -201,11 +201,6 @@ static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 #define atomic_inc(v) atomic_add(1, (v))
 #define atomic_dec(v) atomic_sub(1, (v))
 
-#define atomic_inc_and_test(v) (atomic_add_return(1, (v)) == 0)
-#define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0)
-#define atomic_sub_and_test(i, v) (atomic_sub_return(i, (v)) == 0)
-#define atomic_add_negative(i, v) (atomic_add_return(i, (v)) < 0)
-
 #define atomic_inc_return(v) (atomic_add_return(1, v))
 #define atomic_dec_return(v) (atomic_sub_return(1, v))
 
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index 0f80a3eafaba..e4143c462e65 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -231,34 +231,11 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
 	return dec;
 }
 
-/*
- * Atomically add I to V and return TRUE if the resulting value is
- * negative.
- */
-static __inline__ int
-atomic_add_negative (int i, atomic_t *v)
-{
-	return atomic_add_return(i, v) < 0;
-}
-
-static __inline__ long
-atomic64_add_negative (__s64 i, atomic64_t *v)
-{
-	return atomic64_add_return(i, v) < 0;
-}
-
 #define atomic_dec_return(v)		atomic_sub_return(1, (v))
 #define atomic_inc_return(v)		atomic_add_return(1, (v))
 #define atomic64_dec_return(v)		atomic64_sub_return(1, (v))
 #define atomic64_inc_return(v)		atomic64_add_return(1, (v))
 
-#define atomic_sub_and_test(i,v)	(atomic_sub_return((i), (v)) == 0)
-#define atomic_dec_and_test(v)		(atomic_sub_return(1, (v)) == 0)
-#define atomic_inc_and_test(v)		(atomic_add_return(1, (v)) == 0)
-#define atomic64_sub_and_test(i,v)	(atomic64_sub_return((i), (v)) == 0)
-#define atomic64_dec_and_test(v)	(atomic64_sub_return(1, (v)) == 0)
-#define atomic64_inc_and_test(v)	(atomic64_add_return(1, (v)) == 0)
-
 #define atomic_add(i,v)			(void)atomic_add_return((i), (v))
 #define atomic_sub(i,v)			(void)atomic_sub_return((i), (v))
 #define atomic_inc(v)			atomic_add(1, (v))
diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h
index 596882cda224..9df09c876fa2 100644
--- a/arch/m68k/include/asm/atomic.h
+++ b/arch/m68k/include/asm/atomic.h
@@ -138,6 +138,7 @@ static inline int atomic_dec_and_test(atomic_t *v)
 	__asm__ __volatile__("subql #1,%1; seq %0" : "=d" (c), "+m" (*v));
 	return c != 0;
 }
+#define atomic_dec_and_test atomic_dec_and_test
 
 static inline int atomic_dec_and_test_lt(atomic_t *v)
 {
@@ -155,6 +156,7 @@ static inline int atomic_inc_and_test(atomic_t *v)
 	__asm__ __volatile__("addql #1,%1; seq %0" : "=d" (c), "+m" (*v));
 	return c != 0;
 }
+#define atomic_inc_and_test atomic_inc_and_test
 
 #ifdef CONFIG_RMW_INSNS
 
@@ -201,6 +203,7 @@ static inline int atomic_sub_and_test(int i, atomic_t *v)
 			     : ASM_DI (i));
 	return c != 0;
 }
+#define atomic_sub_and_test atomic_sub_and_test
 
 static inline int atomic_add_negative(int i, atomic_t *v)
 {
@@ -210,5 +213,6 @@ static inline int atomic_add_negative(int i, atomic_t *v)
 			     : ASM_DI (i));
 	return c != 0;
 }
+#define atomic_add_negative atomic_add_negative
 
 #endif /* __ARCH_M68K_ATOMIC __ */
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index d42b27df1548..fd3008ae164c 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -277,37 +277,6 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v)
 #define atomic_dec_return(v) atomic_sub_return(1, (v))
 #define atomic_inc_return(v) atomic_add_return(1, (v))
 
-/*
- * atomic_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-#define atomic_sub_and_test(i, v) (atomic_sub_return((i), (v)) == 0)
-
-/*
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-#define atomic_inc_and_test(v) (atomic_inc_return(v) == 0)
-
-/*
- * atomic_dec_and_test - decrement by 1 and test
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-#define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0)
-
 /*
  * atomic_dec_if_positive - decrement by 1 if old value positive
  * @v: pointer of type atomic_t
@@ -330,17 +299,6 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v)
  */
 #define atomic_dec(v) atomic_sub(1, (v))
 
-/*
- * atomic_add_negative - add and test if negative
- * @v: pointer of type atomic_t
- * @i: integer value to add
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-#define atomic_add_negative(i, v) (atomic_add_return(i, (v)) < 0)
-
 #ifdef CONFIG_64BIT
 
 #define ATOMIC64_INIT(i)    { (i) }
@@ -599,37 +557,6 @@ static __inline__ long atomic64_sub_if_positive(long i, atomic64_t * v)
 #define atomic64_dec_return(v) atomic64_sub_return(1, (v))
 #define atomic64_inc_return(v) atomic64_add_return(1, (v))
 
-/*
- * atomic64_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer of type atomic64_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-#define atomic64_sub_and_test(i, v) (atomic64_sub_return((i), (v)) == 0)
-
-/*
- * atomic64_inc_and_test - increment and test
- * @v: pointer of type atomic64_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-#define atomic64_inc_and_test(v) (atomic64_inc_return(v) == 0)
-
-/*
- * atomic64_dec_and_test - decrement by 1 and test
- * @v: pointer of type atomic64_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-#define atomic64_dec_and_test(v) (atomic64_sub_return(1, (v)) == 0)
-
 /*
  * atomic64_dec_if_positive - decrement by 1 if old value positive
  * @v: pointer of type atomic64_t
@@ -652,17 +579,6 @@ static __inline__ long atomic64_sub_if_positive(long i, atomic64_t * v)
  */
 #define atomic64_dec(v) atomic64_sub(1, (v))
 
-/*
- * atomic64_add_negative - add and test if negative
- * @v: pointer of type atomic64_t
- * @i: integer value to add
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-#define atomic64_add_negative(i, v) (atomic64_add_return(i, (v)) < 0)
-
 #endif /* CONFIG_64BIT */
 
 #endif /* _ASM_ATOMIC_H */
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index f53ba2d6ff67..f85844ff6336 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -142,22 +142,6 @@ ATOMIC_OPS(xor, ^=)
 #define atomic_inc_return(v)	(atomic_add_return(   1,(v)))
 #define atomic_dec_return(v)	(atomic_add_return(  -1,(v)))
 
-#define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
-
-/*
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-#define atomic_inc_and_test(v) (atomic_inc_return(v) == 0)
-
-#define atomic_dec_and_test(v)	(atomic_dec_return(v) == 0)
-
-#define atomic_sub_and_test(i,v)	(atomic_sub_return((i),(v)) == 0)
-
 #define ATOMIC_INIT(i)	{ (i) }
 
 #ifdef CONFIG_64BIT
@@ -246,12 +230,6 @@ atomic64_read(const atomic64_t *v)
 #define atomic64_inc_return(v)		(atomic64_add_return(   1,(v)))
 #define atomic64_dec_return(v)		(atomic64_add_return(  -1,(v)))
 
-#define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
-
-#define atomic64_inc_and_test(v) 	(atomic64_inc_return(v) == 0)
-#define atomic64_dec_and_test(v)	(atomic64_dec_return(v) == 0)
-#define atomic64_sub_and_test(i,v)	(atomic64_sub_return((i),(v)) == 0)
-
 /* exported interface */
 #define atomic64_cmpxchg(v, o, n) \
 	((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n)))
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 233dbf31911c..5d76f05d2be3 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -129,8 +129,6 @@ ATOMIC_OPS(xor, xor)
 #undef ATOMIC_OP_RETURN_RELAXED
 #undef ATOMIC_OP
 
-#define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
-
 static __inline__ void atomic_inc(atomic_t *v)
 {
 	int t;
@@ -163,16 +161,6 @@ static __inline__ int atomic_inc_return_relaxed(atomic_t *v)
 	return t;
 }
 
-/*
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-#define atomic_inc_and_test(v) (atomic_inc_return(v) == 0)
-
 static __inline__ void atomic_dec(atomic_t *v)
 {
 	int t;
@@ -281,9 +269,6 @@ static __inline__ int atomic_inc_not_zero(atomic_t *v)
 }
 #define atomic_inc_not_zero(v) atomic_inc_not_zero((v))
 
-#define atomic_sub_and_test(a, v)	(atomic_sub_return((a), (v)) == 0)
-#define atomic_dec_and_test(v)		(atomic_dec_return((v)) == 0)
-
 /*
  * Atomically test *v and decrement if it is greater than 0.
  * The function returns the old value of *v minus 1, even if
@@ -413,8 +398,6 @@ ATOMIC64_OPS(xor, xor)
 #undef ATOMIC64_OP_RETURN_RELAXED
 #undef ATOMIC64_OP
 
-#define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
-
 static __inline__ void atomic64_inc(atomic64_t *v)
 {
 	long t;
@@ -445,16 +428,6 @@ static __inline__ long atomic64_inc_return_relaxed(atomic64_t *v)
 	return t;
 }
 
-/*
- * atomic64_inc_and_test - increment and test
- * @v: pointer of type atomic64_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-#define atomic64_inc_and_test(v) (atomic64_inc_return(v) == 0)
-
 static __inline__ void atomic64_dec(atomic64_t *v)
 {
 	long t;
@@ -488,9 +461,6 @@ static __inline__ long atomic64_dec_return_relaxed(atomic64_t *v)
 #define atomic64_inc_return_relaxed atomic64_inc_return_relaxed
 #define atomic64_dec_return_relaxed atomic64_dec_return_relaxed
 
-#define atomic64_sub_and_test(a, v)	(atomic64_sub_return((a), (v)) == 0)
-#define atomic64_dec_and_test(v)	(atomic64_dec_return((v)) == 0)
-
 /*
  * Atomically test *v and decrement if it is greater than 0.
  * The function returns the old value of *v minus 1.
diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index d959bbaaad41..68eef0a805ca 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -209,36 +209,6 @@ ATOMIC_OPS(xor, xor, i)
 #undef ATOMIC_FETCH_OP
 #undef ATOMIC_OP_RETURN
 
-/*
- * The extra atomic operations that are constructed from one of the core
- * AMO-based operations above (aside from sub, which is easier to fit above).
- * These are required to perform a full barrier, but they're OK this way
- * because atomic_*_return is also required to perform a full barrier.
- *
- */
-#define ATOMIC_OP(op, func_op, comp_op, I, c_type, prefix)		\
-static __always_inline							\
-bool atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)		\
-{									\
-	return atomic##prefix##_##func_op##_return(i, v) comp_op I;	\
-}
-
-#ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, func_op, comp_op, I)				\
-        ATOMIC_OP(op, func_op, comp_op, I,  int,   )
-#else
-#define ATOMIC_OPS(op, func_op, comp_op, I)				\
-        ATOMIC_OP(op, func_op, comp_op, I,  int,   )			\
-        ATOMIC_OP(op, func_op, comp_op, I, long, 64)
-#endif
-
-ATOMIC_OPS(add_and_test, add, ==, 0)
-ATOMIC_OPS(sub_and_test, sub, ==, 0)
-ATOMIC_OPS(add_negative, add,  <, 0)
-
-#undef ATOMIC_OP
-#undef ATOMIC_OPS
-
 #define ATOMIC_OP(op, func_op, I, c_type, prefix)			\
 static __always_inline							\
 void atomic##prefix##_##op(atomic##prefix##_t *v)			\
@@ -315,22 +285,6 @@ ATOMIC_OPS(dec, add, +, -1)
 #undef ATOMIC_FETCH_OP
 #undef ATOMIC_OP_RETURN
 
-#define ATOMIC_OP(op, func_op, comp_op, I, prefix)			\
-static __always_inline							\
-bool atomic##prefix##_##op(atomic##prefix##_t *v)			\
-{									\
-	return atomic##prefix##_##func_op##_return(v) comp_op I;	\
-}
-
-ATOMIC_OP(inc_and_test, inc, ==, 0,   )
-ATOMIC_OP(dec_and_test, dec, ==, 0,   )
-#ifndef CONFIG_GENERIC_ATOMIC64
-ATOMIC_OP(inc_and_test, inc, ==, 0, 64)
-ATOMIC_OP(dec_and_test, dec, ==, 0, 64)
-#endif
-
-#undef ATOMIC_OP
-
 /* This is required to provide a full barrier on success. */
 static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index eb9329741bad..7f5fbd595f01 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -55,17 +55,13 @@ static inline void atomic_add(int i, atomic_t *v)
 	__atomic_add(i, &v->counter);
 }
 
-#define atomic_add_negative(_i, _v)	(atomic_add_return(_i, _v) < 0)
 #define atomic_inc(_v)			atomic_add(1, _v)
 #define atomic_inc_return(_v)		atomic_add_return(1, _v)
-#define atomic_inc_and_test(_v)		(atomic_add_return(1, _v) == 0)
 #define atomic_sub(_i, _v)		atomic_add(-(int)(_i), _v)
 #define atomic_sub_return(_i, _v)	atomic_add_return(-(int)(_i), _v)
 #define atomic_fetch_sub(_i, _v)	atomic_fetch_add(-(int)(_i), _v)
-#define atomic_sub_and_test(_i, _v)	(atomic_sub_return(_i, _v) == 0)
 #define atomic_dec(_v)			atomic_sub(1, _v)
 #define atomic_dec_return(_v)		atomic_sub_return(1, _v)
-#define atomic_dec_and_test(_v)		(atomic_sub_return(1, _v) == 0)
 
 #define ATOMIC_OPS(op)							\
 static inline void atomic_##op(int i, atomic_t *v)			\
@@ -170,16 +166,12 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
 	return dec;
 }
 
-#define atomic64_add_negative(_i, _v)	(atomic64_add_return(_i, _v) < 0)
 #define atomic64_inc(_v)		atomic64_add(1, _v)
 #define atomic64_inc_return(_v)		atomic64_add_return(1, _v)
-#define atomic64_inc_and_test(_v)	(atomic64_add_return(1, _v) == 0)
 #define atomic64_sub_return(_i, _v)	atomic64_add_return(-(long)(_i), _v)
 #define atomic64_fetch_sub(_i, _v)	atomic64_fetch_add(-(long)(_i), _v)
 #define atomic64_sub(_i, _v)		atomic64_add(-(long)(_i), _v)
-#define atomic64_sub_and_test(_i, _v)	(atomic64_sub_return(_i, _v) == 0)
 #define atomic64_dec(_v)		atomic64_sub(1, _v)
 #define atomic64_dec_return(_v)		atomic64_sub_return(1, _v)
-#define atomic64_dec_and_test(_v)	(atomic64_sub_return(1, _v) == 0)
 
 #endif /* __ARCH_S390_ATOMIC__  */
diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h
index 422fac764ca1..d438494fa112 100644
--- a/arch/sh/include/asm/atomic.h
+++ b/arch/sh/include/asm/atomic.h
@@ -32,12 +32,8 @@
 #include <asm/atomic-irq.h>
 #endif
 
-#define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
 #define atomic_dec_return(v)		atomic_sub_return(1, (v))
 #define atomic_inc_return(v)		atomic_add_return(1, (v))
-#define atomic_inc_and_test(v)		(atomic_inc_return(v) == 0)
-#define atomic_sub_and_test(i,v)	(atomic_sub_return((i), (v)) == 0)
-#define atomic_dec_and_test(v)		(atomic_sub_return(1, (v)) == 0)
 
 #define atomic_inc(v)			atomic_add(1, (v))
 #define atomic_dec(v)			atomic_sub(1, (v))
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index 9d7a15acc0c5..3a26573790c6 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -51,19 +51,4 @@ void atomic_set(atomic_t *, int);
 #define atomic_inc_return(v)	(atomic_add_return(        1, (v)))
 #define atomic_dec_return(v)	(atomic_add_return(       -1, (v)))
 
-#define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
-
-/*
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-#define atomic_inc_and_test(v) (atomic_inc_return(v) == 0)
-
-#define atomic_dec_and_test(v) (atomic_dec_return(v) == 0)
-#define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0)
-
 #endif /* !(__ARCH_SPARC_ATOMIC__) */
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index 458783e99997..634508282aea 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -56,32 +56,12 @@ ATOMIC_OPS(xor)
 #define atomic_inc_return(v)   atomic_add_return(1, v)
 #define atomic64_inc_return(v) atomic64_add_return(1, v)
 
-/*
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-#define atomic_inc_and_test(v) (atomic_inc_return(v) == 0)
-#define atomic64_inc_and_test(v) (atomic64_inc_return(v) == 0)
-
-#define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0)
-#define atomic64_sub_and_test(i, v) (atomic64_sub_return(i, v) == 0)
-
-#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0)
-#define atomic64_dec_and_test(v) (atomic64_sub_return(1, v) == 0)
-
 #define atomic_inc(v) atomic_add(1, v)
 #define atomic64_inc(v) atomic64_add(1, v)
 
 #define atomic_dec(v) atomic_sub(1, v)
 #define atomic64_dec(v) atomic64_sub(1, v)
 
-#define atomic_add_negative(i, v) (atomic_add_return(i, v) < 0)
-#define atomic64_add_negative(i, v) (atomic64_add_return(i, v) < 0)
-
 #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
 
 static inline int atomic_xchg(atomic_t *v, int new)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 616327ac9d39..73bda4abe180 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -80,6 +80,7 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v)
  * true if the result is zero, or false for all
  * other cases.
  */
+#define arch_atomic_sub_and_test arch_atomic_sub_and_test
 static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e);
@@ -117,6 +118,7 @@ static __always_inline void arch_atomic_dec(atomic_t *v)
  * returns true if the result is 0, or false for all other
  * cases.
  */
+#define arch_atomic_dec_and_test arch_atomic_dec_and_test
 static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
 {
 	GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e);
@@ -130,6 +132,7 @@ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
  * and returns true if the result is zero, or false for all
  * other cases.
  */
+#define arch_atomic_inc_and_test arch_atomic_inc_and_test
 static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
 {
 	GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e);
@@ -144,6 +147,7 @@ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
  * if the result is negative, or false when
  * result is greater than or equal to zero.
  */
+#define arch_atomic_add_negative arch_atomic_add_negative
 static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s);
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 2a33cc17801b..a26810d005e0 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -197,20 +197,6 @@ static inline long long arch_atomic64_sub(long long i, atomic64_t *v)
 	return i;
 }
 
-/**
- * arch_atomic64_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer to type atomic64_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-static inline int arch_atomic64_sub_and_test(long long i, atomic64_t *v)
-{
-	return arch_atomic64_sub_return(i, v) == 0;
-}
-
 /**
  * arch_atomic64_inc - increment atomic64 variable
  * @v: pointer to type atomic64_t
@@ -235,46 +221,6 @@ static inline void arch_atomic64_dec(atomic64_t *v)
 			       "S" (v) : "memory", "eax", "ecx", "edx");
 }
 
-/**
- * arch_atomic64_dec_and_test - decrement and test
- * @v: pointer to type atomic64_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-static inline int arch_atomic64_dec_and_test(atomic64_t *v)
-{
-	return arch_atomic64_dec_return(v) == 0;
-}
-
-/**
- * atomic64_inc_and_test - increment and test
- * @v: pointer to type atomic64_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-static inline int arch_atomic64_inc_and_test(atomic64_t *v)
-{
-	return arch_atomic64_inc_return(v) == 0;
-}
-
-/**
- * arch_atomic64_add_negative - add and test if negative
- * @i: integer value to add
- * @v: pointer to type atomic64_t
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-static inline int arch_atomic64_add_negative(long long i, atomic64_t *v)
-{
-	return arch_atomic64_add_return(i, v) < 0;
-}
-
 /**
  * arch_atomic64_add_unless - add unless the number is a given value
  * @v: pointer of type atomic64_t
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 7e04b294e6eb..6a65228a3db6 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -71,6 +71,7 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v)
  * true if the result is zero, or false for all
  * other cases.
  */
+#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test
 static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e);
@@ -110,6 +111,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v)
  * returns true if the result is 0, or false for all other
  * cases.
  */
+#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test
 static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
 {
 	GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e);
@@ -123,6 +125,7 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
  * and returns true if the result is zero, or false for all
  * other cases.
  */
+#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
 static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
 {
 	GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e);
@@ -137,6 +140,7 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
  * if the result is negative, or false when
  * result is greater than or equal to zero.
  */
+#define arch_atomic64_add_negative arch_atomic64_add_negative
 static inline bool arch_atomic64_add_negative(long i, atomic64_t *v)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s);
diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h
index f4c9f82c40c6..332ae4eca737 100644
--- a/arch/xtensa/include/asm/atomic.h
+++ b/arch/xtensa/include/asm/atomic.h
@@ -197,17 +197,6 @@ ATOMIC_OPS(xor)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-/**
- * atomic_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-#define atomic_sub_and_test(i,v) (atomic_sub_return((i),(v)) == 0)
-
 /**
  * atomic_inc - increment atomic variable
  * @v: pointer of type atomic_t
@@ -240,37 +229,6 @@ ATOMIC_OPS(xor)
  */
 #define atomic_dec_return(v) atomic_sub_return(1,(v))
 
-/**
- * atomic_dec_and_test - decrement and test
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-#define atomic_dec_and_test(v) (atomic_sub_return(1,(v)) == 0)
-
-/**
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-#define atomic_inc_and_test(v) (atomic_add_return(1,(v)) == 0)
-
-/**
- * atomic_add_negative - add and test if negative
- * @v: pointer of type atomic_t
- * @i: integer value to add
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-#define atomic_add_negative(i,v) (atomic_add_return((i),(v)) < 0)
-
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h
index 2b487f28ef35..6b64c200de73 100644
--- a/include/asm-generic/atomic-instrumented.h
+++ b/include/asm-generic/atomic-instrumented.h
@@ -225,29 +225,41 @@ static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v)
 	return arch_atomic64_dec_if_positive(v);
 }
 
+#ifdef arch_atomic_dec_and_test
+#define atomic_dec_and_test atomic_dec_and_test
 static __always_inline bool atomic_dec_and_test(atomic_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic_dec_and_test(v);
 }
+#endif
 
+#ifdef arch_atomic64_dec_and_test
+#define atomic64_dec_and_test atomic64_dec_and_test
 static __always_inline bool atomic64_dec_and_test(atomic64_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic64_dec_and_test(v);
 }
+#endif
 
+#ifdef arch_atomic_inc_and_test
+#define atomic_inc_and_test atomic_inc_and_test
 static __always_inline bool atomic_inc_and_test(atomic_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic_inc_and_test(v);
 }
+#endif
 
+#ifdef arch_atomic64_inc_and_test
+#define atomic64_inc_and_test atomic64_inc_and_test
 static __always_inline bool atomic64_inc_and_test(atomic64_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic64_inc_and_test(v);
 }
+#endif
 
 static __always_inline int atomic_add_return(int i, atomic_t *v)
 {
@@ -333,29 +345,41 @@ static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v)
 	return arch_atomic64_fetch_xor(i, v);
 }
 
+#ifdef arch_atomic_sub_and_test
+#define atomic_sub_and_test atomic_sub_and_test
 static __always_inline bool atomic_sub_and_test(int i, atomic_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic_sub_and_test(i, v);
 }
+#endif
 
+#ifdef arch_atomic64_sub_and_test
+#define atomic64_sub_and_test atomic64_sub_and_test
 static __always_inline bool atomic64_sub_and_test(s64 i, atomic64_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic64_sub_and_test(i, v);
 }
+#endif
 
+#ifdef arch_atomic_add_negative
+#define atomic_add_negative atomic_add_negative
 static __always_inline bool atomic_add_negative(int i, atomic_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic_add_negative(i, v);
 }
+#endif
 
+#ifdef arch_atomic64_add_negative
+#define atomic64_add_negative atomic64_add_negative
 static __always_inline bool atomic64_add_negative(s64 i, atomic64_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic64_add_negative(i, v);
 }
+#endif
 
 static __always_inline unsigned long
 cmpxchg_size(volatile void *ptr, unsigned long old, unsigned long new, int size)
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 757e45821220..40cab858aaaa 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -186,11 +186,6 @@ ATOMIC_OP(xor, ^)
 
 #include <linux/irqflags.h>
 
-static inline int atomic_add_negative(int i, atomic_t *v)
-{
-	return atomic_add_return(i, v) < 0;
-}
-
 static inline void atomic_add(int i, atomic_t *v)
 {
 	atomic_add_return(i, v);
@@ -214,10 +209,6 @@ static inline void atomic_dec(atomic_t *v)
 #define atomic_dec_return(v)		atomic_sub_return(1, (v))
 #define atomic_inc_return(v)		atomic_add_return(1, (v))
 
-#define atomic_sub_and_test(i, v)	(atomic_sub_return((i), (v)) == 0)
-#define atomic_dec_and_test(v)		(atomic_dec_return(v) == 0)
-#define atomic_inc_and_test(v)		(atomic_inc_return(v) == 0)
-
 #define atomic_xchg(ptr, v)		(xchg(&(ptr)->counter, (v)))
 #define atomic_cmpxchg(v, old, new)	(cmpxchg(&((v)->counter), (old), (new)))
 
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index 49460107b29a..d3827ab97aa4 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -56,13 +56,9 @@ extern long long atomic64_xchg(atomic64_t *v, long long new);
 extern long long atomic64_fetch_add_unless(atomic64_t *v, long long a, long long u);
 #define atomic64_fetch_add_unless atomic64_fetch_add_unless
 
-#define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
 #define atomic64_inc(v)			atomic64_add(1LL, (v))
 #define atomic64_inc_return(v)		atomic64_add_return(1LL, (v))
-#define atomic64_inc_and_test(v) 	(atomic64_inc_return(v) == 0)
-#define atomic64_sub_and_test(a, v)	(atomic64_sub_return((a), (v)) == 0)
 #define atomic64_dec(v)			atomic64_sub(1LL, (v))
 #define atomic64_dec_return(v)		atomic64_sub_return(1LL, (v))
-#define atomic64_dec_and_test(v)	(atomic64_dec_return((v)) == 0)
 
 #endif  /*  _ASM_GENERIC_ATOMIC64_H  */
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 530562ac7909..3ee8da9023cd 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -569,6 +569,68 @@ static inline bool atomic_add_unless(atomic_t *v, int a, int u)
 #define atomic_inc_not_zero(v)		atomic_add_unless((v), 1, 0)
 #endif
 
+/**
+ * atomic_inc_and_test - increment and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+#ifndef atomic_inc_and_test
+static inline bool atomic_inc_and_test(atomic_t *v)
+{
+	return atomic_inc_return(v) == 0;
+}
+#endif
+
+/**
+ * atomic_dec_and_test - decrement and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+#ifndef atomic_dec_and_test
+static inline bool atomic_dec_and_test(atomic_t *v)
+{
+	return atomic_dec_return(v) == 0;
+}
+#endif
+
+/**
+ * atomic_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+#ifndef atomic_sub_and_test
+static inline bool atomic_sub_and_test(int i, atomic_t *v)
+{
+	return atomic_sub_return(i, v) == 0;
+}
+#endif
+
+/**
+ * atomic_add_negative - add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+#ifndef atomic_add_negative
+static inline bool atomic_add_negative(int i, atomic_t *v)
+{
+	return atomic_add_return(i, v) < 0;
+}
+#endif
+
 #ifndef atomic_andnot
 static inline void atomic_andnot(int i, atomic_t *v)
 {
@@ -1091,6 +1153,68 @@ static inline bool atomic64_add_unless(atomic64_t *v, long long a, long long u)
 #define atomic64_inc_not_zero(v)	atomic64_add_unless((v), 1, 0)
 #endif
 
+/**
+ * atomic64_inc_and_test - increment and test
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+#ifndef atomic64_inc_and_test
+static inline bool atomic64_inc_and_test(atomic64_t *v)
+{
+	return atomic64_inc_return(v) == 0;
+}
+#endif
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+#ifndef atomic64_dec_and_test
+static inline bool atomic64_dec_and_test(atomic64_t *v)
+{
+	return atomic64_dec_return(v) == 0;
+}
+#endif
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+#ifndef atomic64_sub_and_test
+static inline bool atomic64_sub_and_test(long long i, atomic64_t *v)
+{
+	return atomic64_sub_return(i, v) == 0;
+}
+#endif
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically adds @i to @v and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+#ifndef atomic64_add_negative
+static inline bool atomic64_add_negative(long long i, atomic64_t *v)
+{
+	return atomic64_add_return(i, v) < 0;
+}
+#endif
+
 #ifndef atomic64_andnot
 static inline void atomic64_andnot(long long i, atomic64_t *v)
 {
-- 
cgit v1.2.3


From 9837559d8eb01ce834e56fc9a567c1d94ebd3698 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 21 Jun 2018 13:13:19 +0100
Subject: atomics/treewide: Make unconditional inc/dec ops optional

Many of the inc/dec ops are mandatory, but for most architectures inc/dec are
simply trivial wrappers around their corresponding add/sub ops.

Let's make all the inc/dec ops optional, so that we can get rid of these
boilerplate wrappers.

The instrumented atomics are updated accordingly.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Palmer Dabbelt <palmer@sifive.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/lkml/20180621121321.4761-17-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/include/asm/atomic.h           | 12 -----
 arch/arc/include/asm/atomic.h             | 11 -----
 arch/arm/include/asm/atomic.h             | 11 -----
 arch/arm64/include/asm/atomic.h           | 24 ----------
 arch/h8300/include/asm/atomic.h           |  7 ---
 arch/hexagon/include/asm/atomic.h         |  6 ---
 arch/ia64/include/asm/atomic.h            |  9 ----
 arch/m68k/include/asm/atomic.h            |  5 +-
 arch/mips/include/asm/atomic.h            | 38 ----------------
 arch/parisc/include/asm/atomic.h          | 12 -----
 arch/powerpc/include/asm/atomic.h         |  4 ++
 arch/riscv/include/asm/atomic.h           | 76 -------------------------------
 arch/s390/include/asm/atomic.h            |  8 ----
 arch/sh/include/asm/atomic.h              |  6 ---
 arch/sparc/include/asm/atomic_32.h        |  5 --
 arch/sparc/include/asm/atomic_64.h        | 12 -----
 arch/x86/include/asm/atomic.h             |  5 +-
 arch/x86/include/asm/atomic64_32.h        |  4 ++
 arch/x86/include/asm/atomic64_64.h        |  5 +-
 arch/xtensa/include/asm/atomic.h          | 32 -------------
 include/asm-generic/atomic-instrumented.h | 24 ++++++++++
 include/asm-generic/atomic.h              | 13 ------
 include/asm-generic/atomic64.h            |  5 --
 include/linux/atomic.h                    | 48 +++++++++++++++++++
 24 files changed, 86 insertions(+), 296 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index 25f8693c5a42..f6410cb68058 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -297,16 +297,4 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
 	return old - 1;
 }
 
-#define atomic_dec_return(v) atomic_sub_return(1,(v))
-#define atomic64_dec_return(v) atomic64_sub_return(1,(v))
-
-#define atomic_inc_return(v) atomic_add_return(1,(v))
-#define atomic64_inc_return(v) atomic64_add_return(1,(v))
-
-#define atomic_inc(v) atomic_add(1,(v))
-#define atomic64_inc(v) atomic64_add(1,(v))
-
-#define atomic_dec(v) atomic_sub(1,(v))
-#define atomic64_dec(v) atomic64_sub(1,(v))
-
 #endif /* _ALPHA_ATOMIC_H */
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 4222e726f84c..27b95a928c1e 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -308,12 +308,6 @@ ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-#define atomic_inc(v)			atomic_add(1, v)
-#define atomic_dec(v)			atomic_sub(1, v)
-
-#define atomic_inc_return(v)		atomic_add_return(1, (v))
-#define atomic_dec_return(v)		atomic_sub_return(1, (v))
-
 #ifdef CONFIG_GENERIC_ATOMIC64
 
 #include <asm-generic/atomic64.h>
@@ -560,11 +554,6 @@ static inline long long atomic64_fetch_add_unless(atomic64_t *v, long long a,
 }
 #define atomic64_fetch_add_unless atomic64_fetch_add_unless
 
-#define atomic64_inc(v)			atomic64_add(1LL, (v))
-#define atomic64_inc_return(v)		atomic64_add_return(1LL, (v))
-#define atomic64_dec(v)			atomic64_sub(1LL, (v))
-#define atomic64_dec_return(v)		atomic64_sub_return(1LL, (v))
-
 #endif	/* !CONFIG_GENERIC_ATOMIC64 */
 
 #endif	/* !__ASSEMBLY__ */
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 35fb7f504daa..5a58d061d3d2 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -245,12 +245,6 @@ ATOMIC_OPS(xor, ^=, eor)
 
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
-#define atomic_inc(v)		atomic_add(1, v)
-#define atomic_dec(v)		atomic_sub(1, v)
-
-#define atomic_inc_return_relaxed(v)    (atomic_add_return_relaxed(1, v))
-#define atomic_dec_return_relaxed(v)    (atomic_sub_return_relaxed(1, v))
-
 #ifndef CONFIG_GENERIC_ATOMIC64
 typedef struct {
 	long long counter;
@@ -512,11 +506,6 @@ static inline long long atomic64_fetch_add_unless(atomic64_t *v, long long a,
 }
 #define atomic64_fetch_add_unless atomic64_fetch_add_unless
 
-#define atomic64_inc(v)			atomic64_add(1LL, (v))
-#define atomic64_inc_return_relaxed(v)	atomic64_add_return_relaxed(1LL, (v))
-#define atomic64_dec(v)			atomic64_sub(1LL, (v))
-#define atomic64_dec_return_relaxed(v)	atomic64_sub_return_relaxed(1LL, (v))
-
 #endif /* !CONFIG_GENERIC_ATOMIC64 */
 #endif
 #endif
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index edbe53fa3106..078f785cd97f 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -50,21 +50,11 @@
 #define atomic_add_return_release	atomic_add_return_release
 #define atomic_add_return		atomic_add_return
 
-#define atomic_inc_return_relaxed(v)	atomic_add_return_relaxed(1, (v))
-#define atomic_inc_return_acquire(v)	atomic_add_return_acquire(1, (v))
-#define atomic_inc_return_release(v)	atomic_add_return_release(1, (v))
-#define atomic_inc_return(v)		atomic_add_return(1, (v))
-
 #define atomic_sub_return_relaxed	atomic_sub_return_relaxed
 #define atomic_sub_return_acquire	atomic_sub_return_acquire
 #define atomic_sub_return_release	atomic_sub_return_release
 #define atomic_sub_return		atomic_sub_return
 
-#define atomic_dec_return_relaxed(v)	atomic_sub_return_relaxed(1, (v))
-#define atomic_dec_return_acquire(v)	atomic_sub_return_acquire(1, (v))
-#define atomic_dec_return_release(v)	atomic_sub_return_release(1, (v))
-#define atomic_dec_return(v)		atomic_sub_return(1, (v))
-
 #define atomic_fetch_add_relaxed	atomic_fetch_add_relaxed
 #define atomic_fetch_add_acquire	atomic_fetch_add_acquire
 #define atomic_fetch_add_release	atomic_fetch_add_release
@@ -108,8 +98,6 @@
 	cmpxchg_release(&((v)->counter), (old), (new))
 #define atomic_cmpxchg(v, old, new)	cmpxchg(&((v)->counter), (old), (new))
 
-#define atomic_inc(v)			atomic_add(1, (v))
-#define atomic_dec(v)			atomic_sub(1, (v))
 #define atomic_andnot			atomic_andnot
 
 /*
@@ -124,21 +112,11 @@
 #define atomic64_add_return_release	atomic64_add_return_release
 #define atomic64_add_return		atomic64_add_return
 
-#define atomic64_inc_return_relaxed(v)	atomic64_add_return_relaxed(1, (v))
-#define atomic64_inc_return_acquire(v)	atomic64_add_return_acquire(1, (v))
-#define atomic64_inc_return_release(v)	atomic64_add_return_release(1, (v))
-#define atomic64_inc_return(v)		atomic64_add_return(1, (v))
-
 #define atomic64_sub_return_relaxed	atomic64_sub_return_relaxed
 #define atomic64_sub_return_acquire	atomic64_sub_return_acquire
 #define atomic64_sub_return_release	atomic64_sub_return_release
 #define atomic64_sub_return		atomic64_sub_return
 
-#define atomic64_dec_return_relaxed(v)	atomic64_sub_return_relaxed(1, (v))
-#define atomic64_dec_return_acquire(v)	atomic64_sub_return_acquire(1, (v))
-#define atomic64_dec_return_release(v)	atomic64_sub_return_release(1, (v))
-#define atomic64_dec_return(v)		atomic64_sub_return(1, (v))
-
 #define atomic64_fetch_add_relaxed	atomic64_fetch_add_relaxed
 #define atomic64_fetch_add_acquire	atomic64_fetch_add_acquire
 #define atomic64_fetch_add_release	atomic64_fetch_add_release
@@ -179,8 +157,6 @@
 #define atomic64_cmpxchg_release	atomic_cmpxchg_release
 #define atomic64_cmpxchg		atomic_cmpxchg
 
-#define atomic64_inc(v)			atomic64_add(1, (v))
-#define atomic64_dec(v)			atomic64_sub(1, (v))
 #define atomic64_andnot			atomic64_andnot
 
 #endif
diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h
index 8977b5157c8f..c6b6a06231b2 100644
--- a/arch/h8300/include/asm/atomic.h
+++ b/arch/h8300/include/asm/atomic.h
@@ -69,13 +69,6 @@ ATOMIC_OPS(sub, -=)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-#define atomic_inc_return(v)		atomic_add_return(1, v)
-#define atomic_dec_return(v)		atomic_sub_return(1, v)
-
-#define atomic_inc(v)			(void)atomic_inc_return(v)
-
-#define atomic_dec(v)			(void)atomic_dec_return(v)
-
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
 	int ret;
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index 31638f511674..311b9894ccc8 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -198,10 +198,4 @@ static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 }
 #define atomic_fetch_add_unless atomic_fetch_add_unless
 
-#define atomic_inc(v) atomic_add(1, (v))
-#define atomic_dec(v) atomic_sub(1, (v))
-
-#define atomic_inc_return(v) (atomic_add_return(1, v))
-#define atomic_dec_return(v) (atomic_sub_return(1, v))
-
 #endif
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index e4143c462e65..46a15a974bed 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -231,19 +231,10 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
 	return dec;
 }
 
-#define atomic_dec_return(v)		atomic_sub_return(1, (v))
-#define atomic_inc_return(v)		atomic_add_return(1, (v))
-#define atomic64_dec_return(v)		atomic64_sub_return(1, (v))
-#define atomic64_inc_return(v)		atomic64_add_return(1, (v))
-
 #define atomic_add(i,v)			(void)atomic_add_return((i), (v))
 #define atomic_sub(i,v)			(void)atomic_sub_return((i), (v))
-#define atomic_inc(v)			atomic_add(1, (v))
-#define atomic_dec(v)			atomic_sub(1, (v))
 
 #define atomic64_add(i,v)		(void)atomic64_add_return((i), (v))
 #define atomic64_sub(i,v)		(void)atomic64_sub_return((i), (v))
-#define atomic64_inc(v)			atomic64_add(1, (v))
-#define atomic64_dec(v)			atomic64_sub(1, (v))
 
 #endif /* _ASM_IA64_ATOMIC_H */
diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h
index 9df09c876fa2..47228b0d4163 100644
--- a/arch/m68k/include/asm/atomic.h
+++ b/arch/m68k/include/asm/atomic.h
@@ -126,11 +126,13 @@ static inline void atomic_inc(atomic_t *v)
 {
 	__asm__ __volatile__("addql #1,%0" : "+m" (*v));
 }
+#define atomic_inc atomic_inc
 
 static inline void atomic_dec(atomic_t *v)
 {
 	__asm__ __volatile__("subql #1,%0" : "+m" (*v));
 }
+#define atomic_dec atomic_dec
 
 static inline int atomic_dec_and_test(atomic_t *v)
 {
@@ -192,9 +194,6 @@ static inline int atomic_xchg(atomic_t *v, int new)
 
 #endif /* !CONFIG_RMW_INSNS */
 
-#define atomic_dec_return(v)	atomic_sub_return(1, (v))
-#define atomic_inc_return(v)	atomic_add_return(1, (v))
-
 static inline int atomic_sub_and_test(int i, atomic_t *v)
 {
 	char c;
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index fd3008ae164c..79be687de4ab 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -274,31 +274,12 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v)
 #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
 #define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
 
-#define atomic_dec_return(v) atomic_sub_return(1, (v))
-#define atomic_inc_return(v) atomic_add_return(1, (v))
-
 /*
  * atomic_dec_if_positive - decrement by 1 if old value positive
  * @v: pointer of type atomic_t
  */
 #define atomic_dec_if_positive(v)	atomic_sub_if_positive(1, v)
 
-/*
- * atomic_inc - increment atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1.
- */
-#define atomic_inc(v) atomic_add(1, (v))
-
-/*
- * atomic_dec - decrement and test
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1.
- */
-#define atomic_dec(v) atomic_sub(1, (v))
-
 #ifdef CONFIG_64BIT
 
 #define ATOMIC64_INIT(i)    { (i) }
@@ -554,31 +535,12 @@ static __inline__ long atomic64_sub_if_positive(long i, atomic64_t * v)
 	((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n)))
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), (new)))
 
-#define atomic64_dec_return(v) atomic64_sub_return(1, (v))
-#define atomic64_inc_return(v) atomic64_add_return(1, (v))
-
 /*
  * atomic64_dec_if_positive - decrement by 1 if old value positive
  * @v: pointer of type atomic64_t
  */
 #define atomic64_dec_if_positive(v)	atomic64_sub_if_positive(1, v)
 
-/*
- * atomic64_inc - increment atomic variable
- * @v: pointer of type atomic64_t
- *
- * Atomically increments @v by 1.
- */
-#define atomic64_inc(v) atomic64_add(1, (v))
-
-/*
- * atomic64_dec - decrement and test
- * @v: pointer of type atomic64_t
- *
- * Atomically decrements @v by 1.
- */
-#define atomic64_dec(v) atomic64_sub(1, (v))
-
 #endif /* CONFIG_64BIT */
 
 #endif /* _ASM_ATOMIC_H */
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index f85844ff6336..10bc490327c1 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -136,12 +136,6 @@ ATOMIC_OPS(xor, ^=)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-#define atomic_inc(v)	(atomic_add(   1,(v)))
-#define atomic_dec(v)	(atomic_add(  -1,(v)))
-
-#define atomic_inc_return(v)	(atomic_add_return(   1,(v)))
-#define atomic_dec_return(v)	(atomic_add_return(  -1,(v)))
-
 #define ATOMIC_INIT(i)	{ (i) }
 
 #ifdef CONFIG_64BIT
@@ -224,12 +218,6 @@ atomic64_read(const atomic64_t *v)
 	return READ_ONCE((v)->counter);
 }
 
-#define atomic64_inc(v)		(atomic64_add(   1,(v)))
-#define atomic64_dec(v)		(atomic64_add(  -1,(v)))
-
-#define atomic64_inc_return(v)		(atomic64_add_return(   1,(v)))
-#define atomic64_dec_return(v)		(atomic64_add_return(  -1,(v)))
-
 /* exported interface */
 #define atomic64_cmpxchg(v, o, n) \
 	((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n)))
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 5d76f05d2be3..ebaefdee4a57 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -143,6 +143,7 @@ static __inline__ void atomic_inc(atomic_t *v)
 	: "r" (&v->counter)
 	: "cc", "xer");
 }
+#define atomic_inc atomic_inc
 
 static __inline__ int atomic_inc_return_relaxed(atomic_t *v)
 {
@@ -175,6 +176,7 @@ static __inline__ void atomic_dec(atomic_t *v)
 	: "r" (&v->counter)
 	: "cc", "xer");
 }
+#define atomic_dec atomic_dec
 
 static __inline__ int atomic_dec_return_relaxed(atomic_t *v)
 {
@@ -411,6 +413,7 @@ static __inline__ void atomic64_inc(atomic64_t *v)
 	: "r" (&v->counter)
 	: "cc", "xer");
 }
+#define atomic64_inc atomic64_inc
 
 static __inline__ long atomic64_inc_return_relaxed(atomic64_t *v)
 {
@@ -441,6 +444,7 @@ static __inline__ void atomic64_dec(atomic64_t *v)
 	: "r" (&v->counter)
 	: "cc", "xer");
 }
+#define atomic64_dec atomic64_dec
 
 static __inline__ long atomic64_dec_return_relaxed(atomic64_t *v)
 {
diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index 68eef0a805ca..512b89485790 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -209,82 +209,6 @@ ATOMIC_OPS(xor, xor, i)
 #undef ATOMIC_FETCH_OP
 #undef ATOMIC_OP_RETURN
 
-#define ATOMIC_OP(op, func_op, I, c_type, prefix)			\
-static __always_inline							\
-void atomic##prefix##_##op(atomic##prefix##_t *v)			\
-{									\
-	atomic##prefix##_##func_op(I, v);				\
-}
-
-#define ATOMIC_FETCH_OP(op, func_op, I, c_type, prefix)			\
-static __always_inline							\
-c_type atomic##prefix##_fetch_##op##_relaxed(atomic##prefix##_t *v)	\
-{									\
-	return atomic##prefix##_fetch_##func_op##_relaxed(I, v);	\
-}									\
-static __always_inline							\
-c_type atomic##prefix##_fetch_##op(atomic##prefix##_t *v)		\
-{									\
-	return atomic##prefix##_fetch_##func_op(I, v);			\
-}
-
-#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, c_type, prefix)		\
-static __always_inline							\
-c_type atomic##prefix##_##op##_return_relaxed(atomic##prefix##_t *v)	\
-{									\
-        return atomic##prefix##_fetch_##op##_relaxed(v) c_op I;		\
-}									\
-static __always_inline							\
-c_type atomic##prefix##_##op##_return(atomic##prefix##_t *v)		\
-{									\
-        return atomic##prefix##_fetch_##op(v) c_op I;			\
-}
-
-#ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, asm_op, c_op, I)					\
-        ATOMIC_OP(       op, asm_op,       I,  int,   )			\
-        ATOMIC_FETCH_OP( op, asm_op,       I,  int,   )			\
-        ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )
-#else
-#define ATOMIC_OPS(op, asm_op, c_op, I)					\
-        ATOMIC_OP(       op, asm_op,       I,  int,   )			\
-        ATOMIC_FETCH_OP( op, asm_op,       I,  int,   )			\
-        ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )			\
-        ATOMIC_OP(       op, asm_op,       I, long, 64)			\
-        ATOMIC_FETCH_OP( op, asm_op,       I, long, 64)			\
-        ATOMIC_OP_RETURN(op, asm_op, c_op, I, long, 64)
-#endif
-
-ATOMIC_OPS(inc, add, +,  1)
-ATOMIC_OPS(dec, add, +, -1)
-
-#define atomic_inc_return_relaxed	atomic_inc_return_relaxed
-#define atomic_dec_return_relaxed	atomic_dec_return_relaxed
-#define atomic_inc_return		atomic_inc_return
-#define atomic_dec_return		atomic_dec_return
-
-#define atomic_fetch_inc_relaxed	atomic_fetch_inc_relaxed
-#define atomic_fetch_dec_relaxed	atomic_fetch_dec_relaxed
-#define atomic_fetch_inc		atomic_fetch_inc
-#define atomic_fetch_dec		atomic_fetch_dec
-
-#ifndef CONFIG_GENERIC_ATOMIC64
-#define atomic64_inc_return_relaxed	atomic64_inc_return_relaxed
-#define atomic64_dec_return_relaxed	atomic64_dec_return_relaxed
-#define atomic64_inc_return		atomic64_inc_return
-#define atomic64_dec_return		atomic64_dec_return
-
-#define atomic64_fetch_inc_relaxed	atomic64_fetch_inc_relaxed
-#define atomic64_fetch_dec_relaxed	atomic64_fetch_dec_relaxed
-#define atomic64_fetch_inc		atomic64_fetch_inc
-#define atomic64_fetch_dec		atomic64_fetch_dec
-#endif
-
-#undef ATOMIC_OPS
-#undef ATOMIC_OP
-#undef ATOMIC_FETCH_OP
-#undef ATOMIC_OP_RETURN
-
 /* This is required to provide a full barrier on success. */
 static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index 7f5fbd595f01..376e64af951f 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -55,13 +55,9 @@ static inline void atomic_add(int i, atomic_t *v)
 	__atomic_add(i, &v->counter);
 }
 
-#define atomic_inc(_v)			atomic_add(1, _v)
-#define atomic_inc_return(_v)		atomic_add_return(1, _v)
 #define atomic_sub(_i, _v)		atomic_add(-(int)(_i), _v)
 #define atomic_sub_return(_i, _v)	atomic_add_return(-(int)(_i), _v)
 #define atomic_fetch_sub(_i, _v)	atomic_fetch_add(-(int)(_i), _v)
-#define atomic_dec(_v)			atomic_sub(1, _v)
-#define atomic_dec_return(_v)		atomic_sub_return(1, _v)
 
 #define ATOMIC_OPS(op)							\
 static inline void atomic_##op(int i, atomic_t *v)			\
@@ -166,12 +162,8 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
 	return dec;
 }
 
-#define atomic64_inc(_v)		atomic64_add(1, _v)
-#define atomic64_inc_return(_v)		atomic64_add_return(1, _v)
 #define atomic64_sub_return(_i, _v)	atomic64_add_return(-(long)(_i), _v)
 #define atomic64_fetch_sub(_i, _v)	atomic64_fetch_add(-(long)(_i), _v)
 #define atomic64_sub(_i, _v)		atomic64_add(-(long)(_i), _v)
-#define atomic64_dec(_v)		atomic64_sub(1, _v)
-#define atomic64_dec_return(_v)		atomic64_sub_return(1, _v)
 
 #endif /* __ARCH_S390_ATOMIC__  */
diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h
index d438494fa112..f37b95a80232 100644
--- a/arch/sh/include/asm/atomic.h
+++ b/arch/sh/include/asm/atomic.h
@@ -32,12 +32,6 @@
 #include <asm/atomic-irq.h>
 #endif
 
-#define atomic_dec_return(v)		atomic_sub_return(1, (v))
-#define atomic_inc_return(v)		atomic_add_return(1, (v))
-
-#define atomic_inc(v)			atomic_add(1, (v))
-#define atomic_dec(v)			atomic_sub(1, (v))
-
 #define atomic_xchg(v, new)		(xchg(&((v)->counter), new))
 #define atomic_cmpxchg(v, o, n)		(cmpxchg(&((v)->counter), (o), (n)))
 
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index 3a26573790c6..94c930f0bc62 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -38,8 +38,6 @@ void atomic_set(atomic_t *, int);
 
 #define atomic_add(i, v)	((void)atomic_add_return( (int)(i), (v)))
 #define atomic_sub(i, v)	((void)atomic_add_return(-(int)(i), (v)))
-#define atomic_inc(v)		((void)atomic_add_return(        1, (v)))
-#define atomic_dec(v)		((void)atomic_add_return(       -1, (v)))
 
 #define atomic_and(i, v)	((void)atomic_fetch_and((i), (v)))
 #define atomic_or(i, v)		((void)atomic_fetch_or((i), (v)))
@@ -48,7 +46,4 @@ void atomic_set(atomic_t *, int);
 #define atomic_sub_return(i, v)	(atomic_add_return(-(int)(i), (v)))
 #define atomic_fetch_sub(i, v)  (atomic_fetch_add (-(int)(i), (v)))
 
-#define atomic_inc_return(v)	(atomic_add_return(        1, (v)))
-#define atomic_dec_return(v)	(atomic_add_return(       -1, (v)))
-
 #endif /* !(__ARCH_SPARC_ATOMIC__) */
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index 634508282aea..304865c7cdbb 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -50,18 +50,6 @@ ATOMIC_OPS(xor)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-#define atomic_dec_return(v)   atomic_sub_return(1, v)
-#define atomic64_dec_return(v) atomic64_sub_return(1, v)
-
-#define atomic_inc_return(v)   atomic_add_return(1, v)
-#define atomic64_inc_return(v) atomic64_add_return(1, v)
-
-#define atomic_inc(v) atomic_add(1, v)
-#define atomic64_inc(v) atomic64_add(1, v)
-
-#define atomic_dec(v) atomic_sub(1, v)
-#define atomic64_dec(v) atomic64_sub(1, v)
-
 #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
 
 static inline int atomic_xchg(atomic_t *v, int new)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 73bda4abe180..823fd2f320cf 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -92,6 +92,7 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
  *
  * Atomically increments @v by 1.
  */
+#define arch_atomic_inc arch_atomic_inc
 static __always_inline void arch_atomic_inc(atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "incl %0"
@@ -104,6 +105,7 @@ static __always_inline void arch_atomic_inc(atomic_t *v)
  *
  * Atomically decrements @v by 1.
  */
+#define arch_atomic_dec arch_atomic_dec
 static __always_inline void arch_atomic_dec(atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "decl %0"
@@ -177,9 +179,6 @@ static __always_inline int arch_atomic_sub_return(int i, atomic_t *v)
 	return arch_atomic_add_return(-i, v);
 }
 
-#define arch_atomic_inc_return(v)  (arch_atomic_add_return(1, v))
-#define arch_atomic_dec_return(v)  (arch_atomic_sub_return(1, v))
-
 static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
 {
 	return xadd(&v->counter, i);
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index a26810d005e0..472c7af0ed48 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -158,6 +158,7 @@ static inline long long arch_atomic64_inc_return(atomic64_t *v)
 			     "S" (v) : "memory", "ecx");
 	return a;
 }
+#define arch_atomic64_inc_return arch_atomic64_inc_return
 
 static inline long long arch_atomic64_dec_return(atomic64_t *v)
 {
@@ -166,6 +167,7 @@ static inline long long arch_atomic64_dec_return(atomic64_t *v)
 			     "S" (v) : "memory", "ecx");
 	return a;
 }
+#define arch_atomic64_dec_return arch_atomic64_dec_return
 
 /**
  * arch_atomic64_add - add integer to atomic64 variable
@@ -203,6 +205,7 @@ static inline long long arch_atomic64_sub(long long i, atomic64_t *v)
  *
  * Atomically increments @v by 1.
  */
+#define arch_atomic64_inc arch_atomic64_inc
 static inline void arch_atomic64_inc(atomic64_t *v)
 {
 	__alternative_atomic64(inc, inc_return, /* no output */,
@@ -215,6 +218,7 @@ static inline void arch_atomic64_inc(atomic64_t *v)
  *
  * Atomically decrements @v by 1.
  */
+#define arch_atomic64_dec arch_atomic64_dec
 static inline void arch_atomic64_dec(atomic64_t *v)
 {
 	__alternative_atomic64(dec, dec_return, /* no output */,
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 6a65228a3db6..1b282272a801 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -83,6 +83,7 @@ static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v)
  *
  * Atomically increments @v by 1.
  */
+#define arch_atomic64_inc arch_atomic64_inc
 static __always_inline void arch_atomic64_inc(atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "incq %0"
@@ -96,6 +97,7 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v)
  *
  * Atomically decrements @v by 1.
  */
+#define arch_atomic64_dec arch_atomic64_dec
 static __always_inline void arch_atomic64_dec(atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "decq %0"
@@ -173,9 +175,6 @@ static inline long arch_atomic64_fetch_sub(long i, atomic64_t *v)
 	return xadd(&v->counter, -i);
 }
 
-#define arch_atomic64_inc_return(v)  (arch_atomic64_add_return(1, (v)))
-#define arch_atomic64_dec_return(v)  (arch_atomic64_sub_return(1, (v)))
-
 static inline long arch_atomic64_cmpxchg(atomic64_t *v, long old, long new)
 {
 	return arch_cmpxchg(&v->counter, old, new);
diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h
index 332ae4eca737..7de0149e1cf7 100644
--- a/arch/xtensa/include/asm/atomic.h
+++ b/arch/xtensa/include/asm/atomic.h
@@ -197,38 +197,6 @@ ATOMIC_OPS(xor)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-/**
- * atomic_inc - increment atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1.
- */
-#define atomic_inc(v) atomic_add(1,(v))
-
-/**
- * atomic_inc - increment atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1.
- */
-#define atomic_inc_return(v) atomic_add_return(1,(v))
-
-/**
- * atomic_dec - decrement atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1.
- */
-#define atomic_dec(v) atomic_sub(1,(v))
-
-/**
- * atomic_dec_return - decrement atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1.
- */
-#define atomic_dec_return(v) atomic_sub_return(1,(v))
-
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h
index 6b64c200de73..12f9634750d7 100644
--- a/include/asm-generic/atomic-instrumented.h
+++ b/include/asm-generic/atomic-instrumented.h
@@ -102,29 +102,41 @@ static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u
 }
 #endif
 
+#ifdef arch_atomic_inc
+#define atomic_inc atomic_inc
 static __always_inline void atomic_inc(atomic_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	arch_atomic_inc(v);
 }
+#endif
 
+#ifdef arch_atomic64_inc
+#define atomic64_inc atomic64_inc
 static __always_inline void atomic64_inc(atomic64_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	arch_atomic64_inc(v);
 }
+#endif
 
+#ifdef arch_atomic_dec
+#define atomic_dec atomic_dec
 static __always_inline void atomic_dec(atomic_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	arch_atomic_dec(v);
 }
+#endif
 
+#ifdef atch_atomic64_dec
+#define atomic64_dec
 static __always_inline void atomic64_dec(atomic64_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	arch_atomic64_dec(v);
 }
+#endif
 
 static __always_inline void atomic_add(int i, atomic_t *v)
 {
@@ -186,29 +198,41 @@ static __always_inline void atomic64_xor(s64 i, atomic64_t *v)
 	arch_atomic64_xor(i, v);
 }
 
+#ifdef arch_atomic_inc_return
+#define atomic_inc_return atomic_inc_return
 static __always_inline int atomic_inc_return(atomic_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic_inc_return(v);
 }
+#endif
 
+#ifdef arch_atomic64_in_return
+#define atomic64_inc_return atomic64_inc_return
 static __always_inline s64 atomic64_inc_return(atomic64_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic64_inc_return(v);
 }
+#endif
 
+#ifdef arch_atomic_dec_return
+#define atomic_dec_return atomic_dec_return
 static __always_inline int atomic_dec_return(atomic_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic_dec_return(v);
 }
+#endif
 
+#ifdef arch_atomic64_dec_return
+#define atomic64_dec_return atomic64_dec_return
 static __always_inline s64 atomic64_dec_return(atomic64_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic64_dec_return(v);
 }
+#endif
 
 #ifdef arch_atomic64_inc_not_zero
 #define atomic64_inc_not_zero atomic64_inc_not_zero
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 40cab858aaaa..13324aa828eb 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -196,19 +196,6 @@ static inline void atomic_sub(int i, atomic_t *v)
 	atomic_sub_return(i, v);
 }
 
-static inline void atomic_inc(atomic_t *v)
-{
-	atomic_add_return(1, v);
-}
-
-static inline void atomic_dec(atomic_t *v)
-{
-	atomic_sub_return(1, v);
-}
-
-#define atomic_dec_return(v)		atomic_sub_return(1, (v))
-#define atomic_inc_return(v)		atomic_add_return(1, (v))
-
 #define atomic_xchg(ptr, v)		(xchg(&(ptr)->counter, (v)))
 #define atomic_cmpxchg(v, old, new)	(cmpxchg(&((v)->counter), (old), (new)))
 
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index d3827ab97aa4..242b79ae0b57 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -56,9 +56,4 @@ extern long long atomic64_xchg(atomic64_t *v, long long new);
 extern long long atomic64_fetch_add_unless(atomic64_t *v, long long a, long long u);
 #define atomic64_fetch_add_unless atomic64_fetch_add_unless
 
-#define atomic64_inc(v)			atomic64_add(1LL, (v))
-#define atomic64_inc_return(v)		atomic64_add_return(1LL, (v))
-#define atomic64_dec(v)			atomic64_sub(1LL, (v))
-#define atomic64_dec_return(v)		atomic64_sub_return(1LL, (v))
-
 #endif  /*  _ASM_GENERIC_ATOMIC64_H  */
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 3ee8da9023cd..24f345df7ba6 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -97,11 +97,23 @@
 #endif
 #endif /* atomic_add_return_relaxed */
 
+#ifndef atomic_inc
+#define atomic_inc(v)			atomic_add(1, (v))
+#endif
+
 /* atomic_inc_return_relaxed */
 #ifndef atomic_inc_return_relaxed
+
+#ifndef atomic_inc_return
+#define atomic_inc_return(v)		atomic_add_return(1, (v))
+#define atomic_inc_return_relaxed(v)	atomic_add_return_relaxed(1, (v))
+#define atomic_inc_return_acquire(v)	atomic_add_return_acquire(1, (v))
+#define atomic_inc_return_release(v)	atomic_add_return_release(1, (v))
+#else /* atomic_inc_return */
 #define  atomic_inc_return_relaxed	atomic_inc_return
 #define  atomic_inc_return_acquire	atomic_inc_return
 #define  atomic_inc_return_release	atomic_inc_return
+#endif /* atomic_inc_return */
 
 #else /* atomic_inc_return_relaxed */
 
@@ -145,11 +157,23 @@
 #endif
 #endif /* atomic_sub_return_relaxed */
 
+#ifndef atomic_dec
+#define atomic_dec(v)			atomic_sub(1, (v))
+#endif
+
 /* atomic_dec_return_relaxed */
 #ifndef atomic_dec_return_relaxed
+
+#ifndef atomic_dec_return
+#define atomic_dec_return(v)		atomic_sub_return(1, (v))
+#define atomic_dec_return_relaxed(v)	atomic_sub_return_relaxed(1, (v))
+#define atomic_dec_return_acquire(v)	atomic_sub_return_acquire(1, (v))
+#define atomic_dec_return_release(v)	atomic_sub_return_release(1, (v))
+#else /* atomic_dec_return */
 #define  atomic_dec_return_relaxed	atomic_dec_return
 #define  atomic_dec_return_acquire	atomic_dec_return
 #define  atomic_dec_return_release	atomic_dec_return
+#endif /* atomic_dec_return */
 
 #else /* atomic_dec_return_relaxed */
 
@@ -748,11 +772,23 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 #endif
 #endif /* atomic64_add_return_relaxed */
 
+#ifndef atomic64_inc
+#define atomic64_inc(v)			atomic64_add(1, (v))
+#endif
+
 /* atomic64_inc_return_relaxed */
 #ifndef atomic64_inc_return_relaxed
+
+#ifndef atomic64_inc_return
+#define atomic64_inc_return(v)		atomic64_add_return(1, (v))
+#define atomic64_inc_return_relaxed(v)	atomic64_add_return_relaxed(1, (v))
+#define atomic64_inc_return_acquire(v)	atomic64_add_return_acquire(1, (v))
+#define atomic64_inc_return_release(v)	atomic64_add_return_release(1, (v))
+#else /* atomic64_inc_return */
 #define  atomic64_inc_return_relaxed	atomic64_inc_return
 #define  atomic64_inc_return_acquire	atomic64_inc_return
 #define  atomic64_inc_return_release	atomic64_inc_return
+#endif /* atomic64_inc_return */
 
 #else /* atomic64_inc_return_relaxed */
 
@@ -797,11 +833,23 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 #endif
 #endif /* atomic64_sub_return_relaxed */
 
+#ifndef atomic64_dec
+#define atomic64_dec(v)			atomic64_sub(1, (v))
+#endif
+
 /* atomic64_dec_return_relaxed */
 #ifndef atomic64_dec_return_relaxed
+
+#ifndef atomic64_dec_return
+#define atomic64_dec_return(v)		atomic64_sub_return(1, (v))
+#define atomic64_dec_return_relaxed(v)	atomic64_sub_return_relaxed(1, (v))
+#define atomic64_dec_return_acquire(v)	atomic64_sub_return_acquire(1, (v))
+#define atomic64_dec_return_release(v)	atomic64_sub_return_release(1, (v))
+#else /* atomic64_dec_return */
 #define  atomic64_dec_return_relaxed	atomic64_dec_return
 #define  atomic64_dec_return_acquire	atomic64_dec_return
 #define  atomic64_dec_return_release	atomic64_dec_return
+#endif /* atomic64_dec_return */
 
 #else /* atomic64_dec_return_relaxed */
 
-- 
cgit v1.2.3


From b3a2a05f9111de0b79312e577608a27b0318c0a1 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 21 Jun 2018 13:13:20 +0100
Subject: atomics/treewide: Make conditional inc/dec ops optional

The conditional inc/dec ops differ for atomic_t and atomic64_t:

- atomic_inc_unless_positive() is optional for atomic_t, and doesn't exist for atomic64_t.
- atomic_dec_unless_negative() is optional for atomic_t, and doesn't exist for atomic64_t.
- atomic_dec_if_positive is optional for atomic_t, and is mandatory for atomic64_t.

Let's make these consistently optional for both. At the same time, let's
clean up the existing fallbacks to use atomic_try_cmpxchg().

The instrumented atomics are updated accordingly.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/lkml/20180621121321.4761-18-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/include/asm/atomic.h           |  1 +
 arch/arc/include/asm/atomic.h             |  1 +
 arch/arm/include/asm/atomic.h             |  1 +
 arch/arm64/include/asm/atomic.h           |  2 +
 arch/ia64/include/asm/atomic.h            | 16 -----
 arch/parisc/include/asm/atomic.h          | 23 --------
 arch/powerpc/include/asm/atomic.h         |  1 +
 arch/s390/include/asm/atomic.h            | 17 ------
 arch/sparc/include/asm/atomic_64.h        |  1 +
 arch/x86/include/asm/atomic64_32.h        |  1 +
 arch/x86/include/asm/atomic64_64.h        | 18 ------
 include/asm-generic/atomic-instrumented.h |  3 +
 include/asm-generic/atomic64.h            |  1 +
 include/linux/atomic.h                    | 97 +++++++++++++++++++++++--------
 14 files changed, 85 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index f6410cb68058..4a6a8f58c9c9 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -296,5 +296,6 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
 	smp_mb();
 	return old - 1;
 }
+#define atomic64_dec_if_positive atomic64_dec_if_positive
 
 #endif /* _ALPHA_ATOMIC_H */
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 27b95a928c1e..8f64f3b79b8a 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -517,6 +517,7 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
 
 	return val;
 }
+#define atomic64_dec_if_positive atomic64_dec_if_positive
 
 /**
  * atomic64_fetch_add_unless - add unless the number is a given value
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 5a58d061d3d2..884c241424fe 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -474,6 +474,7 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
 
 	return result;
 }
+#define atomic64_dec_if_positive atomic64_dec_if_positive
 
 static inline long long atomic64_fetch_add_unless(atomic64_t *v, long long a,
 						  long long u)
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 078f785cd97f..9bca54dda75c 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -159,5 +159,7 @@
 
 #define atomic64_andnot			atomic64_andnot
 
+#define atomic64_dec_if_positive	atomic64_dec_if_positive
+
 #endif
 #endif
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index 46a15a974bed..206530d0751b 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -215,22 +215,6 @@ ATOMIC64_FETCH_OP(xor, ^)
 	(cmpxchg(&((v)->counter), old, new))
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
 
-static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
-{
-	long c, old, dec;
-	c = atomic64_read(v);
-	for (;;) {
-		dec = c - 1;
-		if (unlikely(dec < 0))
-			break;
-		old = atomic64_cmpxchg((v), c, dec);
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return dec;
-}
-
 #define atomic_add(i,v)			(void)atomic_add_return((i), (v))
 #define atomic_sub(i,v)			(void)atomic_sub_return((i), (v))
 
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 10bc490327c1..118953d41763 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -223,29 +223,6 @@ atomic64_read(const atomic64_t *v)
 	((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n)))
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
 
-/*
- * atomic64_dec_if_positive - decrement by 1 if old value positive
- * @v: pointer of type atomic_t
- *
- * The function returns the old value of *v minus 1, even if
- * the atomic variable, v, was not decremented.
- */
-static inline long atomic64_dec_if_positive(atomic64_t *v)
-{
-	long c, old, dec;
-	c = atomic64_read(v);
-	for (;;) {
-		dec = c - 1;
-		if (unlikely(dec < 0))
-			break;
-		old = atomic64_cmpxchg((v), c, dec);
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return dec;
-}
-
 #endif /* !CONFIG_64BIT */
 
 
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index ebaefdee4a57..a0156cb43d1f 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -488,6 +488,7 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
 
 	return t;
 }
+#define atomic64_dec_if_positive atomic64_dec_if_positive
 
 #define atomic64_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
 #define atomic64_cmpxchg_relaxed(v, o, n) \
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index 376e64af951f..fd20ab5d4cf7 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -145,23 +145,6 @@ ATOMIC64_OPS(xor)
 
 #undef ATOMIC64_OPS
 
-static inline long atomic64_dec_if_positive(atomic64_t *v)
-{
-	long c, old, dec;
-
-	c = atomic64_read(v);
-	for (;;) {
-		dec = c - 1;
-		if (unlikely(dec < 0))
-			break;
-		old = atomic64_cmpxchg((v), c, dec);
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return dec;
-}
-
 #define atomic64_sub_return(_i, _v)	atomic64_add_return(-(long)(_i), _v)
 #define atomic64_fetch_sub(_i, _v)	atomic64_fetch_add(-(long)(_i), _v)
 #define atomic64_sub(_i, _v)		atomic64_add(-(long)(_i), _v)
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index 304865c7cdbb..6963482c81d8 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -62,5 +62,6 @@ static inline int atomic_xchg(atomic_t *v, int new)
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
 
 long atomic64_dec_if_positive(atomic64_t *v);
+#define atomic64_dec_if_positive atomic64_dec_if_positive
 
 #endif /* !(__ARCH_SPARC64_ATOMIC__) */
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 472c7af0ed48..ef959f02d070 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -254,6 +254,7 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
 	return r;
 }
 
+#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
 static inline long long arch_atomic64_dec_if_positive(atomic64_t *v)
 {
 	long long r;
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 1b282272a801..849f1c566a11 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -191,24 +191,6 @@ static inline long arch_atomic64_xchg(atomic64_t *v, long new)
 	return xchg(&v->counter, new);
 }
 
-/*
- * arch_atomic64_dec_if_positive - decrement by 1 if old value positive
- * @v: pointer of type atomic_t
- *
- * The function returns the old value of *v minus 1, even if
- * the atomic variable, v, was not decremented.
- */
-static inline long arch_atomic64_dec_if_positive(atomic64_t *v)
-{
-	s64 dec, c = arch_atomic64_read(v);
-	do {
-		dec = c - 1;
-		if (unlikely(dec < 0))
-			break;
-	} while (!arch_atomic64_try_cmpxchg(v, &c, dec));
-	return dec;
-}
-
 static inline void arch_atomic64_and(long i, atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "andq %1,%0"
diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h
index 12f9634750d7..3c64e95d5ed0 100644
--- a/include/asm-generic/atomic-instrumented.h
+++ b/include/asm-generic/atomic-instrumented.h
@@ -243,11 +243,14 @@ static __always_inline bool atomic64_inc_not_zero(atomic64_t *v)
 }
 #endif
 
+#ifdef arch_atomic64_dec_if_positive
+#define atomic64_dec_if_positive atomic64_dec_if_positive
 static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v)
 {
 	kasan_check_write(v, sizeof(*v));
 	return arch_atomic64_dec_if_positive(v);
 }
+#endif
 
 #ifdef arch_atomic_dec_and_test
 #define atomic_dec_and_test atomic_dec_and_test
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index 242b79ae0b57..97b28b7f1f29 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -51,6 +51,7 @@ ATOMIC64_OPS(xor)
 #undef ATOMIC64_OP
 
 extern long long atomic64_dec_if_positive(atomic64_t *v);
+#define atomic64_dec_if_positive atomic64_dec_if_positive
 extern long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n);
 extern long long atomic64_xchg(atomic64_t *v, long long new);
 extern long long atomic64_fetch_add_unless(atomic64_t *v, long long a, long long u);
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 24f345df7ba6..93fe5b4041e1 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -683,28 +683,30 @@ static inline int atomic_fetch_andnot_release(int i, atomic_t *v)
 #endif
 
 #ifndef atomic_inc_unless_negative
-static inline bool atomic_inc_unless_negative(atomic_t *p)
+static inline bool atomic_inc_unless_negative(atomic_t *v)
 {
-	int v, v1;
-	for (v = 0; v >= 0; v = v1) {
-		v1 = atomic_cmpxchg(p, v, v + 1);
-		if (likely(v1 == v))
-			return true;
-	}
-	return false;
+	int c = atomic_read(v);
+
+	do {
+		if (unlikely(c < 0))
+			return false;
+	} while (!atomic_try_cmpxchg(v, &c, c + 1));
+
+	return true;
 }
 #endif
 
 #ifndef atomic_dec_unless_positive
-static inline bool atomic_dec_unless_positive(atomic_t *p)
+static inline bool atomic_dec_unless_positive(atomic_t *v)
 {
-	int v, v1;
-	for (v = 0; v <= 0; v = v1) {
-		v1 = atomic_cmpxchg(p, v, v - 1);
-		if (likely(v1 == v))
-			return true;
-	}
-	return false;
+	int c = atomic_read(v);
+
+	do {
+		if (unlikely(c > 0))
+			return false;
+	} while (!atomic_try_cmpxchg(v, &c, c - 1));
+
+	return true;
 }
 #endif
 
@@ -718,17 +720,14 @@ static inline bool atomic_dec_unless_positive(atomic_t *p)
 #ifndef atomic_dec_if_positive
 static inline int atomic_dec_if_positive(atomic_t *v)
 {
-	int c, old, dec;
-	c = atomic_read(v);
-	for (;;) {
+	int dec, c = atomic_read(v);
+
+	do {
 		dec = c - 1;
 		if (unlikely(dec < 0))
 			break;
-		old = atomic_cmpxchg((v), c, dec);
-		if (likely(old == c))
-			break;
-		c = old;
-	}
+	} while (!atomic_try_cmpxchg(v, &c, dec));
+
 	return dec;
 }
 #endif
@@ -1290,6 +1289,56 @@ static inline long long atomic64_fetch_andnot_release(long long i, atomic64_t *v
 }
 #endif
 
+#ifndef atomic64_inc_unless_negative
+static inline bool atomic64_inc_unless_negative(atomic64_t *v)
+{
+	long long c = atomic64_read(v);
+
+	do {
+		if (unlikely(c < 0))
+			return false;
+	} while (!atomic64_try_cmpxchg(v, &c, c + 1));
+
+	return true;
+}
+#endif
+
+#ifndef atomic64_dec_unless_positive
+static inline bool atomic64_dec_unless_positive(atomic64_t *v)
+{
+	long long c = atomic64_read(v);
+
+	do {
+		if (unlikely(c > 0))
+			return false;
+	} while (!atomic64_try_cmpxchg(v, &c, c - 1));
+
+	return true;
+}
+#endif
+
+/*
+ * atomic64_dec_if_positive - decrement by 1 if old value positive
+ * @v: pointer of type atomic64_t
+ *
+ * The function returns the old value of *v minus 1, even if
+ * the atomic64 variable, v, was not decremented.
+ */
+#ifndef atomic64_dec_if_positive
+static inline long long atomic64_dec_if_positive(atomic64_t *v)
+{
+	long long dec, c = atomic64_read(v);
+
+	do {
+		dec = c - 1;
+		if (unlikely(dec < 0))
+			break;
+	} while (!atomic64_try_cmpxchg(v, &c, dec));
+
+	return dec;
+}
+#endif
+
 #define atomic64_cond_read_relaxed(v, c)	smp_cond_load_relaxed(&(v)->counter, (c))
 #define atomic64_cond_read_acquire(v, c)	smp_cond_load_acquire(&(v)->counter, (c))
 
-- 
cgit v1.2.3


From 7cc7eaad49c30ac165ecf84d95b26f7e0d53bd97 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 21 Jun 2018 13:13:21 +0100
Subject: atomics/treewide: Clean up '*_andnot()' ifdeffery

The ifdeffery for atomic*_{fetch_,}andnot() is unlike that for all the
other atomics. If atomic*_andnot() is not defined, the corresponding
atomic*_fetch_andnot() is assumed to not be defined.

Additionally, the fallbacks for the various ordering cases are written
much later in atomic.h as static inlines.

This isn't problematic today, but gets in the way of scripting the
generation of atomics. To prepare for scripting, this patch:

* Switches to separate ifdefs for atomic*_andnot() and
  atomic*_fetch_andnot(), updating implementations as appropriate.

* Moves the fallbacks into the standards ifdefs, as macro expansions
  rather than static inlines.

* Removes trivial andnot implementations from architectures, where these
  are superseded by core code.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/lkml/20180621121321.4761-19-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arc/include/asm/atomic.h |  8 ++--
 arch/arm/include/asm/atomic.h |  2 +
 include/linux/atomic.h        | 96 ++++++++++++++-----------------------------
 3 files changed, 36 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 8f64f3b79b8a..4e0072730241 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -187,7 +187,8 @@ static inline int atomic_fetch_##op(int i, atomic_t *v)			\
 ATOMIC_OPS(add, +=, add)
 ATOMIC_OPS(sub, -=, sub)
 
-#define atomic_andnot atomic_andnot
+#define atomic_andnot		atomic_andnot
+#define atomic_fetch_andnot	atomic_fetch_andnot
 
 #undef ATOMIC_OPS
 #define ATOMIC_OPS(op, c_op, asm_op)					\
@@ -296,8 +297,6 @@ ATOMIC_OPS(add, +=, CTOP_INST_AADD_DI_R2_R2_R3)
 	ATOMIC_FETCH_OP(op, c_op, asm_op)
 
 ATOMIC_OPS(and, &=, CTOP_INST_AAND_DI_R2_R2_R3)
-#define atomic_andnot(mask, v) atomic_and(~(mask), (v))
-#define atomic_fetch_andnot(mask, v) atomic_fetch_and(~(mask), (v))
 ATOMIC_OPS(or, |=, CTOP_INST_AOR_DI_R2_R2_R3)
 ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3)
 
@@ -430,7 +429,8 @@ static inline long long atomic64_fetch_##op(long long a, atomic64_t *v)	\
 	ATOMIC64_OP_RETURN(op, op1, op2)				\
 	ATOMIC64_FETCH_OP(op, op1, op2)
 
-#define atomic64_andnot atomic64_andnot
+#define atomic64_andnot		atomic64_andnot
+#define atomic64_fetch_andnot	atomic64_fetch_andnot
 
 ATOMIC64_OPS(add, add.f, adc)
 ATOMIC64_OPS(sub, sub.f, sbc)
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 884c241424fe..f74756641410 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -216,6 +216,8 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return ret;
 }
 
+#define atomic_fetch_andnot		atomic_fetch_andnot
+
 #endif /* __LINUX_ARM_ARCH__ */
 
 #define ATOMIC_OPS(op, c_op, asm_op)					\
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 93fe5b4041e1..8e04f1f69bd9 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -354,12 +354,22 @@
 #endif
 #endif /* atomic_fetch_and_relaxed */
 
-#ifdef atomic_andnot
-/* atomic_fetch_andnot_relaxed */
+#ifndef atomic_andnot
+#define atomic_andnot(i, v)		atomic_and(~(int)(i), (v))
+#endif
+
 #ifndef atomic_fetch_andnot_relaxed
-#define atomic_fetch_andnot_relaxed	atomic_fetch_andnot
-#define atomic_fetch_andnot_acquire	atomic_fetch_andnot
-#define atomic_fetch_andnot_release	atomic_fetch_andnot
+
+#ifndef atomic_fetch_andnot
+#define atomic_fetch_andnot(i, v)		atomic_fetch_and(~(int)(i), (v))
+#define atomic_fetch_andnot_relaxed(i, v)	atomic_fetch_and_relaxed(~(int)(i), (v))
+#define atomic_fetch_andnot_acquire(i, v)	atomic_fetch_and_acquire(~(int)(i), (v))
+#define atomic_fetch_andnot_release(i, v)	atomic_fetch_and_release(~(int)(i), (v))
+#else /* atomic_fetch_andnot */
+#define atomic_fetch_andnot_relaxed		atomic_fetch_andnot
+#define atomic_fetch_andnot_acquire		atomic_fetch_andnot
+#define atomic_fetch_andnot_release		atomic_fetch_andnot
+#endif /* atomic_fetch_andnot */
 
 #else /* atomic_fetch_andnot_relaxed */
 
@@ -378,7 +388,6 @@
 	__atomic_op_fence(atomic_fetch_andnot, __VA_ARGS__)
 #endif
 #endif /* atomic_fetch_andnot_relaxed */
-#endif /* atomic_andnot */
 
 /* atomic_fetch_xor_relaxed */
 #ifndef atomic_fetch_xor_relaxed
@@ -655,33 +664,6 @@ static inline bool atomic_add_negative(int i, atomic_t *v)
 }
 #endif
 
-#ifndef atomic_andnot
-static inline void atomic_andnot(int i, atomic_t *v)
-{
-	atomic_and(~i, v);
-}
-
-static inline int atomic_fetch_andnot(int i, atomic_t *v)
-{
-	return atomic_fetch_and(~i, v);
-}
-
-static inline int atomic_fetch_andnot_relaxed(int i, atomic_t *v)
-{
-	return atomic_fetch_and_relaxed(~i, v);
-}
-
-static inline int atomic_fetch_andnot_acquire(int i, atomic_t *v)
-{
-	return atomic_fetch_and_acquire(~i, v);
-}
-
-static inline int atomic_fetch_andnot_release(int i, atomic_t *v)
-{
-	return atomic_fetch_and_release(~i, v);
-}
-#endif
-
 #ifndef atomic_inc_unless_negative
 static inline bool atomic_inc_unless_negative(atomic_t *v)
 {
@@ -1029,12 +1011,22 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 #endif
 #endif /* atomic64_fetch_and_relaxed */
 
-#ifdef atomic64_andnot
-/* atomic64_fetch_andnot_relaxed */
+#ifndef atomic64_andnot
+#define atomic64_andnot(i, v)		atomic64_and(~(long long)(i), (v))
+#endif
+
 #ifndef atomic64_fetch_andnot_relaxed
-#define atomic64_fetch_andnot_relaxed	atomic64_fetch_andnot
-#define atomic64_fetch_andnot_acquire	atomic64_fetch_andnot
-#define atomic64_fetch_andnot_release	atomic64_fetch_andnot
+
+#ifndef atomic64_fetch_andnot
+#define atomic64_fetch_andnot(i, v)		atomic64_fetch_and(~(long long)(i), (v))
+#define atomic64_fetch_andnot_relaxed(i, v)	atomic64_fetch_and_relaxed(~(long long)(i), (v))
+#define atomic64_fetch_andnot_acquire(i, v)	atomic64_fetch_and_acquire(~(long long)(i), (v))
+#define atomic64_fetch_andnot_release(i, v)	atomic64_fetch_and_release(~(long long)(i), (v))
+#else /* atomic64_fetch_andnot */
+#define atomic64_fetch_andnot_relaxed		atomic64_fetch_andnot
+#define atomic64_fetch_andnot_acquire		atomic64_fetch_andnot
+#define atomic64_fetch_andnot_release		atomic64_fetch_andnot
+#endif /* atomic64_fetch_andnot */
 
 #else /* atomic64_fetch_andnot_relaxed */
 
@@ -1053,7 +1045,6 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 	__atomic_op_fence(atomic64_fetch_andnot, __VA_ARGS__)
 #endif
 #endif /* atomic64_fetch_andnot_relaxed */
-#endif /* atomic64_andnot */
 
 /* atomic64_fetch_xor_relaxed */
 #ifndef atomic64_fetch_xor_relaxed
@@ -1262,33 +1253,6 @@ static inline bool atomic64_add_negative(long long i, atomic64_t *v)
 }
 #endif
 
-#ifndef atomic64_andnot
-static inline void atomic64_andnot(long long i, atomic64_t *v)
-{
-	atomic64_and(~i, v);
-}
-
-static inline long long atomic64_fetch_andnot(long long i, atomic64_t *v)
-{
-	return atomic64_fetch_and(~i, v);
-}
-
-static inline long long atomic64_fetch_andnot_relaxed(long long i, atomic64_t *v)
-{
-	return atomic64_fetch_and_relaxed(~i, v);
-}
-
-static inline long long atomic64_fetch_andnot_acquire(long long i, atomic64_t *v)
-{
-	return atomic64_fetch_and_acquire(~i, v);
-}
-
-static inline long long atomic64_fetch_andnot_release(long long i, atomic64_t *v)
-{
-	return atomic64_fetch_and_release(~i, v);
-}
-#endif
-
 #ifndef atomic64_inc_unless_negative
 static inline bool atomic64_inc_unless_negative(atomic64_t *v)
 {
-- 
cgit v1.2.3


From 75a040ff14d9a99fc041f5e1d8f09541cab13ba4 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 1 Apr 2018 01:00:36 +0300
Subject: locking/refcounts: Include fewer headers in <linux/refcount.h>

Debloat <linux/refcount.h>'s dependencies:

- <linux/kernel.h> is not needed, but <linux/compiler.h> is.
- <linux/mutex.h> is not needed, only a forward declaration of "struct mutex".
- <linux/spinlock.h> is not needed, <linux/spinlock_types.h> is enough.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Link: https://lkml.kernel.org/lkml/20180331220036.GA7676@avx2
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/refcount.h | 1 +
 include/linux/refcount.h        | 7 ++++---
 lib/refcount.c                  | 2 ++
 3 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
index 4cf11d88d3b3..19b90521954c 100644
--- a/arch/x86/include/asm/refcount.h
+++ b/arch/x86/include/asm/refcount.h
@@ -5,6 +5,7 @@
  * PaX/grsecurity.
  */
 #include <linux/refcount.h>
+#include <asm/bug.h>
 
 /*
  * This is the first portion of the refcount error handling, which lives in
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 4193c41e383a..c36addd27dd5 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -3,9 +3,10 @@
 #define _LINUX_REFCOUNT_H
 
 #include <linux/atomic.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <linux/spinlock_types.h>
+
+struct mutex;
 
 /**
  * struct refcount_t - variant of atomic_t specialized for reference counts
diff --git a/lib/refcount.c b/lib/refcount.c
index 0eb48353abe3..4bd842f20749 100644
--- a/lib/refcount.c
+++ b/lib/refcount.c
@@ -35,7 +35,9 @@
  *
  */
 
+#include <linux/mutex.h>
 #include <linux/refcount.h>
+#include <linux/spinlock.h>
 #include <linux/bug.h>
 
 #ifdef CONFIG_REFCOUNT_FULL
-- 
cgit v1.2.3


From 0eb71a9da5796851fa87ddc1a534066c0fe54055 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 18 Jun 2018 12:52:50 +1000
Subject: rhashtable: split rhashtable.h

Due to the use of rhashtables in net namespaces,
rhashtable.h is included in lots of the kernel,
so a small changes can required a large recompilation.
This makes development painful.

This patch splits out rhashtable-types.h which just includes
the major type declarations, and does not include (non-trivial)
inline code.  rhashtable.h is no longer included by anything
in the include/ directory.
Common include files only include rhashtable-types.h so a large
recompilation is only triggered when that changes.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                                |   2 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h |   1 +
 include/linux/ipc.h                        |   2 +-
 include/linux/ipc_namespace.h              |   2 +-
 include/linux/mroute_base.h                |   2 +-
 include/linux/rhashtable-types.h           | 139 +++++++++++++++++++++++++++++
 include/linux/rhashtable.h                 | 127 +-------------------------
 include/net/inet_frag.h                    |   2 +-
 include/net/netfilter/nf_flow_table.h      |   2 +-
 include/net/sctp/structs.h                 |   2 +-
 include/net/seg6.h                         |   2 +-
 include/net/seg6_hmac.h                    |   2 +-
 ipc/msg.c                                  |   1 +
 ipc/sem.c                                  |   1 +
 ipc/shm.c                                  |   1 +
 ipc/util.c                                 |   1 +
 lib/rhashtable.c                           |   1 +
 net/ipv4/inet_fragment.c                   |   1 +
 net/ipv4/ipmr.c                            |   1 +
 net/ipv4/ipmr_base.c                       |   1 +
 net/ipv6/ip6mr.c                           |   1 +
 net/ipv6/seg6.c                            |   1 +
 net/ipv6/seg6_hmac.c                       |   1 +
 net/netfilter/nf_tables_api.c              |   1 +
 net/sctp/input.c                           |   1 +
 net/sctp/socket.c                          |   1 +
 26 files changed, 166 insertions(+), 133 deletions(-)
 create mode 100644 include/linux/rhashtable-types.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index edf3cf5ea691..99e5cef8172e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12162,7 +12162,9 @@ M:	Herbert Xu <herbert@gondor.apana.org.au>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	lib/rhashtable.c
+F:	lib/test_rhashtable.c
 F:	include/linux/rhashtable.h
+F:	include/linux/rhashtable-types.h
 
 RICOH R5C592 MEMORYSTICK DRIVER
 M:	Maxim Levitsky <maximlevitsky@gmail.com>
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 0dbe2d9e22d6..1adb968b8354 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -46,6 +46,7 @@
 #include <linux/spinlock.h>
 #include <linux/timer.h>
 #include <linux/vmalloc.h>
+#include <linux/rhashtable.h>
 #include <linux/etherdevice.h>
 #include <linux/net_tstamp.h>
 #include <linux/ptp_clock_kernel.h>
diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index 6cc2df7f7ac9..e1c9eea6015b 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -4,7 +4,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/uidgid.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 #include <uapi/linux/ipc.h>
 #include <linux/refcount.h>
 
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index b5630c8eb2f3..6cea726612b7 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -9,7 +9,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ns_common.h>
 #include <linux/refcount.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 
 struct user_namespace;
 
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index d633f737b3c6..fd436cdd4725 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -2,7 +2,7 @@
 #define __LINUX_MROUTE_BASE_H
 
 #include <linux/netdevice.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 #include <linux/spinlock.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
new file mode 100644
index 000000000000..9740063ff13b
--- /dev/null
+++ b/include/linux/rhashtable-types.h
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Simple structures that might be needed in include
+ * files.
+ */
+
+#ifndef _LINUX_RHASHTABLE_TYPES_H
+#define _LINUX_RHASHTABLE_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+struct rhash_head {
+	struct rhash_head __rcu		*next;
+};
+
+struct rhlist_head {
+	struct rhash_head		rhead;
+	struct rhlist_head __rcu	*next;
+};
+
+struct bucket_table;
+
+/**
+ * struct rhashtable_compare_arg - Key for the function rhashtable_compare
+ * @ht: Hash table
+ * @key: Key to compare against
+ */
+struct rhashtable_compare_arg {
+	struct rhashtable *ht;
+	const void *key;
+};
+
+typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
+			       const void *obj);
+
+/**
+ * struct rhashtable_params - Hash table construction parameters
+ * @nelem_hint: Hint on number of elements, should be 75% of desired size
+ * @key_len: Length of key
+ * @key_offset: Offset of key in struct to be hashed
+ * @head_offset: Offset of rhash_head in struct to be hashed
+ * @max_size: Maximum size while expanding
+ * @min_size: Minimum size while shrinking
+ * @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
+ * @automatic_shrinking: Enable automatic shrinking of tables
+ * @nulls_base: Base value to generate nulls marker
+ * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
+ * @obj_hashfn: Function to hash object
+ * @obj_cmpfn: Function to compare key with object
+ */
+struct rhashtable_params {
+	u16			nelem_hint;
+	u16			key_len;
+	u16			key_offset;
+	u16			head_offset;
+	unsigned int		max_size;
+	u16			min_size;
+	bool			automatic_shrinking;
+	u8			locks_mul;
+	u32			nulls_base;
+	rht_hashfn_t		hashfn;
+	rht_obj_hashfn_t	obj_hashfn;
+	rht_obj_cmpfn_t		obj_cmpfn;
+};
+
+/**
+ * struct rhashtable - Hash table handle
+ * @tbl: Bucket table
+ * @key_len: Key length for hashfn
+ * @max_elems: Maximum number of elements in table
+ * @p: Configuration parameters
+ * @rhlist: True if this is an rhltable
+ * @run_work: Deferred worker to expand/shrink asynchronously
+ * @mutex: Mutex to protect current/future table swapping
+ * @lock: Spin lock to protect walker list
+ * @nelems: Number of elements in table
+ */
+struct rhashtable {
+	struct bucket_table __rcu	*tbl;
+	unsigned int			key_len;
+	unsigned int			max_elems;
+	struct rhashtable_params	p;
+	bool				rhlist;
+	struct work_struct		run_work;
+	struct mutex                    mutex;
+	spinlock_t			lock;
+	atomic_t			nelems;
+};
+
+/**
+ * struct rhltable - Hash table with duplicate objects in a list
+ * @ht: Underlying rhtable
+ */
+struct rhltable {
+	struct rhashtable ht;
+};
+
+/**
+ * struct rhashtable_walker - Hash table walker
+ * @list: List entry on list of walkers
+ * @tbl: The table that we were walking over
+ */
+struct rhashtable_walker {
+	struct list_head list;
+	struct bucket_table *tbl;
+};
+
+/**
+ * struct rhashtable_iter - Hash table iterator
+ * @ht: Table to iterate through
+ * @p: Current pointer
+ * @list: Current hash list pointer
+ * @walker: Associated rhashtable walker
+ * @slot: Current slot
+ * @skip: Number of entries to skip in slot
+ */
+struct rhashtable_iter {
+	struct rhashtable *ht;
+	struct rhash_head *p;
+	struct rhlist_head *list;
+	struct rhashtable_walker walker;
+	unsigned int slot;
+	unsigned int skip;
+	bool end_of_table;
+};
+
+int rhashtable_init(struct rhashtable *ht,
+		    const struct rhashtable_params *params);
+int rhltable_init(struct rhltable *hlt,
+		  const struct rhashtable_params *params);
+
+#endif /* _LINUX_RHASHTABLE_TYPES_H */
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 4e1f535c2034..48754ab07cdf 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
@@ -17,16 +18,14 @@
 #ifndef _LINUX_RHASHTABLE_H
 #define _LINUX_RHASHTABLE_H
 
-#include <linux/atomic.h>
-#include <linux/compiler.h>
 #include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/jhash.h>
 #include <linux/list_nulls.h>
 #include <linux/workqueue.h>
-#include <linux/mutex.h>
 #include <linux/rculist.h>
 
+#include <linux/rhashtable-types.h>
 /*
  * The end of the chain is marked with a special nulls marks which has
  * the following format:
@@ -64,15 +63,6 @@
  */
 #define RHT_ELASTICITY	16u
 
-struct rhash_head {
-	struct rhash_head __rcu		*next;
-};
-
-struct rhlist_head {
-	struct rhash_head		rhead;
-	struct rhlist_head __rcu	*next;
-};
-
 /**
  * struct bucket_table - Table of hash buckets
  * @size: Number of hash buckets
@@ -102,114 +92,6 @@ struct bucket_table {
 	struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
-/**
- * struct rhashtable_compare_arg - Key for the function rhashtable_compare
- * @ht: Hash table
- * @key: Key to compare against
- */
-struct rhashtable_compare_arg {
-	struct rhashtable *ht;
-	const void *key;
-};
-
-typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
-			       const void *obj);
-
-struct rhashtable;
-
-/**
- * struct rhashtable_params - Hash table construction parameters
- * @nelem_hint: Hint on number of elements, should be 75% of desired size
- * @key_len: Length of key
- * @key_offset: Offset of key in struct to be hashed
- * @head_offset: Offset of rhash_head in struct to be hashed
- * @max_size: Maximum size while expanding
- * @min_size: Minimum size while shrinking
- * @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
- * @automatic_shrinking: Enable automatic shrinking of tables
- * @nulls_base: Base value to generate nulls marker
- * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
- * @obj_hashfn: Function to hash object
- * @obj_cmpfn: Function to compare key with object
- */
-struct rhashtable_params {
-	u16			nelem_hint;
-	u16			key_len;
-	u16			key_offset;
-	u16			head_offset;
-	unsigned int		max_size;
-	u16			min_size;
-	bool			automatic_shrinking;
-	u8			locks_mul;
-	u32			nulls_base;
-	rht_hashfn_t		hashfn;
-	rht_obj_hashfn_t	obj_hashfn;
-	rht_obj_cmpfn_t		obj_cmpfn;
-};
-
-/**
- * struct rhashtable - Hash table handle
- * @tbl: Bucket table
- * @key_len: Key length for hashfn
- * @max_elems: Maximum number of elements in table
- * @p: Configuration parameters
- * @rhlist: True if this is an rhltable
- * @run_work: Deferred worker to expand/shrink asynchronously
- * @mutex: Mutex to protect current/future table swapping
- * @lock: Spin lock to protect walker list
- * @nelems: Number of elements in table
- */
-struct rhashtable {
-	struct bucket_table __rcu	*tbl;
-	unsigned int			key_len;
-	unsigned int			max_elems;
-	struct rhashtable_params	p;
-	bool				rhlist;
-	struct work_struct		run_work;
-	struct mutex                    mutex;
-	spinlock_t			lock;
-	atomic_t			nelems;
-};
-
-/**
- * struct rhltable - Hash table with duplicate objects in a list
- * @ht: Underlying rhtable
- */
-struct rhltable {
-	struct rhashtable ht;
-};
-
-/**
- * struct rhashtable_walker - Hash table walker
- * @list: List entry on list of walkers
- * @tbl: The table that we were walking over
- */
-struct rhashtable_walker {
-	struct list_head list;
-	struct bucket_table *tbl;
-};
-
-/**
- * struct rhashtable_iter - Hash table iterator
- * @ht: Table to iterate through
- * @p: Current pointer
- * @list: Current hash list pointer
- * @walker: Associated rhashtable walker
- * @slot: Current slot
- * @skip: Number of entries to skip in slot
- */
-struct rhashtable_iter {
-	struct rhashtable *ht;
-	struct rhash_head *p;
-	struct rhlist_head *list;
-	struct rhashtable_walker walker;
-	unsigned int slot;
-	unsigned int skip;
-	bool end_of_table;
-};
-
 static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
 {
 	return NULLS_MARKER(ht->p.nulls_base + hash);
@@ -376,11 +258,6 @@ static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
 }
 #endif /* CONFIG_PROVE_LOCKING */
 
-int rhashtable_init(struct rhashtable *ht,
-		    const struct rhashtable_params *params);
-int rhltable_init(struct rhltable *hlt,
-		  const struct rhashtable_params *params);
-
 void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
 			     struct rhash_head *obj);
 
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index ed07e3786d98..f4272a29dc44 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -2,7 +2,7 @@
 #ifndef __NET_FRAG_H__
 #define __NET_FRAG_H__
 
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 
 struct netns_frags {
 	/* sysctls */
diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index ba9fa4592f2b..0e355f4a3d76 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -4,7 +4,7 @@
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <linux/netdevice.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 #include <linux/rcupdate.h>
 #include <linux/netfilter/nf_conntrack_tuple_common.h>
 #include <net/dst.h>
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index dbe1b911a24d..e0f962d27386 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -48,7 +48,7 @@
 #define __sctp_structs_h__
 
 #include <linux/ktime.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 #include <linux/socket.h>	/* linux/in.h needs this!!    */
 #include <linux/in.h>		/* We get struct sockaddr_in. */
 #include <linux/in6.h>		/* We get struct in6_addr     */
diff --git a/include/net/seg6.h b/include/net/seg6.h
index e029e301faa5..2567941a2f32 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -18,7 +18,7 @@
 #include <linux/ipv6.h>
 #include <net/lwtunnel.h>
 #include <linux/seg6.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 
 static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
 				     __be32 to)
diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h
index 69c3a106056b..7fda469e2758 100644
--- a/include/net/seg6_hmac.h
+++ b/include/net/seg6_hmac.h
@@ -22,7 +22,7 @@
 #include <linux/route.h>
 #include <net/seg6.h>
 #include <linux/seg6_hmac.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 
 #define SEG6_HMAC_MAX_DIGESTSIZE	160
 #define SEG6_HMAC_RING_SIZE		256
diff --git a/ipc/msg.c b/ipc/msg.c
index 3b6545302598..203281198079 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -38,6 +38,7 @@
 #include <linux/rwsem.h>
 #include <linux/nsproxy.h>
 #include <linux/ipc_namespace.h>
+#include <linux/rhashtable.h>
 
 #include <asm/current.h>
 #include <linux/uaccess.h>
diff --git a/ipc/sem.c b/ipc/sem.c
index 5af1943ad782..29c0347ef11d 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -86,6 +86,7 @@
 #include <linux/ipc_namespace.h>
 #include <linux/sched/wake_q.h>
 #include <linux/nospec.h>
+#include <linux/rhashtable.h>
 
 #include <linux/uaccess.h>
 #include "util.h"
diff --git a/ipc/shm.c b/ipc/shm.c
index 051a3e1fb8df..d4daf78df6da 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -43,6 +43,7 @@
 #include <linux/nsproxy.h>
 #include <linux/mount.h>
 #include <linux/ipc_namespace.h>
+#include <linux/rhashtable.h>
 
 #include <linux/uaccess.h>
 
diff --git a/ipc/util.c b/ipc/util.c
index 4e81182fa0ac..fdffff41f65b 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -63,6 +63,7 @@
 #include <linux/rwsem.h>
 #include <linux/memory.h>
 #include <linux/ipc_namespace.h>
+#include <linux/rhashtable.h>
 
 #include <asm/unistd.h>
 
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 9427b5766134..c9fafea7dc6e 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -28,6 +28,7 @@
 #include <linux/rhashtable.h>
 #include <linux/err.h>
 #include <linux/export.h>
+#include <linux/rhashtable.h>
 
 #define HASH_DEFAULT_SIZE	64UL
 #define HASH_MIN_SIZE		4U
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c9e35b81d093..316518f87294 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -20,6 +20,7 @@
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/slab.h>
+#include <linux/rhashtable.h>
 
 #include <net/sock.h>
 #include <net/inet_frag.h>
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9f79b9803a16..82f914122f1b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -60,6 +60,7 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/compat.h>
 #include <linux/export.h>
+#include <linux/rhashtable.h>
 #include <net/ip_tunnels.h>
 #include <net/checksum.h>
 #include <net/netlink.h>
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index cafb0506c8c9..1ad9aa62a97b 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -2,6 +2,7 @@
  * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation
  */
 
+#include <linux/rhashtable.h>
 #include <linux/mroute_base.h>
 
 /* Sets everything common except 'dev', since that is done under locking */
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 0d0f0053bb11..d0b7e0249c13 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -32,6 +32,7 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/compat.h>
+#include <linux/rhashtable.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
 #include <net/raw.h>
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 0fdf2a55e746..8d0ba757a46c 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -17,6 +17,7 @@
 #include <linux/net.h>
 #include <linux/in6.h>
 #include <linux/slab.h>
+#include <linux/rhashtable.h>
 
 #include <net/ipv6.h>
 #include <net/protocol.h>
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 33fb35cbfac1..b1791129a875 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -22,6 +22,7 @@
 #include <linux/icmpv6.h>
 #include <linux/mroute6.h>
 #include <linux/slab.h>
+#include <linux/rhashtable.h>
 
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 896d4a36081d..3f211e1025c1 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -14,6 +14,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/vmalloc.h>
+#include <linux/rhashtable.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
diff --git a/net/sctp/input.c b/net/sctp/input.c
index ba8a6e6c36fa..9bbc5f92c941 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -56,6 +56,7 @@
 #include <net/sctp/sm.h>
 #include <net/sctp/checksum.h>
 #include <net/net_namespace.h>
+#include <linux/rhashtable.h>
 
 /* Forward declarations for internal helpers. */
 static int sctp_rcv_ootb(struct sk_buff *);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index d20f7addee19..0e91e83eea5a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -66,6 +66,7 @@
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/compat.h>
+#include <linux/rhashtable.h>
 
 #include <net/ip.h>
 #include <net/icmp.h>
-- 
cgit v1.2.3


From 9f9a707738aa7a8b9f78a641b83927ada256a626 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 18 Jun 2018 12:52:50 +1000
Subject: rhashtable: remove nulls_base and related code.

This "feature" is unused, undocumented, and untested and so doesn't
really belong.  A patch is under development to properly implement
support for detecting when a search gets diverted down a different
chain, which the common purpose of nulls markers.

This patch actually fixes a bug too.  The table resizing allows a
table to grow to 2^31 buckets, but the hash is truncated to 27 bits -
any growth beyond 2^27 is wasteful an ineffective.

This patch results in NULLS_MARKER(0) being used for all chains,
and leaves the use of rht_is_a_null() to test for it.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable-types.h |  2 --
 include/linux/rhashtable.h       | 33 +++------------------------------
 lib/rhashtable.c                 |  8 --------
 lib/test_rhashtable.c            |  5 +----
 net/core/xdp.c                   |  4 ++--
 5 files changed, 6 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 9740063ff13b..763d613ce2c2 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -50,7 +50,6 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
  * @min_size: Minimum size while shrinking
  * @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
  * @automatic_shrinking: Enable automatic shrinking of tables
- * @nulls_base: Base value to generate nulls marker
  * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
  * @obj_hashfn: Function to hash object
  * @obj_cmpfn: Function to compare key with object
@@ -64,7 +63,6 @@ struct rhashtable_params {
 	u16			min_size;
 	bool			automatic_shrinking;
 	u8			locks_mul;
-	u32			nulls_base;
 	rht_hashfn_t		hashfn;
 	rht_obj_hashfn_t	obj_hashfn;
 	rht_obj_cmpfn_t		obj_cmpfn;
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 48754ab07cdf..d9f719af7936 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -28,25 +28,8 @@
 #include <linux/rhashtable-types.h>
 /*
  * The end of the chain is marked with a special nulls marks which has
- * the following format:
- *
- * +-------+-----------------------------------------------------+-+
- * | Base  |                      Hash                           |1|
- * +-------+-----------------------------------------------------+-+
- *
- * Base (4 bits) : Reserved to distinguish between multiple tables.
- *                 Specified via &struct rhashtable_params.nulls_base.
- * Hash (27 bits): Full hash (unmasked) of first element added to bucket
- * 1 (1 bit)     : Nulls marker (always set)
- *
- * The remaining bits of the next pointer remain unused for now.
+ * the least significant bit set.
  */
-#define RHT_BASE_BITS		4
-#define RHT_HASH_BITS		27
-#define RHT_BASE_SHIFT		RHT_HASH_BITS
-
-/* Base bits plus 1 bit for nulls marker */
-#define RHT_HASH_RESERVED_SPACE	(RHT_BASE_BITS + 1)
 
 /* Maximum chain length before rehash
  *
@@ -92,24 +75,14 @@ struct bucket_table {
 	struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
-static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
-{
-	return NULLS_MARKER(ht->p.nulls_base + hash);
-}
-
 #define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
-	((ptr) = (typeof(ptr)) rht_marker(ht, hash))
+	((ptr) = (typeof(ptr)) NULLS_MARKER(0))
 
 static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
 {
 	return ((unsigned long) ptr & 1);
 }
 
-static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr)
-{
-	return ((unsigned long) ptr) >> 1;
-}
-
 static inline void *rht_obj(const struct rhashtable *ht,
 			    const struct rhash_head *he)
 {
@@ -119,7 +92,7 @@ static inline void *rht_obj(const struct rhashtable *ht,
 static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
 					    unsigned int hash)
 {
-	return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1);
+	return hash & (tbl->size - 1);
 }
 
 static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index c9fafea7dc6e..688693c919be 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -995,7 +995,6 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
  *	.key_offset = offsetof(struct test_obj, key),
  *	.key_len = sizeof(int),
  *	.hashfn = jhash,
- *	.nulls_base = (1U << RHT_BASE_SHIFT),
  * };
  *
  * Configuration Example 2: Variable length keys
@@ -1029,9 +1028,6 @@ int rhashtable_init(struct rhashtable *ht,
 	    (params->obj_hashfn && !params->obj_cmpfn))
 		return -EINVAL;
 
-	if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
-		return -EINVAL;
-
 	memset(ht, 0, sizeof(*ht));
 	mutex_init(&ht->mutex);
 	spin_lock_init(&ht->lock);
@@ -1096,10 +1092,6 @@ int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
 {
 	int err;
 
-	/* No rhlist NULLs marking for now. */
-	if (params->nulls_base)
-		return -EINVAL;
-
 	err = rhashtable_init(&hlt->ht, params);
 	hlt->ht.rhlist = true;
 	return err;
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 6ca59ffcacbe..82ac39ce5310 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -83,7 +83,7 @@ static u32 my_hashfn(const void *data, u32 len, u32 seed)
 {
 	const struct test_obj_rhl *obj = data;
 
-	return (obj->value.id % 10) << RHT_HASH_RESERVED_SPACE;
+	return (obj->value.id % 10);
 }
 
 static int my_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
@@ -99,7 +99,6 @@ static struct rhashtable_params test_rht_params = {
 	.key_offset = offsetof(struct test_obj, value),
 	.key_len = sizeof(struct test_obj_val),
 	.hashfn = jhash,
-	.nulls_base = (3U << RHT_BASE_SHIFT),
 };
 
 static struct rhashtable_params test_rht_params_dup = {
@@ -296,8 +295,6 @@ static int __init test_rhltable(unsigned int entries)
 	if (!obj_in_table)
 		goto out_free;
 
-	/* nulls_base not supported in rhlist interface */
-	test_rht_params.nulls_base = 0;
 	err = rhltable_init(&rhlt, &test_rht_params);
 	if (WARN_ON(err))
 		goto out_free;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 9d1f22072d5d..31c58719b5a9 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -45,8 +45,8 @@ static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
 	BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
 		     != sizeof(u32));
 
-	/* Use cyclic increasing ID as direct hash key, see rht_bucket_index */
-	return key << RHT_HASH_RESERVED_SPACE;
+	/* Use cyclic increasing ID as direct hash key */
+	return key;
 }
 
 static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
-- 
cgit v1.2.3


From 9b4f64a227b6f462482a8cc68c7134dc6e26f1c1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 18 Jun 2018 12:52:50 +1000
Subject: rhashtable: simplify INIT_RHT_NULLS_HEAD()

The 'ht' and 'hash' arguments to INIT_RHT_NULLS_HEAD() are
no longer used - so drop them.  This allows us to also
remove the nhash argument from nested_table_alloc().

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h |  2 +-
 lib/rhashtable.c           | 15 ++++++---------
 2 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index d9f719af7936..3f3a182bd0b4 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -75,7 +75,7 @@ struct bucket_table {
 	struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
-#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
+#define INIT_RHT_NULLS_HEAD(ptr)	\
 	((ptr) = (typeof(ptr)) NULLS_MARKER(0))
 
 static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 688693c919be..a81cd27d518c 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -116,8 +116,7 @@ static void bucket_table_free_rcu(struct rcu_head *head)
 
 static union nested_table *nested_table_alloc(struct rhashtable *ht,
 					      union nested_table __rcu **prev,
-					      unsigned int shifted,
-					      unsigned int nhash)
+					      unsigned int shifted)
 {
 	union nested_table *ntbl;
 	int i;
@@ -130,8 +129,7 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht,
 
 	if (ntbl && shifted) {
 		for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++)
-			INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht,
-					    (i << shifted) | nhash);
+			INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
 	}
 
 	rcu_assign_pointer(*prev, ntbl);
@@ -157,7 +155,7 @@ static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
 		return NULL;
 
 	if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
-				0, 0)) {
+				0)) {
 		kfree(tbl);
 		return NULL;
 	}
@@ -207,7 +205,7 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 	tbl->hash_rnd = get_random_u32();
 
 	for (i = 0; i < nbuckets; i++)
-		INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);
+		INIT_RHT_NULLS_HEAD(tbl->buckets[i]);
 
 	return tbl;
 }
@@ -1217,7 +1215,7 @@ struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
 	nhash = index;
 	shifted = tbl->nest;
 	ntbl = nested_table_alloc(ht, &ntbl[index].table,
-				  size <= (1 << shift) ? shifted : 0, nhash);
+				  size <= (1 << shift) ? shifted : 0);
 
 	while (ntbl && size > (1 << shift)) {
 		index = hash & ((1 << shift) - 1);
@@ -1226,8 +1224,7 @@ struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
 		nhash |= index << shifted;
 		shifted += shift;
 		ntbl = nested_table_alloc(ht, &ntbl[index].table,
-					  size <= (1 << shift) ? shifted : 0,
-					  nhash);
+					  size <= (1 << shift) ? shifted : 0);
 	}
 
 	if (!ntbl)
-- 
cgit v1.2.3


From c0690016a73fe6bd456887bbbe6e10c7f0096554 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 18 Jun 2018 12:52:50 +1000
Subject: rhashtable: clean up dereference of ->future_tbl.

Using rht_dereference_bucket() to dereference
->future_tbl looks like a type error, and could be confusing.
Using rht_dereference_rcu() to test a pointer for NULL
adds an unnecessary barrier - rcu_access_pointer() is preferred
for NULL tests when no lock is held.

This uses 3 different ways to access ->future_tbl.
- if we know the mutex is held, use rht_dereference()
- if we don't hold the mutex, and are only testing for NULL,
  use rcu_access_pointer()
- otherwise (using RCU protection for true dereference),
  use rht_dereference_rcu().

Note that this includes a simplification of the call to
rhashtable_last_table() - we don't do an extra dereference
before the call any more.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 2 +-
 lib/rhashtable.c           | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 3f3a182bd0b4..eb7111039247 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -595,7 +595,7 @@ static inline void *__rhashtable_insert_fast(
 	lock = rht_bucket_lock(tbl, hash);
 	spin_lock_bh(lock);
 
-	if (unlikely(rht_dereference_bucket(tbl->future_tbl, tbl, hash))) {
+	if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
 slow_path:
 		spin_unlock_bh(lock);
 		rcu_read_unlock();
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 52ec83212856..0e04947b7e0c 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -226,8 +226,7 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
 static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 {
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
-	struct bucket_table *new_tbl = rhashtable_last_table(ht,
-		rht_dereference_rcu(old_tbl->future_tbl, ht));
+	struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
 	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
 	int err = -EAGAIN;
 	struct rhash_head *head, *next, *entry;
@@ -467,7 +466,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
 
 fail:
 	/* Do not fail the insert if someone else did a rehash. */
-	if (likely(rcu_dereference_raw(tbl->future_tbl)))
+	if (likely(rcu_access_pointer(tbl->future_tbl)))
 		return 0;
 
 	/* Schedule async rehash to retry allocation in process context. */
@@ -540,7 +539,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 	if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
 		return ERR_CAST(data);
 
-	new_tbl = rcu_dereference(tbl->future_tbl);
+	new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
 	if (new_tbl)
 		return new_tbl;
 
@@ -599,7 +598,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 			break;
 
 		spin_unlock_bh(lock);
-		tbl = rcu_dereference(tbl->future_tbl);
+		tbl = rht_dereference_rcu(tbl->future_tbl, ht);
 	}
 
 	data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
-- 
cgit v1.2.3


From 3f6c65d6255a872846c44182c82c78d3dc6239f5 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Tue, 19 Jun 2018 21:42:50 -0700
Subject: tcp: ignore rcv_rtt sample with old ts ecr value

When receiving multiple packets with the same ts ecr value, only try
to compute rcv_rtt sample with the earliest received packet.
This is because the rcv_rtt calculated by later received packets
could possibly include long idle time or other types of delay.
For example:
(1) server sends last packet of reply with TS val V1
(2) client ACKs last packet of reply with TS ecr V1
(3) long idle time passes
(4) client sends next request data packet with TS ecr V1 (again!)
At this time, the rcv_rtt computed on server with TS ecr V1 will be
inflated with the idle time and should get ignored.

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h  |  1 +
 net/ipv4/tcp.c       |  1 +
 net/ipv4/tcp_input.c | 14 +++++++++++---
 3 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 72705eaf4b84..3dbea6610304 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -350,6 +350,7 @@ struct tcp_sock {
 #endif
 
 /* Receiver side RTT estimation */
+	u32 rcv_rtt_last_tsecr;
 	struct {
 		u32	rtt_us;
 		u32	seq;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 141acd92e58a..47c45d5be9f9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2563,6 +2563,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	sk->sk_shutdown = 0;
 	sock_reset_flag(sk, SOCK_DONE);
 	tp->srtt_us = 0;
+	tp->rcv_rtt_last_tsecr = 0;
 	tp->write_seq += tp->max_window + 2;
 	if (tp->write_seq == 0)
 		tp->write_seq = 1;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 355d3dffd021..76ca88f63b70 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -582,9 +582,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tp->rx_opt.rcv_tsecr &&
-	    (TCP_SKB_CB(skb)->end_seq -
-	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) {
+	if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
+		return;
+	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
+
+	if (TCP_SKB_CB(skb)->end_seq -
+	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
 		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
 		u32 delta_us;
 
@@ -5475,6 +5478,11 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 				tcp_ack(sk, skb, 0);
 				__kfree_skb(skb);
 				tcp_data_snd_check(sk);
+				/* When receiving pure ack in fast path, update
+				 * last ts ecr directly instead of calling
+				 * tcp_rcv_rtt_measure_ts()
+				 */
+				tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
 				return;
 			} else { /* Header too small */
 				TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
-- 
cgit v1.2.3


From 63a67a926e214dac94e29147c0f3d11499f655a1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 23 Jun 2018 17:16:44 -0400
Subject: kill dentry_update_name_case()

the last user is gone

Spotted-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 27 ---------------------------
 include/linux/dcache.h |  2 --
 2 files changed, 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 0e8e5de3c48a..d9323af88ed0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2676,33 +2676,6 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
 }
 EXPORT_SYMBOL(d_exact_alias);
 
-/**
- * dentry_update_name_case - update case insensitive dentry with a new name
- * @dentry: dentry to be updated
- * @name: new name
- *
- * Update a case insensitive dentry with new case of name.
- *
- * dentry must have been returned by d_lookup with name @name. Old and new
- * name lengths must match (ie. no d_compare which allows mismatched name
- * lengths).
- *
- * Parent inode i_mutex must be held over d_lookup and into this call (to
- * keep renames and concurrent inserts, and readdir(2) away).
- */
-void dentry_update_name_case(struct dentry *dentry, const struct qstr *name)
-{
-	BUG_ON(!inode_is_locked(dentry->d_parent->d_inode));
-	BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
-
-	spin_lock(&dentry->d_lock);
-	write_seqcount_begin(&dentry->d_seq);
-	memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
-	write_seqcount_end(&dentry->d_seq);
-	spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL(dentry_update_name_case);
-
 static void swap_names(struct dentry *dentry, struct dentry *target)
 {
 	if (unlikely(dname_external(target))) {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 66c6e17e61e5..cee70bf207fc 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -271,8 +271,6 @@ extern void d_rehash(struct dentry *);
  
 extern void d_add(struct dentry *, struct inode *);
 
-extern void dentry_update_name_case(struct dentry *, const struct qstr *);
-
 /* used for rename() and baskets */
 extern void d_move(struct dentry *, struct dentry *);
 extern void d_exchange(struct dentry *, struct dentry *);
-- 
cgit v1.2.3


From d0dd63a8aee1ef89f2e48e554b796b9f9e4fcadb Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 16 Jun 2018 22:11:42 -0700
Subject: time: Introduce struct __kernel_itimerspec

struct itimerspec is not y2038-safe.

Introduce a new struct __kernel_itimerspec based on the kernel internal
y2038-safe struct itimerspec64.

The definition of struct __kernel_itimerspec includes two struct
__kernel_timespec.

Since struct __kernel_timespec has the same representation in native and
compat modes, so does struct __kernel_itimerspec. This helps have a common
entry point for syscalls using struct __kernel_itimerspec.

New y2038-safe syscalls will use this new type. Since most of the new
syscalls are just an update to the native syscalls with the type update,
place the new definition under CONFIG_64BIT_TIME. This helps architectures
that do not support the above config to keep using the old definition of
struct itimerspec.

Also change the get/put_itimerspec64 to use struct__kernel_itimerspec.
This will help 32 bit architectures to use the new syscalls when
architectures select CONFIG_64BIT_TIME.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: arnd@arndb.de
Cc: viro@zeniv.linux.org.uk
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Cc: y2038@lists.linaro.org
Link: https://lkml.kernel.org/r/20180617051144.29756-2-deepa.kernel@gmail.com
---
 include/linux/time.h      | 4 ++--
 include/linux/time64.h    | 1 +
 include/uapi/linux/time.h | 7 +++++++
 kernel/time/time.c        | 4 ++--
 4 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index aed74463592d..27d83fd2ae61 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -14,9 +14,9 @@ int get_timespec64(struct timespec64 *ts,
 int put_timespec64(const struct timespec64 *ts,
 		struct __kernel_timespec __user *uts);
 int get_itimerspec64(struct itimerspec64 *it,
-			const struct itimerspec __user *uit);
+			const struct __kernel_itimerspec __user *uit);
 int put_itimerspec64(const struct itimerspec64 *it,
-			struct itimerspec __user *uit);
+			struct __kernel_itimerspec __user *uit);
 
 extern time64_t mktime64(const unsigned int year, const unsigned int mon,
 			const unsigned int day, const unsigned int hour,
diff --git a/include/linux/time64.h b/include/linux/time64.h
index 0a7b2f79cec7..05634afba0db 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -12,6 +12,7 @@ typedef __u64 timeu64_t;
  */
 #ifndef CONFIG_64BIT_TIME
 #define __kernel_timespec timespec
+#define __kernel_itimerspec itimerspec
 #endif
 
 #include <uapi/linux/time.h>
diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index fcf936656493..6b56a2208be7 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -49,6 +49,13 @@ struct __kernel_timespec {
 };
 #endif
 
+#ifndef __kernel_itimerspec
+struct __kernel_itimerspec {
+	struct __kernel_timespec it_interval;    /* timer period */
+	struct __kernel_timespec it_value;       /* timer expiration */
+};
+#endif
+
 /*
  * legacy timeval structure, only embedded in structures that
  * traditionally used 'timeval' to pass time intervals (not absolute
diff --git a/kernel/time/time.c b/kernel/time/time.c
index b1225db61eb2..c0195225fdce 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -927,7 +927,7 @@ int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
 EXPORT_SYMBOL_GPL(compat_put_timespec64);
 
 int get_itimerspec64(struct itimerspec64 *it,
-			const struct itimerspec __user *uit)
+			const struct __kernel_itimerspec __user *uit)
 {
 	int ret;
 
@@ -942,7 +942,7 @@ int get_itimerspec64(struct itimerspec64 *it,
 EXPORT_SYMBOL_GPL(get_itimerspec64);
 
 int put_itimerspec64(const struct itimerspec64 *it,
-			struct itimerspec __user *uit)
+			struct __kernel_itimerspec __user *uit)
 {
 	int ret;
 
-- 
cgit v1.2.3


From afef05cf238cfcecdc5ea9b06f31027b13ce6214 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 16 Jun 2018 22:11:43 -0700
Subject: time: Enable get/put_compat_itimerspec64 always

This will aid in enabling the compat syscalls on 32-bit architectures later
on.

Also move compat_itimerspec and related defines to compat_time.h.  The
compat_time.h file will eventually be deleted.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: arnd@arndb.de
Cc: viro@zeniv.linux.org.uk
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Cc: y2038@lists.linaro.org
Link: https://lkml.kernel.org/r/20180617051144.29756-3-deepa.kernel@gmail.com
---
 include/linux/compat.h      |  9 ---------
 include/linux/compat_time.h |  9 +++++++++
 kernel/compat.c             | 29 -----------------------------
 kernel/time/time.c          | 21 +++++++++++++++++++++
 4 files changed, 30 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index b1a5562b3215..18a13f2df14b 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -109,11 +109,6 @@ typedef	compat_ulong_t		compat_aio_context_t;
 struct compat_sel_arg_struct;
 struct rusage;
 
-struct compat_itimerspec {
-	struct compat_timespec it_interval;
-	struct compat_timespec it_value;
-};
-
 struct compat_utimbuf {
 	compat_time_t		actime;
 	compat_time_t		modtime;
@@ -294,10 +289,6 @@ extern int compat_get_timespec(struct timespec *, const void __user *);
 extern int compat_put_timespec(const struct timespec *, void __user *);
 extern int compat_get_timeval(struct timeval *, const void __user *);
 extern int compat_put_timeval(const struct timeval *, void __user *);
-extern int get_compat_itimerspec64(struct itimerspec64 *its,
-			const struct compat_itimerspec __user *uits);
-extern int put_compat_itimerspec64(const struct itimerspec64 *its,
-			struct compat_itimerspec __user *uits);
 
 struct compat_iovec {
 	compat_uptr_t	iov_base;
diff --git a/include/linux/compat_time.h b/include/linux/compat_time.h
index 31f2774f1994..e70bfd1d2c3f 100644
--- a/include/linux/compat_time.h
+++ b/include/linux/compat_time.h
@@ -17,7 +17,16 @@ struct compat_timeval {
 	s32		tv_usec;
 };
 
+struct compat_itimerspec {
+	struct compat_timespec it_interval;
+	struct compat_timespec it_value;
+};
+
 extern int compat_get_timespec64(struct timespec64 *, const void __user *);
 extern int compat_put_timespec64(const struct timespec64 *, void __user *);
+extern int get_compat_itimerspec64(struct itimerspec64 *its,
+			const struct compat_itimerspec __user *uits);
+extern int put_compat_itimerspec64(const struct itimerspec64 *its,
+			struct compat_itimerspec __user *uits);
 
 #endif /* _LINUX_COMPAT_TIME_H */
diff --git a/kernel/compat.c b/kernel/compat.c
index 702aa846ddac..8e40efc2928a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -324,35 +324,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t,  pid, unsigned int, len,
 	return ret;
 }
 
-/* Todo: Delete these extern declarations when get/put_compat_itimerspec64()
- * are moved to kernel/time/time.c .
- */
-extern int __compat_get_timespec64(struct timespec64 *ts64,
-				   const struct compat_timespec __user *cts);
-extern int __compat_put_timespec64(const struct timespec64 *ts64,
-				   struct compat_timespec __user *cts);
-
-int get_compat_itimerspec64(struct itimerspec64 *its,
-			const struct compat_itimerspec __user *uits)
-{
-
-	if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) ||
-	    __compat_get_timespec64(&its->it_value, &uits->it_value))
-		return -EFAULT;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(get_compat_itimerspec64);
-
-int put_compat_itimerspec64(const struct itimerspec64 *its,
-			struct compat_itimerspec __user *uits)
-{
-	if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) ||
-	    __compat_put_timespec64(&its->it_value, &uits->it_value))
-		return -EFAULT;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(put_compat_itimerspec64);
-
 /*
  * We currently only need the following fields from the sigevent
  * structure: sigev_value, sigev_signo, sig_notify and (sometimes
diff --git a/kernel/time/time.c b/kernel/time/time.c
index c0195225fdce..72caf051901c 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -955,3 +955,24 @@ int put_itimerspec64(const struct itimerspec64 *it,
 	return ret;
 }
 EXPORT_SYMBOL_GPL(put_itimerspec64);
+
+int get_compat_itimerspec64(struct itimerspec64 *its,
+			const struct compat_itimerspec __user *uits)
+{
+
+	if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) ||
+	    __compat_get_timespec64(&its->it_value, &uits->it_value))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(get_compat_itimerspec64);
+
+int put_compat_itimerspec64(const struct itimerspec64 *its,
+			struct compat_itimerspec __user *uits)
+{
+	if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) ||
+	    __compat_put_timespec64(&its->it_value, &uits->it_value))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(put_compat_itimerspec64);
-- 
cgit v1.2.3


From 6ff84735070276d72af716e21c3214ee20d60e70 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 16 Jun 2018 22:11:44 -0700
Subject: time: Change types to new y2038 safe __kernel_itimerspec

timer_set/gettime and timerfd_set/get apis use struct itimerspec at the
user interface layer.  struct itimerspec is not y2038-safe.  Change these
interfaces to use y2038-safe struct __kernel_itimerspec instead.  This will
help define new syscalls when 32bit architectures select CONFIG_64BIT_TIME.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: arnd@arndb.de
Cc: viro@zeniv.linux.org.uk
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Cc: y2038@lists.linaro.org
Link: https://lkml.kernel.org/r/20180617051144.29756-4-deepa.kernel@gmail.com
---
 fs/timerfd.c               |  8 ++++----
 include/linux/syscalls.h   | 10 +++++-----
 kernel/time/posix-timers.c | 12 +++++++-----
 3 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/timerfd.c b/fs/timerfd.c
index d84a2bee4f82..8bb926253f88 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -533,8 +533,8 @@ static int do_timerfd_gettime(int ufd, struct itimerspec64 *t)
 }
 
 SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
-		const struct itimerspec __user *, utmr,
-		struct itimerspec __user *, otmr)
+		const struct __kernel_itimerspec __user *, utmr,
+		struct __kernel_itimerspec __user *, otmr)
 {
 	struct itimerspec64 new, old;
 	int ret;
@@ -550,7 +550,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	return ret;
 }
 
-SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
+SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *, otmr)
 {
 	struct itimerspec64 kotmr;
 	int ret = do_timerfd_gettime(ufd, &kotmr);
@@ -559,7 +559,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
 	return put_itimerspec64(&kotmr, otmr) ? -EFAULT : 0;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
 COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 		const struct compat_itimerspec __user *, utmr,
 		struct compat_itimerspec __user *, otmr)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 73810808cdf2..38b9ec152024 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -501,9 +501,9 @@ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
 /* fs/timerfd.c */
 asmlinkage long sys_timerfd_create(int clockid, int flags);
 asmlinkage long sys_timerfd_settime(int ufd, int flags,
-				    const struct itimerspec __user *utmr,
-				    struct itimerspec __user *otmr);
-asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
+				    const struct __kernel_itimerspec __user *utmr,
+				    struct __kernel_itimerspec __user *otmr);
+asmlinkage long sys_timerfd_gettime(int ufd, struct __kernel_itimerspec __user *otmr);
 
 /* fs/utimes.c */
 asmlinkage long sys_utimensat(int dfd, const char __user *filename,
@@ -568,10 +568,10 @@ asmlinkage long sys_timer_create(clockid_t which_clock,
 				 struct sigevent __user *timer_event_spec,
 				 timer_t __user * created_timer_id);
 asmlinkage long sys_timer_gettime(timer_t timer_id,
-				struct itimerspec __user *setting);
+				struct __kernel_itimerspec __user *setting);
 asmlinkage long sys_timer_getoverrun(timer_t timer_id);
 asmlinkage long sys_timer_settime(timer_t timer_id, int flags,
-				const struct itimerspec __user *new_setting,
+				const struct __kernel_itimerspec __user *new_setting,
 				struct itimerspec __user *old_setting);
 asmlinkage long sys_timer_delete(timer_t timer_id);
 asmlinkage long sys_clock_settime(clockid_t which_clock,
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index fcf90a10c43a..80d59333c76e 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -743,7 +743,7 @@ static int do_timer_gettime(timer_t timer_id,  struct itimerspec64 *setting)
 
 /* Get the time remaining on a POSIX.1b interval timer. */
 SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
-		struct itimerspec __user *, setting)
+		struct __kernel_itimerspec __user *, setting)
 {
 	struct itimerspec64 cur_setting;
 
@@ -755,7 +755,8 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 	return ret;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
+
 COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 		       struct compat_itimerspec __user *, setting)
 {
@@ -768,6 +769,7 @@ COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 	}
 	return ret;
 }
+
 #endif
 
 /*
@@ -906,8 +908,8 @@ retry:
 
 /* Set a POSIX.1b interval timer */
 SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
-		const struct itimerspec __user *, new_setting,
-		struct itimerspec __user *, old_setting)
+		const struct __kernel_itimerspec __user *, new_setting,
+		struct __kernel_itimerspec __user *, old_setting)
 {
 	struct itimerspec64 new_spec, old_spec;
 	struct itimerspec64 *rtn = old_setting ? &old_spec : NULL;
@@ -927,7 +929,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
 	return error;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
 COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
 		       struct compat_itimerspec __user *, new,
 		       struct compat_itimerspec __user *, old)
-- 
cgit v1.2.3


From faef87723ace12e5f685437c1e68cbecb69a7a89 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 15 Jun 2018 13:08:51 +0200
Subject: dma-noncoherent: add a arch_sync_dma_for_cpu_all hook

The MIPS bmips platform needs a global flush when transferring ownership
back to the CPU.  Add a hook for that to the dma-noncoherent
implementation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Patchwork: https://patchwork.linux-mips.org/patch/19549/
Signed-off-by: Paul Burton <paul.burton@mips.com>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: David Daney <david.daney@cavium.com>
Cc: Kevin Cernekee <cernekee@gmail.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: Tom Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Huacai Chen <chenhc@lemote.com>
Cc: iommu@lists.linux-foundation.org
Cc: linux-mips@linux-mips.org
---
 include/linux/dma-noncoherent.h | 8 ++++++++
 kernel/dma/noncoherent.c        | 8 ++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index 10b2654d549b..a0aa00cc909d 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -44,4 +44,12 @@ static inline void arch_sync_dma_for_cpu(struct device *dev,
 }
 #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
 
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
+void arch_sync_dma_for_cpu_all(struct device *dev);
+#else
+static inline void arch_sync_dma_for_cpu_all(struct device *dev)
+{
+}
+#endif /* CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL */
+
 #endif /* _LINUX_DMA_NONCOHERENT_H */
diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c
index 79e9a757387f..031fe235d958 100644
--- a/kernel/dma/noncoherent.c
+++ b/kernel/dma/noncoherent.c
@@ -49,11 +49,13 @@ static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl,
 	return nents;
 }
 
-#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
 static void dma_noncoherent_sync_single_for_cpu(struct device *dev,
 		dma_addr_t addr, size_t size, enum dma_data_direction dir)
 {
 	arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir);
+	arch_sync_dma_for_cpu_all(dev);
 }
 
 static void dma_noncoherent_sync_sg_for_cpu(struct device *dev,
@@ -64,6 +66,7 @@ static void dma_noncoherent_sync_sg_for_cpu(struct device *dev,
 
 	for_each_sg(sgl, sg, nents, i)
 		arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
+	arch_sync_dma_for_cpu_all(dev);
 }
 
 static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr,
@@ -89,7 +92,8 @@ const struct dma_map_ops dma_noncoherent_ops = {
 	.sync_sg_for_device	= dma_noncoherent_sync_sg_for_device,
 	.map_page		= dma_noncoherent_map_page,
 	.map_sg			= dma_noncoherent_map_sg,
-#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
 	.sync_single_for_cpu	= dma_noncoherent_sync_single_for_cpu,
 	.sync_sg_for_cpu	= dma_noncoherent_sync_sg_for_cpu,
 	.unmap_page		= dma_noncoherent_unmap_page,
-- 
cgit v1.2.3


From fd55a281adf6c9b9723d369203fdd92e5ea62a09 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 27 Apr 2018 10:06:03 -0700
Subject: srcu: Introduce srcu_read_{un,}lock_notrace()

Joel Fernandes is using SRCU to protect from-idle tracepoints, which
requires notrace variants of srcu_read_lock() and srcu_read_unlock()
in order to avoid problems with tracepoints in lockdep.  This commit
therefore adds srcu_read_lock_notrace() and srcu_read_unlock_notrace().

[1] http://lkml.kernel.org/r/20180427042656.190746-1-joelaf@google.com

Reported-by: Joel Fernandes <joelaf@google.com>
Intermittently-reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/srcu.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 91494d7e8e41..3e72a291c401 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -195,6 +195,16 @@ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
 	return retval;
 }
 
+/* Used by tracing, cannot be traced and cannot invoke lockdep. */
+static inline notrace int
+srcu_read_lock_notrace(struct srcu_struct *sp) __acquires(sp)
+{
+	int retval;
+
+	retval = __srcu_read_lock(sp);
+	return retval;
+}
+
 /**
  * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
  * @sp: srcu_struct in which to unregister the old reader.
@@ -209,6 +219,13 @@ static inline void srcu_read_unlock(struct srcu_struct *sp, int idx)
 	__srcu_read_unlock(sp, idx);
 }
 
+/* Used by tracing, cannot be traced and cannot call lockdep. */
+static inline notrace void
+srcu_read_unlock_notrace(struct srcu_struct *sp, int idx) __releases(sp)
+{
+	__srcu_read_unlock(sp, idx);
+}
+
 /**
  * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock
  *
-- 
cgit v1.2.3


From 90127d605f403d814f4986436871210bf8ceb335 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 9 May 2018 10:29:18 -0700
Subject: torture: Make online/offline messages appear only for verbose=2

Some bugs reproduce quickly only at high CPU-hotplug rates, so the
rcutorture TREE03 scenario now has only 200 milliseconds spacing between
CPU-hotplug operations.  At this rate, the torture-test pair of console
messages per operation becomes a bit voluminous.  This commit therefore
converts the torture-test set of "verbose" kernel-boot arguments from
bool to int, and prints the extra console messages only when verbose=2.
The default is still verbose=1.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/torture.h      |  2 +-
 kernel/locking/locktorture.c |  2 +-
 kernel/rcu/rcuperf.c         |  2 +-
 kernel/rcu/rcutorture.c      |  2 +-
 kernel/torture.c             | 12 ++++++------
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index 66272862070b..a55e80817dae 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -79,7 +79,7 @@ void stutter_wait(const char *title);
 int torture_stutter_init(int s);
 
 /* Initialization and cleanup. */
-bool torture_init_begin(char *ttype, bool v);
+bool torture_init_begin(char *ttype, int v);
 void torture_init_end(void);
 bool torture_cleanup_begin(void);
 void torture_cleanup_end(void);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 8402b3349dca..4a2e13870a9b 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -57,7 +57,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
 torture_param(int, stat_interval, 60,
 	     "Number of seconds between stats printk()s");
 torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
-torture_param(bool, verbose, true,
+torture_param(int, verbose, 1,
 	     "Enable verbose debugging printk()s");
 
 static char *torture_type = "spin_lock";
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index e232846516b3..fb8094848906 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -88,7 +88,7 @@ torture_param(int, nreaders, -1, "Number of RCU reader threads");
 torture_param(int, nwriters, -1, "Number of RCU updater threads");
 torture_param(bool, shutdown, !IS_ENABLED(MODULE),
 	      "Shutdown at end of performance tests.");
-torture_param(bool, verbose, true, "Enable verbose debugging printk()s");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
 torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
 
 static char *perf_type = "rcu";
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 42fcb7f05fac..a5540bd831c4 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -101,7 +101,7 @@ torture_param(int, test_boost_interval, 7,
 	     "Interval between boost tests, seconds.");
 torture_param(bool, test_no_idle_hz, true,
 	     "Test support for tickless idle CPUs");
-torture_param(bool, verbose, true,
+torture_param(int, verbose, 1,
 	     "Enable verbose debugging printk()s");
 
 static char *torture_type = "rcu";
diff --git a/kernel/torture.c b/kernel/torture.c
index 3de1efbecd6a..840fd33c1cda 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -53,7 +53,7 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
 
 static char *torture_type;
-static bool verbose;
+static int verbose;
 
 /* Mediate rmmod and system shutdown.  Concurrent rmmod & shutdown illegal! */
 #define FULLSTOP_DONTSTOP 0	/* Normal operation. */
@@ -98,7 +98,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
 	if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
 		return false;
 
-	if (verbose)
+	if (verbose > 1)
 		pr_alert("%s" TORTURE_FLAG
 			 "torture_onoff task: offlining %d\n",
 			 torture_type, cpu);
@@ -111,7 +111,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
 				 "torture_onoff task: offline %d failed: errno %d\n",
 				 torture_type, cpu, ret);
 	} else {
-		if (verbose)
+		if (verbose > 1)
 			pr_alert("%s" TORTURE_FLAG
 				 "torture_onoff task: offlined %d\n",
 				 torture_type, cpu);
@@ -147,7 +147,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
 	if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
 		return false;
 
-	if (verbose)
+	if (verbose > 1)
 		pr_alert("%s" TORTURE_FLAG
 			 "torture_onoff task: onlining %d\n",
 			 torture_type, cpu);
@@ -160,7 +160,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
 				 "torture_onoff task: online %d failed: errno %d\n",
 				 torture_type, cpu, ret);
 	} else {
-		if (verbose)
+		if (verbose > 1)
 			pr_alert("%s" TORTURE_FLAG
 				 "torture_onoff task: onlined %d\n",
 				 torture_type, cpu);
@@ -647,7 +647,7 @@ static void torture_stutter_cleanup(void)
  * The runnable parameter points to a flag that controls whether or not
  * the test is currently runnable.  If there is no such flag, pass in NULL.
  */
-bool torture_init_begin(char *ttype, bool v)
+bool torture_init_begin(char *ttype, int v)
 {
 	mutex_lock(&fullstop_mutex);
 	if (torture_type != NULL) {
-- 
cgit v1.2.3


From d4546c2509b1e9cd082e3682dcec98472e37ee5a Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Sun, 24 Jun 2018 14:13:49 +0900
Subject: net: Convert GRO SKB handling to list_head.

Manage pending per-NAPI GRO packets via list_head.

Return an SKB pointer from the GRO receive handlers.  When GRO receive
handlers return non-NULL, it means that this SKB needs to be completed
at this time and removed from the NAPI queue.

Several operations are greatly simplified by this transformation,
especially timing out the oldest SKB in the list when gro_count
exceeds MAX_GRO_SKBS, and napi_gro_flush() which walks the queue
in reverse order.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c        | 11 ++++----
 drivers/net/vxlan.c         | 11 ++++----
 include/linux/etherdevice.h |  3 +-
 include/linux/netdevice.h   | 32 ++++++++++-----------
 include/linux/skbuff.h      |  3 +-
 include/linux/udp.h         |  4 +--
 include/net/inet_common.h   |  2 +-
 include/net/tcp.h           |  2 +-
 include/net/udp.h           |  4 +--
 include/net/udp_tunnel.h    |  6 ++--
 net/8021q/vlan.c            | 13 +++++----
 net/core/dev.c              | 68 +++++++++++++++++++--------------------------
 net/core/skbuff.c           |  4 +--
 net/ethernet/eth.c          | 12 ++++----
 net/ipv4/af_inet.c          | 12 ++++----
 net/ipv4/esp4_offload.c     |  4 +--
 net/ipv4/fou.c              | 20 ++++++-------
 net/ipv4/gre_offload.c      |  8 +++---
 net/ipv4/tcp_offload.c      | 14 +++++-----
 net/ipv4/udp_offload.c      | 13 +++++----
 net/ipv6/esp6_offload.c     |  4 +--
 net/ipv6/ip6_offload.c      | 16 +++++------
 net/ipv6/tcpv6_offload.c    |  4 +--
 net/ipv6/udp_offload.c      |  4 +--
 24 files changed, 133 insertions(+), 141 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 750eaa53bf0c..3e94375b9b01 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -418,11 +418,12 @@ static int geneve_hlen(struct genevehdr *gh)
 	return sizeof(*gh) + gh->opt_len * 4;
 }
 
-static struct sk_buff **geneve_gro_receive(struct sock *sk,
-					   struct sk_buff **head,
-					   struct sk_buff *skb)
+static struct sk_buff *geneve_gro_receive(struct sock *sk,
+					  struct list_head *head,
+					  struct sk_buff *skb)
 {
-	struct sk_buff *p, **pp = NULL;
+	struct sk_buff *pp = NULL;
+	struct sk_buff *p;
 	struct genevehdr *gh, *gh2;
 	unsigned int hlen, gh_len, off_gnv;
 	const struct packet_offload *ptype;
@@ -449,7 +450,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk,
 			goto out;
 	}
 
-	for (p = *head; p; p = p->next) {
+	list_for_each_entry(p, head, list) {
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
 
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index aee0e60471f1..cc14e0cd5647 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -568,11 +568,12 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 	return vh;
 }
 
-static struct sk_buff **vxlan_gro_receive(struct sock *sk,
-					  struct sk_buff **head,
-					  struct sk_buff *skb)
+static struct sk_buff *vxlan_gro_receive(struct sock *sk,
+					 struct list_head *head,
+					 struct sk_buff *skb)
 {
-	struct sk_buff *p, **pp = NULL;
+	struct sk_buff *pp = NULL;
+	struct sk_buff *p;
 	struct vxlanhdr *vh, *vh2;
 	unsigned int hlen, off_vx;
 	int flush = 1;
@@ -607,7 +608,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk,
 
 	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
 
-	for (p = *head; p; p = p->next) {
+	list_for_each_entry(p, head, list) {
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
 
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 79563840c295..572e11bb8696 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -59,8 +59,7 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
 					   unsigned int rxqs);
 #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1)
 
-struct sk_buff **eth_gro_receive(struct sk_buff **head,
-				 struct sk_buff *skb);
+struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb);
 int eth_gro_complete(struct sk_buff *skb, int nhoff);
 
 /* Reserved Ethernet Addresses per IEEE 802.1Q */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec9850c7936..f176d9873910 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -322,7 +322,7 @@ struct napi_struct {
 	int			poll_owner;
 #endif
 	struct net_device	*dev;
-	struct sk_buff		*gro_list;
+	struct list_head	gro_list;
 	struct sk_buff		*skb;
 	struct hrtimer		timer;
 	struct list_head	dev_list;
@@ -2255,10 +2255,10 @@ static inline int gro_recursion_inc_test(struct sk_buff *skb)
 	return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT;
 }
 
-typedef struct sk_buff **(*gro_receive_t)(struct sk_buff **, struct sk_buff *);
-static inline struct sk_buff **call_gro_receive(gro_receive_t cb,
-						struct sk_buff **head,
-						struct sk_buff *skb)
+typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *);
+static inline struct sk_buff *call_gro_receive(gro_receive_t cb,
+					       struct list_head *head,
+					       struct sk_buff *skb)
 {
 	if (unlikely(gro_recursion_inc_test(skb))) {
 		NAPI_GRO_CB(skb)->flush |= 1;
@@ -2268,12 +2268,12 @@ static inline struct sk_buff **call_gro_receive(gro_receive_t cb,
 	return cb(head, skb);
 }
 
-typedef struct sk_buff **(*gro_receive_sk_t)(struct sock *, struct sk_buff **,
-					     struct sk_buff *);
-static inline struct sk_buff **call_gro_receive_sk(gro_receive_sk_t cb,
-						   struct sock *sk,
-						   struct sk_buff **head,
-						   struct sk_buff *skb)
+typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *,
+					    struct sk_buff *);
+static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb,
+						  struct sock *sk,
+						  struct list_head *head,
+						  struct sk_buff *skb)
 {
 	if (unlikely(gro_recursion_inc_test(skb))) {
 		NAPI_GRO_CB(skb)->flush |= 1;
@@ -2299,8 +2299,8 @@ struct packet_type {
 struct offload_callbacks {
 	struct sk_buff		*(*gso_segment)(struct sk_buff *skb,
 						netdev_features_t features);
-	struct sk_buff		**(*gro_receive)(struct sk_buff **head,
-						 struct sk_buff *skb);
+	struct sk_buff		*(*gro_receive)(struct list_head *head,
+						struct sk_buff *skb);
 	int			(*gro_complete)(struct sk_buff *skb, int nhoff);
 };
 
@@ -2568,7 +2568,7 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
 struct net_device *dev_get_by_napi_id(unsigned int napi_id);
 int netdev_get_name(struct net *net, char *name, int ifindex);
 int dev_restart(struct net_device *dev);
-int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb);
+int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
 
 static inline unsigned int skb_gro_offset(const struct sk_buff *skb)
 {
@@ -2784,13 +2784,13 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
 }
 
 #ifdef CONFIG_XFRM_OFFLOAD
-static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush)
+static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
 {
 	if (PTR_ERR(pp) != -EINPROGRESS)
 		NAPI_GRO_CB(skb)->flush |= flush;
 }
 #else
-static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush)
+static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
 {
 	NAPI_GRO_CB(skb)->flush |= flush;
 }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c86885954994..7ccc601b55d9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -677,7 +677,8 @@ struct sk_buff {
 				int			ip_defrag_offset;
 			};
 		};
-		struct rb_node	rbnode; /* used in netem & tcp stack */
+		struct rb_node		rbnode; /* used in netem & tcp stack */
+		struct list_head	list;
 	};
 	struct sock		*sk;
 
diff --git a/include/linux/udp.h b/include/linux/udp.h
index ca840345571b..320d49d85484 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -74,8 +74,8 @@ struct udp_sock {
 	void (*encap_destroy)(struct sock *sk);
 
 	/* GRO functions for UDP socket */
-	struct sk_buff **	(*gro_receive)(struct sock *sk,
-					       struct sk_buff **head,
+	struct sk_buff *	(*gro_receive)(struct sock *sk,
+					       struct list_head *head,
 					       struct sk_buff *skb);
 	int			(*gro_complete)(struct sock *sk,
 						struct sk_buff *skb,
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 384b90c62c0b..3ca969cbd161 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -43,7 +43,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
 int inet_recv_error(struct sock *sk, struct msghdr *msg, int len,
 		    int *addr_len);
 
-struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb);
+struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb);
 int inet_gro_complete(struct sk_buff *skb, int nhoff);
 struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 				 netdev_features_t features);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 822ee49ed0f9..402a88b0e8a8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1788,7 +1788,7 @@ void tcp_v4_destroy_sock(struct sock *sk);
 
 struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 				netdev_features_t features);
-struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb);
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
 int tcp_gro_complete(struct sk_buff *skb);
 
 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);
diff --git a/include/net/udp.h b/include/net/udp.h
index b1ea8b0f5e6a..5723c6128ae4 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -170,8 +170,8 @@ static inline void udp_csum_pull_header(struct sk_buff *skb)
 typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport,
 				     __be16 dport);
 
-struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
-				 struct udphdr *uh, udp_lookup_t lookup);
+struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
+				struct udphdr *uh, udp_lookup_t lookup);
 int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index b95a6927c718..fe680ab6b15a 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -65,9 +65,9 @@ static inline int udp_sock_create(struct net *net,
 
 typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb);
 typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk);
-typedef struct sk_buff **(*udp_tunnel_gro_receive_t)(struct sock *sk,
-						     struct sk_buff **head,
-						     struct sk_buff *skb);
+typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk,
+						    struct list_head *head,
+						    struct sk_buff *skb);
 typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb,
 					 int nhoff);
 
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 73a65789271b..99141986efa0 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -647,13 +647,14 @@ out:
 	return err;
 }
 
-static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
-					 struct sk_buff *skb)
+static struct sk_buff *vlan_gro_receive(struct list_head *head,
+					struct sk_buff *skb)
 {
-	struct sk_buff *p, **pp = NULL;
-	struct vlan_hdr *vhdr;
-	unsigned int hlen, off_vlan;
 	const struct packet_offload *ptype;
+	unsigned int hlen, off_vlan;
+	struct sk_buff *pp = NULL;
+	struct vlan_hdr *vhdr;
+	struct sk_buff *p;
 	__be16 type;
 	int flush = 1;
 
@@ -675,7 +676,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
 
 	flush = 0;
 
-	for (p = *head; p; p = p->next) {
+	list_for_each_entry(p, head, list) {
 		struct vlan_hdr *vhdr2;
 
 		if (!NAPI_GRO_CB(p)->same_flow)
diff --git a/net/core/dev.c b/net/core/dev.c
index a5aa1c7444e6..aa61b9344b46 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4881,36 +4881,25 @@ out:
  */
 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 {
-	struct sk_buff *skb, *prev = NULL;
-
-	/* scan list and build reverse chain */
-	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
-		skb->prev = prev;
-		prev = skb;
-	}
-
-	for (skb = prev; skb; skb = prev) {
-		skb->next = NULL;
+	struct sk_buff *skb, *p;
 
+	list_for_each_entry_safe_reverse(skb, p, &napi->gro_list, list) {
 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 			return;
-
-		prev = skb->prev;
+		list_del_init(&skb->list);
 		napi_gro_complete(skb);
 		napi->gro_count--;
 	}
-
-	napi->gro_list = NULL;
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 {
-	struct sk_buff *p;
 	unsigned int maclen = skb->dev->hard_header_len;
 	u32 hash = skb_get_hash_raw(skb);
+	struct sk_buff *p;
 
-	for (p = napi->gro_list; p; p = p->next) {
+	list_for_each_entry(p, &napi->gro_list, list) {
 		unsigned long diffs;
 
 		NAPI_GRO_CB(p)->flush = 0;
@@ -4977,12 +4966,12 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 
 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
-	struct sk_buff **pp = NULL;
+	struct list_head *head = &offload_base;
 	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
-	struct list_head *head = &offload_base;
-	int same_flow;
+	struct sk_buff *pp = NULL;
 	enum gro_result ret;
+	int same_flow;
 	int grow;
 
 	if (netif_elide_gro(skb->dev))
@@ -5039,11 +5028,8 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 
 	if (pp) {
-		struct sk_buff *nskb = *pp;
-
-		*pp = nskb->next;
-		nskb->next = NULL;
-		napi_gro_complete(nskb);
+		list_del_init(&pp->list);
+		napi_gro_complete(pp);
 		napi->gro_count--;
 	}
 
@@ -5054,15 +5040,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		goto normal;
 
 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
-		struct sk_buff *nskb = napi->gro_list;
+		struct sk_buff *nskb;
 
-		/* locate the end of the list to select the 'oldest' flow */
-		while (nskb->next) {
-			pp = &nskb->next;
-			nskb = *pp;
-		}
-		*pp = NULL;
-		nskb->next = NULL;
+		nskb = list_last_entry(&napi->gro_list, struct sk_buff, list);
+		list_del(&nskb->list);
 		napi_gro_complete(nskb);
 	} else {
 		napi->gro_count++;
@@ -5071,8 +5052,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	NAPI_GRO_CB(skb)->age = jiffies;
 	NAPI_GRO_CB(skb)->last = skb;
 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
-	skb->next = napi->gro_list;
-	napi->gro_list = skb;
+	list_add(&skb->list, &napi->gro_list);
 	ret = GRO_HELD;
 
 pull:
@@ -5478,7 +5458,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 				 NAPIF_STATE_IN_BUSY_POLL)))
 		return false;
 
-	if (n->gro_list) {
+	if (!list_empty(&n->gro_list)) {
 		unsigned long timeout = 0;
 
 		if (work_done)
@@ -5687,7 +5667,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 	 */
-	if (napi->gro_list && !napi_disable_pending(napi) &&
+	if (!list_empty(&napi->gro_list) && !napi_disable_pending(napi) &&
 	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 		__napi_schedule_irqoff(napi);
 
@@ -5701,7 +5681,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
 	napi->gro_count = 0;
-	napi->gro_list = NULL;
+	INIT_LIST_HEAD(&napi->gro_list);
 	napi->skb = NULL;
 	napi->poll = poll;
 	if (weight > NAPI_POLL_WEIGHT)
@@ -5734,6 +5714,14 @@ void napi_disable(struct napi_struct *n)
 }
 EXPORT_SYMBOL(napi_disable);
 
+static void gro_list_free(struct list_head *head)
+{
+	struct sk_buff *skb, *p;
+
+	list_for_each_entry_safe(skb, p, head, list)
+		kfree_skb(skb);
+}
+
 /* Must be called in process context */
 void netif_napi_del(struct napi_struct *napi)
 {
@@ -5743,8 +5731,8 @@ void netif_napi_del(struct napi_struct *napi)
 	list_del_init(&napi->dev_list);
 	napi_free_frags(napi);
 
-	kfree_skb_list(napi->gro_list);
-	napi->gro_list = NULL;
+	gro_list_free(&napi->gro_list);
+	INIT_LIST_HEAD(&napi->gro_list);
 	napi->gro_count = 0;
 }
 EXPORT_SYMBOL(netif_napi_del);
@@ -5787,7 +5775,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 		goto out_unlock;
 	}
 
-	if (n->gro_list) {
+	if (!list_empty(&n->gro_list)) {
 		/* flush too old packets
 		 * If HZ < 1000, flush all packets.
 		 */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c642304f178c..b1f274f22d85 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3815,14 +3815,14 @@ err:
 }
 EXPORT_SYMBOL_GPL(skb_segment);
 
-int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 {
 	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
 	unsigned int offset = skb_gro_offset(skb);
 	unsigned int headlen = skb_headlen(skb);
 	unsigned int len = skb_gro_len(skb);
-	struct sk_buff *lp, *p = *head;
 	unsigned int delta_truesize;
+	struct sk_buff *lp;
 
 	if (unlikely(p->len + len >= 65536))
 		return -E2BIG;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index ee28440f57c5..fd8faa0dfa61 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -427,13 +427,13 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
 }
 EXPORT_SYMBOL(sysfs_format_mac);
 
-struct sk_buff **eth_gro_receive(struct sk_buff **head,
-				 struct sk_buff *skb)
+struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
-	struct sk_buff *p, **pp = NULL;
-	struct ethhdr *eh, *eh2;
-	unsigned int hlen, off_eth;
 	const struct packet_offload *ptype;
+	unsigned int hlen, off_eth;
+	struct sk_buff *pp = NULL;
+	struct ethhdr *eh, *eh2;
+	struct sk_buff *p;
 	__be16 type;
 	int flush = 1;
 
@@ -448,7 +448,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head,
 
 	flush = 0;
 
-	for (p = *head; p; p = p->next) {
+	list_for_each_entry(p, head, list) {
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 15e125558c76..06b218a2870f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1384,12 +1384,12 @@ out:
 }
 EXPORT_SYMBOL(inet_gso_segment);
 
-struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
 	const struct net_offload *ops;
-	struct sk_buff **pp = NULL;
-	struct sk_buff *p;
+	struct sk_buff *pp = NULL;
 	const struct iphdr *iph;
+	struct sk_buff *p;
 	unsigned int hlen;
 	unsigned int off;
 	unsigned int id;
@@ -1425,7 +1425,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
 	id >>= 16;
 
-	for (p = *head; p; p = p->next) {
+	list_for_each_entry(p, head, list) {
 		struct iphdr *iph2;
 		u16 flush_id;
 
@@ -1505,8 +1505,8 @@ out:
 }
 EXPORT_SYMBOL(inet_gro_receive);
 
-static struct sk_buff **ipip_gro_receive(struct sk_buff **head,
-					 struct sk_buff *skb)
+static struct sk_buff *ipip_gro_receive(struct list_head *head,
+					struct sk_buff *skb)
 {
 	if (NAPI_GRO_CB(skb)->encap_mark) {
 		NAPI_GRO_CB(skb)->flush = 1;
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 7cf755ef9efb..bbeecd13e534 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -28,8 +28,8 @@
 #include <linux/spinlock.h>
 #include <net/udp.h>
 
-static struct sk_buff **esp4_gro_receive(struct sk_buff **head,
-					 struct sk_buff *skb)
+static struct sk_buff *esp4_gro_receive(struct list_head *head,
+					struct sk_buff *skb)
 {
 	int offset = skb_gro_offset(skb);
 	struct xfrm_offload *xo;
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 1540db65241a..efdc9e1f741e 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -224,14 +224,14 @@ drop:
 	return 0;
 }
 
-static struct sk_buff **fou_gro_receive(struct sock *sk,
-					struct sk_buff **head,
-					struct sk_buff *skb)
+static struct sk_buff *fou_gro_receive(struct sock *sk,
+				       struct list_head *head,
+				       struct sk_buff *skb)
 {
-	const struct net_offload *ops;
-	struct sk_buff **pp = NULL;
 	u8 proto = fou_from_sock(sk)->protocol;
 	const struct net_offload **offloads;
+	const struct net_offload *ops;
+	struct sk_buff *pp = NULL;
 
 	/* We can clear the encap_mark for FOU as we are essentially doing
 	 * one of two possible things.  We are either adding an L4 tunnel
@@ -305,13 +305,13 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 	return guehdr;
 }
 
-static struct sk_buff **gue_gro_receive(struct sock *sk,
-					struct sk_buff **head,
-					struct sk_buff *skb)
+static struct sk_buff *gue_gro_receive(struct sock *sk,
+				       struct list_head *head,
+				       struct sk_buff *skb)
 {
 	const struct net_offload **offloads;
 	const struct net_offload *ops;
-	struct sk_buff **pp = NULL;
+	struct sk_buff *pp = NULL;
 	struct sk_buff *p;
 	struct guehdr *guehdr;
 	size_t len, optlen, hdrlen, off;
@@ -397,7 +397,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk,
 
 	skb_gro_pull(skb, hdrlen);
 
-	for (p = *head; p; p = p->next) {
+	list_for_each_entry(p, head, list) {
 		const struct guehdr *guehdr2;
 
 		if (!NAPI_GRO_CB(p)->same_flow)
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 1859c473b21a..b9673c21be45 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -108,10 +108,10 @@ out:
 	return segs;
 }
 
-static struct sk_buff **gre_gro_receive(struct sk_buff **head,
-					struct sk_buff *skb)
+static struct sk_buff *gre_gro_receive(struct list_head *head,
+				       struct sk_buff *skb)
 {
-	struct sk_buff **pp = NULL;
+	struct sk_buff *pp = NULL;
 	struct sk_buff *p;
 	const struct gre_base_hdr *greh;
 	unsigned int hlen, grehlen;
@@ -182,7 +182,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
 					     null_compute_pseudo);
 	}
 
-	for (p = *head; p; p = p->next) {
+	list_for_each_entry(p, head, list) {
 		const struct gre_base_hdr *greh2;
 
 		if (!NAPI_GRO_CB(p)->same_flow)
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 8cc7c3487330..f5aee641f825 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -180,9 +180,9 @@ out:
 	return segs;
 }
 
-struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
-	struct sk_buff **pp = NULL;
+	struct sk_buff *pp = NULL;
 	struct sk_buff *p;
 	struct tcphdr *th;
 	struct tcphdr *th2;
@@ -220,7 +220,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	len = skb_gro_len(skb);
 	flags = tcp_flag_word(th);
 
-	for (; (p = *head); head = &p->next) {
+	list_for_each_entry(p, head, list) {
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
 
@@ -233,7 +233,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 
 		goto found;
 	}
-
+	p = NULL;
 	goto out_check_final;
 
 found:
@@ -263,7 +263,7 @@ found:
 	flush |= (len - 1) >= mss;
 	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
 
-	if (flush || skb_gro_receive(head, skb)) {
+	if (flush || skb_gro_receive(p, skb)) {
 		mss = 1;
 		goto out_check_final;
 	}
@@ -277,7 +277,7 @@ out_check_final:
 					TCP_FLAG_FIN));
 
 	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
-		pp = head;
+		pp = p;
 
 out:
 	NAPI_GRO_CB(skb)->flush |= (flush != 0);
@@ -302,7 +302,7 @@ int tcp_gro_complete(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(tcp_gro_complete);
 
-static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
 	/* Don't bother verifying checksum if we're going to flush anyway. */
 	if (!NAPI_GRO_CB(skb)->flush &&
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 92dc9e5a7ff3..ac46c1c55c99 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -343,10 +343,11 @@ out:
 	return segs;
 }
 
-struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
-				 struct udphdr *uh, udp_lookup_t lookup)
+struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
+				struct udphdr *uh, udp_lookup_t lookup)
 {
-	struct sk_buff *p, **pp = NULL;
+	struct sk_buff *pp = NULL;
+	struct sk_buff *p;
 	struct udphdr *uh2;
 	unsigned int off = skb_gro_offset(skb);
 	int flush = 1;
@@ -371,7 +372,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 unflush:
 	flush = 0;
 
-	for (p = *head; p; p = p->next) {
+	list_for_each_entry(p, head, list) {
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
 
@@ -399,8 +400,8 @@ out:
 }
 EXPORT_SYMBOL(udp_gro_receive);
 
-static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
-					 struct sk_buff *skb)
+static struct sk_buff *udp4_gro_receive(struct list_head *head,
+					struct sk_buff *skb)
 {
 	struct udphdr *uh = udp_gro_udphdr(skb);
 
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 27f59b61f70f..ddfa533a84e5 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -49,8 +49,8 @@ static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen)
 	return 0;
 }
 
-static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
-					 struct sk_buff *skb)
+static struct sk_buff *esp6_gro_receive(struct list_head *head,
+					struct sk_buff *skb)
 {
 	int offset = skb_gro_offset(skb);
 	struct xfrm_offload *xo;
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 5b3f2f89ef41..37ff4805b20c 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -163,11 +163,11 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph,
 	return len;
 }
 
-static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
-					 struct sk_buff *skb)
+static struct sk_buff *ipv6_gro_receive(struct list_head *head,
+					struct sk_buff *skb)
 {
 	const struct net_offload *ops;
-	struct sk_buff **pp = NULL;
+	struct sk_buff *pp = NULL;
 	struct sk_buff *p;
 	struct ipv6hdr *iph;
 	unsigned int nlen;
@@ -214,7 +214,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 	flush--;
 	nlen = skb_network_header_len(skb);
 
-	for (p = *head; p; p = p->next) {
+	list_for_each_entry(p, head, list) {
 		const struct ipv6hdr *iph2;
 		__be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */
 
@@ -263,8 +263,8 @@ out:
 	return pp;
 }
 
-static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head,
-					       struct sk_buff *skb)
+static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head,
+					      struct sk_buff *skb)
 {
 	/* Common GRO receive for SIT and IP6IP6 */
 
@@ -278,8 +278,8 @@ static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head,
 	return ipv6_gro_receive(head, skb);
 }
 
-static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head,
-					   struct sk_buff *skb)
+static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
+					  struct sk_buff *skb)
 {
 	/* Common GRO receive for SIT and IP6IP6 */
 
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index 278e49cd67d4..e72947c99454 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -15,8 +15,8 @@
 #include <net/ip6_checksum.h>
 #include "ip6_offload.h"
 
-static struct sk_buff **tcp6_gro_receive(struct sk_buff **head,
-					 struct sk_buff *skb)
+static struct sk_buff *tcp6_gro_receive(struct list_head *head,
+					struct sk_buff *skb)
 {
 	/* Don't bother verifying checksum if we're going to flush anyway. */
 	if (!NAPI_GRO_CB(skb)->flush &&
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 03a2ff3fe1e6..95dee9ca8d22 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -114,8 +114,8 @@ out:
 	return segs;
 }
 
-static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
-					 struct sk_buff *skb)
+static struct sk_buff *udp6_gro_receive(struct list_head *head,
+					struct sk_buff *skb)
 {
 	struct udphdr *uh = udp_gro_udphdr(skb);
 
-- 
cgit v1.2.3


From 07d78363dcffd9cb1bf6f06a6cac0e0847f3c1de Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Sun, 24 Jun 2018 14:14:02 +0900
Subject: net: Convert NAPI gro list into a small hash table.

Improve the performance of GRO receive by splitting flows into
multiple hash chains.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |   3 +-
 net/core/dev.c            | 105 ++++++++++++++++++++++++++++++++++------------
 2 files changed, 81 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f176d9873910..c6b377a15869 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -305,6 +305,7 @@ int __init netdev_boot_setup(char *str);
 /*
  * Structure for NAPI scheduling similar to tasklet but with weighting
  */
+#define GRO_HASH_BUCKETS	8
 struct napi_struct {
 	/* The poll_list must only be managed by the entity which
 	 * changes the state of the NAPI_STATE_SCHED bit.  This means
@@ -322,7 +323,7 @@ struct napi_struct {
 	int			poll_owner;
 #endif
 	struct net_device	*dev;
-	struct list_head	gro_list;
+	struct list_head	gro_hash[GRO_HASH_BUCKETS];
 	struct sk_buff		*skb;
 	struct hrtimer		timer;
 	struct list_head	dev_list;
diff --git a/net/core/dev.c b/net/core/dev.c
index aa61b9344b46..dffed642e686 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4875,15 +4875,12 @@ out:
 	return netif_receive_skb_internal(skb);
 }
 
-/* napi->gro_list contains packets ordered by age.
- * youngest packets at the head of it.
- * Complete skbs in reverse order to reduce latencies.
- */
-void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *head,
+				   bool flush_old)
 {
 	struct sk_buff *skb, *p;
 
-	list_for_each_entry_safe_reverse(skb, p, &napi->gro_list, list) {
+	list_for_each_entry_safe_reverse(skb, p, head, list) {
 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 			return;
 		list_del_init(&skb->list);
@@ -4891,15 +4888,33 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 		napi->gro_count--;
 	}
 }
+
+/* napi->gro_hash contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+	int i;
+
+	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+		struct list_head *head = &napi->gro_hash[i];
+
+		__napi_gro_flush_chain(napi, head, flush_old);
+	}
+}
 EXPORT_SYMBOL(napi_gro_flush);
 
-static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
+static struct list_head *gro_list_prepare(struct napi_struct *napi,
+					  struct sk_buff *skb)
 {
 	unsigned int maclen = skb->dev->hard_header_len;
 	u32 hash = skb_get_hash_raw(skb);
+	struct list_head *head;
 	struct sk_buff *p;
 
-	list_for_each_entry(p, &napi->gro_list, list) {
+	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)];
+	list_for_each_entry(p, head, list) {
 		unsigned long diffs;
 
 		NAPI_GRO_CB(p)->flush = 0;
@@ -4922,6 +4937,8 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 				       maclen);
 		NAPI_GRO_CB(p)->same_flow = !diffs;
 	}
+
+	return head;
 }
 
 static void skb_gro_reset_offset(struct sk_buff *skb)
@@ -4964,11 +4981,45 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 	}
 }
 
+static void gro_flush_oldest(struct napi_struct *napi)
+{
+	struct sk_buff *oldest = NULL;
+	unsigned long age = jiffies;
+	int i;
+
+	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+		struct list_head *head = &napi->gro_hash[i];
+		struct sk_buff *skb;
+
+		if (list_empty(head))
+			continue;
+
+		skb = list_last_entry(head, struct sk_buff, list);
+		if (!oldest || time_before(NAPI_GRO_CB(skb)->age, age)) {
+			oldest = skb;
+			age = NAPI_GRO_CB(skb)->age;
+		}
+	}
+
+	/* We are called with napi->gro_count >= MAX_GRO_SKBS, so this is
+	 * impossible.
+	 */
+	if (WARN_ON_ONCE(!oldest))
+		return;
+
+	/* Do not adjust napi->gro_count, caller is adding a new SKB to
+	 * the chain.
+	 */
+	list_del(&oldest->list);
+	napi_gro_complete(oldest);
+}
+
 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct list_head *head = &offload_base;
 	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
+	struct list_head *gro_head;
 	struct sk_buff *pp = NULL;
 	enum gro_result ret;
 	int same_flow;
@@ -4977,7 +5028,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (netif_elide_gro(skb->dev))
 		goto normal;
 
-	gro_list_prepare(napi, skb);
+	gro_head = gro_list_prepare(napi, skb);
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
@@ -5011,7 +5062,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 			NAPI_GRO_CB(skb)->csum_valid = 0;
 		}
 
-		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
+		pp = ptype->callbacks.gro_receive(gro_head, skb);
 		break;
 	}
 	rcu_read_unlock();
@@ -5040,11 +5091,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		goto normal;
 
 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
-		struct sk_buff *nskb;
-
-		nskb = list_last_entry(&napi->gro_list, struct sk_buff, list);
-		list_del(&nskb->list);
-		napi_gro_complete(nskb);
+		gro_flush_oldest(napi);
 	} else {
 		napi->gro_count++;
 	}
@@ -5052,7 +5099,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	NAPI_GRO_CB(skb)->age = jiffies;
 	NAPI_GRO_CB(skb)->last = skb;
 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
-	list_add(&skb->list, &napi->gro_list);
+	list_add(&skb->list, gro_head);
 	ret = GRO_HELD;
 
 pull:
@@ -5458,7 +5505,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 				 NAPIF_STATE_IN_BUSY_POLL)))
 		return false;
 
-	if (!list_empty(&n->gro_list)) {
+	if (n->gro_count) {
 		unsigned long timeout = 0;
 
 		if (work_done)
@@ -5667,7 +5714,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 	 */
-	if (!list_empty(&napi->gro_list) && !napi_disable_pending(napi) &&
+	if (napi->gro_count && !napi_disable_pending(napi) &&
 	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 		__napi_schedule_irqoff(napi);
 
@@ -5677,11 +5724,14 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
+	int i;
+
 	INIT_LIST_HEAD(&napi->poll_list);
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
 	napi->gro_count = 0;
-	INIT_LIST_HEAD(&napi->gro_list);
+	for (i = 0; i < GRO_HASH_BUCKETS; i++)
+		INIT_LIST_HEAD(&napi->gro_hash[i]);
 	napi->skb = NULL;
 	napi->poll = poll;
 	if (weight > NAPI_POLL_WEIGHT)
@@ -5714,12 +5764,16 @@ void napi_disable(struct napi_struct *n)
 }
 EXPORT_SYMBOL(napi_disable);
 
-static void gro_list_free(struct list_head *head)
+static void flush_gro_hash(struct napi_struct *napi)
 {
-	struct sk_buff *skb, *p;
+	int i;
 
-	list_for_each_entry_safe(skb, p, head, list)
-		kfree_skb(skb);
+	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+		struct sk_buff *skb, *n;
+
+		list_for_each_entry_safe(skb, n, &napi->gro_hash[i], list)
+			kfree_skb(skb);
+	}
 }
 
 /* Must be called in process context */
@@ -5731,8 +5785,7 @@ void netif_napi_del(struct napi_struct *napi)
 	list_del_init(&napi->dev_list);
 	napi_free_frags(napi);
 
-	gro_list_free(&napi->gro_list);
-	INIT_LIST_HEAD(&napi->gro_list);
+	flush_gro_hash(napi);
 	napi->gro_count = 0;
 }
 EXPORT_SYMBOL(netif_napi_del);
@@ -5775,7 +5828,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 		goto out_unlock;
 	}
 
-	if (!list_empty(&n->gro_list)) {
+	if (n->gro_count) {
 		/* flush too old packets
 		 * If HZ < 1000, flush all packets.
 		 */
-- 
cgit v1.2.3


From a8f62d0c6fe533e07cd1acce7588278f9d6e7720 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Mon, 25 Jun 2018 20:37:08 +0800
Subject: ptp: support DPAA FMan 1588 timer in ptp_qoriq

This patch is to support DPAA (Data Path Acceleration Architecture)
1588 timer by adding "fsl,fman-ptp-timer" compatible, sharing
interrupt with FMan, adding FSL_DPAA_ETH dependency, and fixing
up register offset.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Acked-by: Madalin Bucur <madalin.bucur@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/Kconfig           |   2 +-
 drivers/ptp/ptp_qoriq.c       | 104 +++++++++++++++++++++++++++---------------
 include/linux/fsl/ptp_qoriq.h |  38 ++++++++++++---
 3 files changed, 98 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index 474c988d2e95..d137c480db46 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -43,7 +43,7 @@ config PTP_1588_CLOCK_DTE
 
 config PTP_1588_CLOCK_QORIQ
 	tristate "Freescale QorIQ 1588 timer as PTP clock"
-	depends on GIANFAR
+	depends on GIANFAR || FSL_DPAA_ETH
 	depends on PTP_1588_CLOCK
 	default y
 	help
diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c
index e8652c148c52..a14c317b5a38 100644
--- a/drivers/ptp/ptp_qoriq.c
+++ b/drivers/ptp/ptp_qoriq.c
@@ -39,11 +39,12 @@
 /* Caller must hold qoriq_ptp->lock. */
 static u64 tmr_cnt_read(struct qoriq_ptp *qoriq_ptp)
 {
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 	u64 ns;
 	u32 lo, hi;
 
-	lo = qoriq_read(&qoriq_ptp->regs->tmr_cnt_l);
-	hi = qoriq_read(&qoriq_ptp->regs->tmr_cnt_h);
+	lo = qoriq_read(&regs->ctrl_regs->tmr_cnt_l);
+	hi = qoriq_read(&regs->ctrl_regs->tmr_cnt_h);
 	ns = ((u64) hi) << 32;
 	ns |= lo;
 	return ns;
@@ -52,16 +53,18 @@ static u64 tmr_cnt_read(struct qoriq_ptp *qoriq_ptp)
 /* Caller must hold qoriq_ptp->lock. */
 static void tmr_cnt_write(struct qoriq_ptp *qoriq_ptp, u64 ns)
 {
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 	u32 hi = ns >> 32;
 	u32 lo = ns & 0xffffffff;
 
-	qoriq_write(&qoriq_ptp->regs->tmr_cnt_l, lo);
-	qoriq_write(&qoriq_ptp->regs->tmr_cnt_h, hi);
+	qoriq_write(&regs->ctrl_regs->tmr_cnt_l, lo);
+	qoriq_write(&regs->ctrl_regs->tmr_cnt_h, hi);
 }
 
 /* Caller must hold qoriq_ptp->lock. */
 static void set_alarm(struct qoriq_ptp *qoriq_ptp)
 {
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 	u64 ns;
 	u32 lo, hi;
 
@@ -70,16 +73,18 @@ static void set_alarm(struct qoriq_ptp *qoriq_ptp)
 	ns -= qoriq_ptp->tclk_period;
 	hi = ns >> 32;
 	lo = ns & 0xffffffff;
-	qoriq_write(&qoriq_ptp->regs->tmr_alarm1_l, lo);
-	qoriq_write(&qoriq_ptp->regs->tmr_alarm1_h, hi);
+	qoriq_write(&regs->alarm_regs->tmr_alarm1_l, lo);
+	qoriq_write(&regs->alarm_regs->tmr_alarm1_h, hi);
 }
 
 /* Caller must hold qoriq_ptp->lock. */
 static void set_fipers(struct qoriq_ptp *qoriq_ptp)
 {
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
+
 	set_alarm(qoriq_ptp);
-	qoriq_write(&qoriq_ptp->regs->tmr_fiper1, qoriq_ptp->tmr_fiper1);
-	qoriq_write(&qoriq_ptp->regs->tmr_fiper2, qoriq_ptp->tmr_fiper2);
+	qoriq_write(&regs->fiper_regs->tmr_fiper1, qoriq_ptp->tmr_fiper1);
+	qoriq_write(&regs->fiper_regs->tmr_fiper2, qoriq_ptp->tmr_fiper2);
 }
 
 /*
@@ -89,16 +94,17 @@ static void set_fipers(struct qoriq_ptp *qoriq_ptp)
 static irqreturn_t isr(int irq, void *priv)
 {
 	struct qoriq_ptp *qoriq_ptp = priv;
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 	struct ptp_clock_event event;
 	u64 ns;
 	u32 ack = 0, lo, hi, mask, val;
 
-	val = qoriq_read(&qoriq_ptp->regs->tmr_tevent);
+	val = qoriq_read(&regs->ctrl_regs->tmr_tevent);
 
 	if (val & ETS1) {
 		ack |= ETS1;
-		hi = qoriq_read(&qoriq_ptp->regs->tmr_etts1_h);
-		lo = qoriq_read(&qoriq_ptp->regs->tmr_etts1_l);
+		hi = qoriq_read(&regs->etts_regs->tmr_etts1_h);
+		lo = qoriq_read(&regs->etts_regs->tmr_etts1_l);
 		event.type = PTP_CLOCK_EXTTS;
 		event.index = 0;
 		event.timestamp = ((u64) hi) << 32;
@@ -108,8 +114,8 @@ static irqreturn_t isr(int irq, void *priv)
 
 	if (val & ETS2) {
 		ack |= ETS2;
-		hi = qoriq_read(&qoriq_ptp->regs->tmr_etts2_h);
-		lo = qoriq_read(&qoriq_ptp->regs->tmr_etts2_l);
+		hi = qoriq_read(&regs->etts_regs->tmr_etts2_h);
+		lo = qoriq_read(&regs->etts_regs->tmr_etts2_l);
 		event.type = PTP_CLOCK_EXTTS;
 		event.index = 1;
 		event.timestamp = ((u64) hi) << 32;
@@ -130,16 +136,16 @@ static irqreturn_t isr(int irq, void *priv)
 			hi = ns >> 32;
 			lo = ns & 0xffffffff;
 			spin_lock(&qoriq_ptp->lock);
-			qoriq_write(&qoriq_ptp->regs->tmr_alarm2_l, lo);
-			qoriq_write(&qoriq_ptp->regs->tmr_alarm2_h, hi);
+			qoriq_write(&regs->alarm_regs->tmr_alarm2_l, lo);
+			qoriq_write(&regs->alarm_regs->tmr_alarm2_h, hi);
 			spin_unlock(&qoriq_ptp->lock);
 			qoriq_ptp->alarm_value = ns;
 		} else {
-			qoriq_write(&qoriq_ptp->regs->tmr_tevent, ALM2);
+			qoriq_write(&regs->ctrl_regs->tmr_tevent, ALM2);
 			spin_lock(&qoriq_ptp->lock);
-			mask = qoriq_read(&qoriq_ptp->regs->tmr_temask);
+			mask = qoriq_read(&regs->ctrl_regs->tmr_temask);
 			mask &= ~ALM2EN;
-			qoriq_write(&qoriq_ptp->regs->tmr_temask, mask);
+			qoriq_write(&regs->ctrl_regs->tmr_temask, mask);
 			spin_unlock(&qoriq_ptp->lock);
 			qoriq_ptp->alarm_value = 0;
 			qoriq_ptp->alarm_interval = 0;
@@ -153,7 +159,7 @@ static irqreturn_t isr(int irq, void *priv)
 	}
 
 	if (ack) {
-		qoriq_write(&qoriq_ptp->regs->tmr_tevent, ack);
+		qoriq_write(&regs->ctrl_regs->tmr_tevent, ack);
 		return IRQ_HANDLED;
 	} else
 		return IRQ_NONE;
@@ -169,6 +175,7 @@ static int ptp_qoriq_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
 	u32 tmr_add;
 	int neg_adj = 0;
 	struct qoriq_ptp *qoriq_ptp = container_of(ptp, struct qoriq_ptp, caps);
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 
 	if (scaled_ppm < 0) {
 		neg_adj = 1;
@@ -186,7 +193,7 @@ static int ptp_qoriq_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
 
 	tmr_add = neg_adj ? tmr_add - diff : tmr_add + diff;
 
-	qoriq_write(&qoriq_ptp->regs->tmr_add, tmr_add);
+	qoriq_write(&regs->ctrl_regs->tmr_add, tmr_add);
 
 	return 0;
 }
@@ -250,6 +257,7 @@ static int ptp_qoriq_enable(struct ptp_clock_info *ptp,
 			      struct ptp_clock_request *rq, int on)
 {
 	struct qoriq_ptp *qoriq_ptp = container_of(ptp, struct qoriq_ptp, caps);
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 	unsigned long flags;
 	u32 bit, mask;
 
@@ -266,23 +274,23 @@ static int ptp_qoriq_enable(struct ptp_clock_info *ptp,
 			return -EINVAL;
 		}
 		spin_lock_irqsave(&qoriq_ptp->lock, flags);
-		mask = qoriq_read(&qoriq_ptp->regs->tmr_temask);
+		mask = qoriq_read(&regs->ctrl_regs->tmr_temask);
 		if (on)
 			mask |= bit;
 		else
 			mask &= ~bit;
-		qoriq_write(&qoriq_ptp->regs->tmr_temask, mask);
+		qoriq_write(&regs->ctrl_regs->tmr_temask, mask);
 		spin_unlock_irqrestore(&qoriq_ptp->lock, flags);
 		return 0;
 
 	case PTP_CLK_REQ_PPS:
 		spin_lock_irqsave(&qoriq_ptp->lock, flags);
-		mask = qoriq_read(&qoriq_ptp->regs->tmr_temask);
+		mask = qoriq_read(&regs->ctrl_regs->tmr_temask);
 		if (on)
 			mask |= PP1EN;
 		else
 			mask &= ~PP1EN;
-		qoriq_write(&qoriq_ptp->regs->tmr_temask, mask);
+		qoriq_write(&regs->ctrl_regs->tmr_temask, mask);
 		spin_unlock_irqrestore(&qoriq_ptp->lock, flags);
 		return 0;
 
@@ -313,10 +321,12 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 {
 	struct device_node *node = dev->dev.of_node;
 	struct qoriq_ptp *qoriq_ptp;
+	struct qoriq_ptp_registers *regs;
 	struct timespec64 now;
 	int err = -ENOMEM;
 	u32 tmr_ctrl;
 	unsigned long flags;
+	void __iomem *base;
 
 	qoriq_ptp = kzalloc(sizeof(*qoriq_ptp), GFP_KERNEL);
 	if (!qoriq_ptp)
@@ -351,7 +361,7 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 		pr_err("irq not in device tree\n");
 		goto no_node;
 	}
-	if (request_irq(qoriq_ptp->irq, isr, 0, DRIVER, qoriq_ptp)) {
+	if (request_irq(qoriq_ptp->irq, isr, IRQF_SHARED, DRIVER, qoriq_ptp)) {
 		pr_err("request_irq failed\n");
 		goto no_node;
 	}
@@ -368,12 +378,27 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 
 	spin_lock_init(&qoriq_ptp->lock);
 
-	qoriq_ptp->regs = ioremap(qoriq_ptp->rsrc->start,
-				resource_size(qoriq_ptp->rsrc));
-	if (!qoriq_ptp->regs) {
+	base = ioremap(qoriq_ptp->rsrc->start,
+		       resource_size(qoriq_ptp->rsrc));
+	if (!base) {
 		pr_err("ioremap ptp registers failed\n");
 		goto no_ioremap;
 	}
+
+	qoriq_ptp->base = base;
+
+	if (of_device_is_compatible(node, "fsl,fman-ptp-timer")) {
+		qoriq_ptp->regs.ctrl_regs = base + FMAN_CTRL_REGS_OFFSET;
+		qoriq_ptp->regs.alarm_regs = base + FMAN_ALARM_REGS_OFFSET;
+		qoriq_ptp->regs.fiper_regs = base + FMAN_FIPER_REGS_OFFSET;
+		qoriq_ptp->regs.etts_regs = base + FMAN_ETTS_REGS_OFFSET;
+	} else {
+		qoriq_ptp->regs.ctrl_regs = base + CTRL_REGS_OFFSET;
+		qoriq_ptp->regs.alarm_regs = base + ALARM_REGS_OFFSET;
+		qoriq_ptp->regs.fiper_regs = base + FIPER_REGS_OFFSET;
+		qoriq_ptp->regs.etts_regs = base + ETTS_REGS_OFFSET;
+	}
+
 	ktime_get_real_ts64(&now);
 	ptp_qoriq_settime(&qoriq_ptp->caps, &now);
 
@@ -383,13 +408,14 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 
 	spin_lock_irqsave(&qoriq_ptp->lock, flags);
 
-	qoriq_write(&qoriq_ptp->regs->tmr_ctrl,   tmr_ctrl);
-	qoriq_write(&qoriq_ptp->regs->tmr_add,    qoriq_ptp->tmr_add);
-	qoriq_write(&qoriq_ptp->regs->tmr_prsc,   qoriq_ptp->tmr_prsc);
-	qoriq_write(&qoriq_ptp->regs->tmr_fiper1, qoriq_ptp->tmr_fiper1);
-	qoriq_write(&qoriq_ptp->regs->tmr_fiper2, qoriq_ptp->tmr_fiper2);
+	regs = &qoriq_ptp->regs;
+	qoriq_write(&regs->ctrl_regs->tmr_ctrl,   tmr_ctrl);
+	qoriq_write(&regs->ctrl_regs->tmr_add,    qoriq_ptp->tmr_add);
+	qoriq_write(&regs->ctrl_regs->tmr_prsc,   qoriq_ptp->tmr_prsc);
+	qoriq_write(&regs->fiper_regs->tmr_fiper1, qoriq_ptp->tmr_fiper1);
+	qoriq_write(&regs->fiper_regs->tmr_fiper2, qoriq_ptp->tmr_fiper2);
 	set_alarm(qoriq_ptp);
-	qoriq_write(&qoriq_ptp->regs->tmr_ctrl,   tmr_ctrl|FIPERST|RTPE|TE|FRD);
+	qoriq_write(&regs->ctrl_regs->tmr_ctrl,   tmr_ctrl|FIPERST|RTPE|TE|FRD);
 
 	spin_unlock_irqrestore(&qoriq_ptp->lock, flags);
 
@@ -405,7 +431,7 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 	return 0;
 
 no_clock:
-	iounmap(qoriq_ptp->regs);
+	iounmap(qoriq_ptp->base);
 no_ioremap:
 	release_resource(qoriq_ptp->rsrc);
 no_resource:
@@ -419,12 +445,13 @@ no_memory:
 static int qoriq_ptp_remove(struct platform_device *dev)
 {
 	struct qoriq_ptp *qoriq_ptp = platform_get_drvdata(dev);
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 
-	qoriq_write(&qoriq_ptp->regs->tmr_temask, 0);
-	qoriq_write(&qoriq_ptp->regs->tmr_ctrl,   0);
+	qoriq_write(&regs->ctrl_regs->tmr_temask, 0);
+	qoriq_write(&regs->ctrl_regs->tmr_ctrl,   0);
 
 	ptp_clock_unregister(qoriq_ptp->clock);
-	iounmap(qoriq_ptp->regs);
+	iounmap(qoriq_ptp->base);
 	release_resource(qoriq_ptp->rsrc);
 	free_irq(qoriq_ptp->irq, qoriq_ptp);
 	kfree(qoriq_ptp);
@@ -434,6 +461,7 @@ static int qoriq_ptp_remove(struct platform_device *dev)
 
 static const struct of_device_id match_table[] = {
 	{ .compatible = "fsl,etsec-ptp" },
+	{ .compatible = "fsl,fman-ptp-timer" },
 	{},
 };
 MODULE_DEVICE_TABLE(of, match_table);
diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h
index b462d9ea8007..dc3dac40f069 100644
--- a/include/linux/fsl/ptp_qoriq.h
+++ b/include/linux/fsl/ptp_qoriq.h
@@ -11,9 +11,8 @@
 
 /*
  * qoriq ptp registers
- * Generated by regen.tcl on Thu May 13 01:38:57 PM CEST 2010
  */
-struct qoriq_ptp_registers {
+struct ctrl_regs {
 	u32 tmr_ctrl;     /* Timer control register */
 	u32 tmr_tevent;   /* Timestamp event register */
 	u32 tmr_temask;   /* Timer event mask register */
@@ -28,22 +27,47 @@ struct qoriq_ptp_registers {
 	u8  res1[4];
 	u32 tmroff_h;     /* Timer offset high */
 	u32 tmroff_l;     /* Timer offset low */
-	u8  res2[8];
+};
+
+struct alarm_regs {
 	u32 tmr_alarm1_h; /* Timer alarm 1 high register */
 	u32 tmr_alarm1_l; /* Timer alarm 1 high register */
 	u32 tmr_alarm2_h; /* Timer alarm 2 high register */
 	u32 tmr_alarm2_l; /* Timer alarm 2 high register */
-	u8  res3[48];
+};
+
+struct fiper_regs {
 	u32 tmr_fiper1;   /* Timer fixed period interval */
 	u32 tmr_fiper2;   /* Timer fixed period interval */
 	u32 tmr_fiper3;   /* Timer fixed period interval */
-	u8  res4[20];
+};
+
+struct etts_regs {
 	u32 tmr_etts1_h;  /* Timestamp of general purpose external trigger */
 	u32 tmr_etts1_l;  /* Timestamp of general purpose external trigger */
 	u32 tmr_etts2_h;  /* Timestamp of general purpose external trigger */
 	u32 tmr_etts2_l;  /* Timestamp of general purpose external trigger */
 };
 
+struct qoriq_ptp_registers {
+	struct ctrl_regs __iomem *ctrl_regs;
+	struct alarm_regs __iomem *alarm_regs;
+	struct fiper_regs __iomem *fiper_regs;
+	struct etts_regs __iomem *etts_regs;
+};
+
+/* Offset definitions for the four register groups */
+#define CTRL_REGS_OFFSET	0x0
+#define ALARM_REGS_OFFSET	0x40
+#define FIPER_REGS_OFFSET	0x80
+#define ETTS_REGS_OFFSET	0xa0
+
+#define FMAN_CTRL_REGS_OFFSET	0x80
+#define FMAN_ALARM_REGS_OFFSET	0xb8
+#define FMAN_FIPER_REGS_OFFSET	0xd0
+#define FMAN_ETTS_REGS_OFFSET	0xe0
+
+
 /* Bit definitions for the TMR_CTRL register */
 #define ALM1P                 (1<<31) /* Alarm1 output polarity */
 #define ALM2P                 (1<<30) /* Alarm2 output polarity */
@@ -105,10 +129,10 @@ struct qoriq_ptp_registers {
 #define DRIVER		"ptp_qoriq"
 #define DEFAULT_CKSEL	1
 #define N_EXT_TS	2
-#define REG_SIZE	sizeof(struct qoriq_ptp_registers)
 
 struct qoriq_ptp {
-	struct qoriq_ptp_registers __iomem *regs;
+	void __iomem *base;
+	struct qoriq_ptp_registers regs;
 	spinlock_t lock; /* protects regs */
 	struct ptp_clock *clock;
 	struct ptp_clock_info caps;
-- 
cgit v1.2.3


From b03799b0cb35dbc39e89602b1203863e2a6e06bf Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Mon, 25 Jun 2018 16:49:06 -0500
Subject: PCI: shpchp: Separate existence of SHPC and permission to use it

The shpchp driver registers for all PCI bridge devices.  Its probe method
should fail if either (1) the bridge doesn't have an SHPC or (2) the OS
isn't allowed to use it (the platform firmware may be operating the SHPC
itself).

Separate these two tests into:

  - A new shpc_capable() that looks for the SHPC hardware and is applicable
    on all systems (ACPI and non-ACPI), and

  - A simplified acpi_get_hp_hw_control_from_firmware() that we call only
    when we already know an SHPC exists and there may be ACPI methods to
    either request permission to use it (_OSC) or transfer control to the
    OS (OSHP).

acpi_get_hp_hw_control_from_firmware() is implemented when CONFIG_ACPI=y,
but does nothing if the current platform doesn't support ACPI.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/pci/hotplug/acpi_pcihp.c  | 36 +++++++++++++++++++-----------------
 drivers/pci/hotplug/shpchp_core.c | 21 +++++++++++++++++++++
 drivers/pci/pci-acpi.c            | 19 +------------------
 include/linux/pci.h               |  1 +
 4 files changed, 42 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c
index 5bd6c1573295..6b7c1ed58e7e 100644
--- a/drivers/pci/hotplug/acpi_pcihp.c
+++ b/drivers/pci/hotplug/acpi_pcihp.c
@@ -73,20 +73,6 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *pdev)
 	acpi_handle chandle, handle;
 	struct acpi_buffer string = { ACPI_ALLOCATE_BUFFER, NULL };
 
-	/*
-	 * Per PCI firmware specification, we should run the ACPI _OSC
-	 * method to get control of hotplug hardware before using it. If
-	 * an _OSC is missing, we look for an OSHP to do the same thing.
-	 * To handle different BIOS behavior, we look for _OSC on a root
-	 * bridge preferentially (according to PCI fw spec). Later for
-	 * OSHP within the scope of the hotplug controller and its parents,
-	 * up to the host bridge under which this controller exists.
-	 */
-	if (shpchp_is_native(pdev))
-		return 0;
-
-	/* If _OSC exists, we should not evaluate OSHP */
-
 	/*
 	 * If there's no ACPI host bridge (i.e., ACPI support is compiled
 	 * into the kernel but the hardware platform doesn't support ACPI),
@@ -97,9 +83,25 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *pdev)
 	if (!root)
 		return 0;
 
-	if (root->osc_support_set)
-		goto no_control;
+	/*
+	 * If _OSC exists, it determines whether we're allowed to manage
+	 * the SHPC.  We executed it while enumerating the host bridge.
+	 */
+	if (root->osc_support_set) {
+		if (host->native_shpc_hotplug)
+			return 0;
+		return -ENODEV;
+	}
 
+	/*
+	 * In the absence of _OSC, we're always allowed to manage the SHPC.
+	 * However, if an OSHP method is present, we must execute it so the
+	 * firmware can transfer control to the OS, e.g., direct interrupts
+	 * to the OS instead of to the firmware.
+	 *
+	 * N.B. The PCI Firmware Spec (r3.2, sec 4.8) does not endorse
+	 * searching up the ACPI hierarchy, so the loops below are suspect.
+	 */
 	handle = ACPI_HANDLE(&pdev->dev);
 	if (!handle) {
 		/*
@@ -128,7 +130,7 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *pdev)
 		if (ACPI_FAILURE(status))
 			break;
 	}
-no_control:
+
 	pci_info(pdev, "Cannot get control of SHPC hotplug\n");
 	kfree(string.pointer);
 	return -ENODEV;
diff --git a/drivers/pci/hotplug/shpchp_core.c b/drivers/pci/hotplug/shpchp_core.c
index e91be287f292..8e3c6ce12f31 100644
--- a/drivers/pci/hotplug/shpchp_core.c
+++ b/drivers/pci/hotplug/shpchp_core.c
@@ -270,11 +270,30 @@ static int get_adapter_status(struct hotplug_slot *hotplug_slot, u8 *value)
 	return 0;
 }
 
+static bool shpc_capable(struct pci_dev *bridge)
+{
+	/*
+	 * It is assumed that AMD GOLAM chips support SHPC but they do not
+	 * have SHPC capability.
+	 */
+	if (bridge->vendor == PCI_VENDOR_ID_AMD &&
+	    bridge->device == PCI_DEVICE_ID_AMD_GOLAM_7450)
+		return true;
+
+	if (pci_find_capability(bridge, PCI_CAP_ID_SHPC))
+		return true;
+
+	return false;
+}
+
 static int shpc_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	int rc;
 	struct controller *ctrl;
 
+	if (!shpc_capable(pdev))
+		return -ENODEV;
+
 	if (acpi_get_hp_hw_control_from_firmware(pdev))
 		return -ENODEV;
 
@@ -303,6 +322,7 @@ static int shpc_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto err_cleanup_slots;
 
+	pdev->shpc_managed = 1;
 	return 0;
 
 err_cleanup_slots:
@@ -319,6 +339,7 @@ static void shpc_remove(struct pci_dev *dev)
 {
 	struct controller *ctrl = pci_get_drvdata(dev);
 
+	dev->shpc_managed = 0;
 	shpchp_remove_ctrl_files(ctrl);
 	ctrl->hpc_ops->release_ctlr(ctrl);
 	kfree(ctrl);
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index 65113b6eed14..5100fd2d5a75 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -403,24 +403,7 @@ bool pciehp_is_native(struct pci_dev *bridge)
  */
 bool shpchp_is_native(struct pci_dev *bridge)
 {
-	const struct pci_host_bridge *host;
-
-	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_SHPC))
-		return false;
-
-	/*
-	 * It is assumed that AMD GOLAM chips support SHPC but they do not
-	 * have SHPC capability.
-	 */
-	if (bridge->vendor == PCI_VENDOR_ID_AMD &&
-	    bridge->device == PCI_DEVICE_ID_AMD_GOLAM_7450)
-		return true;
-
-	if (!pci_find_capability(bridge, PCI_CAP_ID_SHPC))
-		return false;
-
-	host = pci_find_host_bridge(bridge->bus);
-	return host->native_shpc_hotplug;
+	return bridge->shpc_managed;
 }
 
 /**
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 340029b2fb38..f776a1cce120 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -388,6 +388,7 @@ struct pci_dev {
 	unsigned int	is_virtfn:1;
 	unsigned int	reset_fn:1;
 	unsigned int	is_hotplug_bridge:1;
+	unsigned int	shpc_managed:1;		/* SHPC owned by shpchp */
 	unsigned int	is_thunderbolt:1;	/* Thunderbolt controller */
 	unsigned int	__aer_firmware_first_valid:1;
 	unsigned int	__aer_firmware_first:1;
-- 
cgit v1.2.3


From 22eceb8bf3e8f1f9b2f566062d06b25807725d7f Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 19 Jun 2018 13:57:26 +0200
Subject: printk: Make CONSOLE_LOGLEVEL_QUIET configurable

The goal of passing the "quiet" option to the kernel is for the kernel
to be quiet unless something really is wrong.

Sofar passing quiet has been (mostly) equivalent to passing
loglevel=4 on the kernel commandline. Which means to show any messages
with a level of KERN_ERR or higher severity on the console.

In practice this often does not result in a quiet boot though, since
there are many false-positive or otherwise harmless error messages printed,
defeating the purpose of the quiet option. Esp. the ACPICA code is really
bad wrt this, but there are plenty of others too.

This commit makes CONSOLE_LOGLEVEL_QUIET configurable.

This for example will allow distros which want quiet to really mean quiet
to set CONSOLE_LOGLEVEL_QUIET so that only messages with a higher severity
then KERN_ERR (CRIT, ALERT, EMERG) get printed, avoiding an endless game
of whack-a-mole silencing harmless error messages.

Link: http://lkml.kernel.org/r/20180619115726.3098-1-hdegoede@redhat.com
To: Petr Mladek <pmladek@suse.com>
To: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/printk.h |  6 +++---
 lib/Kconfig.debug      | 11 +++++++++++
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 6d7e800affd8..18602bb3eca8 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -50,15 +50,15 @@ static inline const char *printk_skip_headers(const char *buffer)
 /* We show everything that is MORE important than this.. */
 #define CONSOLE_LOGLEVEL_SILENT  0 /* Mum's the word */
 #define CONSOLE_LOGLEVEL_MIN	 1 /* Minimum loglevel we let people use */
-#define CONSOLE_LOGLEVEL_QUIET	 4 /* Shhh ..., when booted with "quiet" */
 #define CONSOLE_LOGLEVEL_DEBUG	10 /* issue debug messages */
 #define CONSOLE_LOGLEVEL_MOTORMOUTH 15	/* You can't shut this one up */
 
 /*
- * Default used to be hard-coded at 7, we're now allowing it to be set from
- * kernel config.
+ * Default used to be hard-coded at 7, quiet used to be hardcoded at 4,
+ * we're now allowing both to be set from kernel config.
  */
 #define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT
+#define CONSOLE_LOGLEVEL_QUIET	 CONFIG_CONSOLE_LOGLEVEL_QUIET
 
 extern int console_printk[];
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 76555479ae36..a7ef03009e9e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -30,6 +30,17 @@ config CONSOLE_LOGLEVEL_DEFAULT
 	  usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT
 	  option.
 
+config CONSOLE_LOGLEVEL_QUIET
+	int "quiet console loglevel (1-15)"
+	range 1 15
+	default "4"
+	help
+	  loglevel to use when "quiet" is passed on the kernel commandline.
+
+	  When "quiet" is passed on the kernel commandline this loglevel
+	  will be used as the loglevel. IOW passing "quiet" will be the
+	  equivalent of passing "loglevel=<CONSOLE_LOGLEVEL_QUIET>"
+
 config MESSAGE_LOGLEVEL_DEFAULT
 	int "Default message log level (1-7)"
 	range 1 7
-- 
cgit v1.2.3


From e7d4a95da86e0b048702765bbdcdc968aaf312e7 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 20 Jun 2018 08:58:28 +0200
Subject: bitfield: fix *_encode_bits()

There's a bug in *_encode_bits() in using ~field_multiplier() for
the check whether or not the constant value fits into the field,
this is wrong and clearly ~field_mask() was intended. This was
triggering for me for both constant and non-constant values.

Additionally, make this case actually into an compile error.
Declaring the extern function that will never exist with just a
warning is pointless as then later we'll just get a link error.

While at it, also fix the indentation in those lines I'm touching.

Finally, as suggested by Andy Shevchenko, add some tests and for
that introduce also u8 helpers. The tests don't compile without
the fix, showing that it's necessary.

Fixes: 00b0c9b82663 ("Add primitives for manipulating bitfields both in host- and fixed-endian.")
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 include/linux/bitfield.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index cf2588d81148..147a7bb341dd 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -104,7 +104,7 @@
 		(typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask));	\
 	})
 
-extern void __compiletime_warning("value doesn't fit into mask")
+extern void __compiletime_error("value doesn't fit into mask")
 __field_overflow(void);
 extern void __compiletime_error("bad bitfield mask")
 __bad_mask(void);
@@ -121,8 +121,8 @@ static __always_inline u64 field_mask(u64 field)
 #define ____MAKE_OP(type,base,to,from)					\
 static __always_inline __##type type##_encode_bits(base v, base field)	\
 {									\
-        if (__builtin_constant_p(v) &&	(v & ~field_multiplier(field)))	\
-			    __field_overflow();				\
+	if (__builtin_constant_p(v) && (v & ~field_mask(field)))	\
+		__field_overflow();					\
 	return to((v & field_mask(field)) * field_multiplier(field));	\
 }									\
 static __always_inline __##type type##_replace_bits(__##type old,	\
-- 
cgit v1.2.3


From 37a3862e1238262e9866d5d66e5f5f9069cee3a1 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 20 Jun 2018 08:58:29 +0200
Subject: bitfield: add u8 helpers

There's no reason why we shouldn't pack/unpack bits into/from
u8 values/registers/etc., so add u8 helpers.

Use the ____MAKE_OP() macro directly to avoid having nonsense
le8_encode_bits() and similar functions.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 include/linux/bitfield.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index 147a7bb341dd..65a6981eef7b 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -143,6 +143,7 @@ static __always_inline base type##_get_bits(__##type v, base field)	\
 	____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu)	\
 	____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu)	\
 	____MAKE_OP(u##size,u##size,,)
+____MAKE_OP(u8,u8,,)
 __MAKE_OP(16)
 __MAKE_OP(32)
 __MAKE_OP(64)
-- 
cgit v1.2.3


From 83d83bebf40132e2d55ec58af666713cc76f9764 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 28 Jun 2018 15:20:30 +0200
Subject: console/fbcon: Add support for deferred console takeover

Currently fbcon claims fbdevs as soon as they are registered and takes over
the console as soon as the first fbdev gets registered.

This behavior is undesirable in cases where a smooth graphical bootup is
desired, in such cases we typically want the contents of the framebuffer
(typically a vendor logo) to stay in place as is.

The current solution for this problem (on embedded systems) is to not
enable fbcon.

This commit adds a new FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER config option,
which when enabled defers fbcon taking over the console from the dummy
console until the first text is displayed on the console. Together with the
"quiet" kernel commandline option, this allows fbcon to still be used
together with a smooth graphical bootup, having it take over the console as
soon as e.g. an error message is logged.

Note the choice to detect the first console output in the dummycon driver,
rather then handling this entirely inside the fbcon code, was made after
2 failed attempts to handle this entirely inside the fbcon code. The fbcon
code is woven quite tightly into the console code, making this to only
feasible option.

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
---
 Documentation/fb/fbcon.txt       |  7 ++++
 drivers/video/console/Kconfig    | 11 ++++++
 drivers/video/console/dummycon.c | 67 ++++++++++++++++++++++++++++++++-----
 drivers/video/fbdev/core/fbcon.c | 72 ++++++++++++++++++++++++++++++++++++++++
 include/linux/console.h          |  5 +++
 5 files changed, 154 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/fb/fbcon.txt b/Documentation/fb/fbcon.txt
index 79c22d096bbc..d4d642e1ce9c 100644
--- a/Documentation/fb/fbcon.txt
+++ b/Documentation/fb/fbcon.txt
@@ -155,6 +155,13 @@ C. Boot options
 	used by text. By default, this area will be black. The 'color' value
 	is an integer number that depends on the framebuffer driver being used.
 
+6. fbcon=nodefer
+
+	If the kernel is compiled with deferred fbcon takeover support, normally
+	the framebuffer contents, left in place by the firmware/bootloader, will
+	be preserved until there actually is some text is output to the console.
+	This option causes fbcon to bind immediately to the fbdev device.
+
 C. Attaching, Detaching and Unloading
 
 Before going on how to attach, detach and unload the framebuffer console, an
diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig
index 4110ba7d7ca9..e91edef98633 100644
--- a/drivers/video/console/Kconfig
+++ b/drivers/video/console/Kconfig
@@ -150,6 +150,17 @@ config FRAMEBUFFER_CONSOLE_ROTATION
          such that other users of the framebuffer will remain normally
          oriented.
 
+config FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
+	bool "Framebuffer Console Deferred Takeover"
+	depends on FRAMEBUFFER_CONSOLE=y && DUMMY_CONSOLE=y
+	help
+	  If enabled this defers the framebuffer console taking over the
+	  console from the dummy console until the first text is displayed on
+	  the console. This is useful in combination with the "quiet" kernel
+	  commandline option to keep the framebuffer contents initially put up
+	  by the firmware in place, rather then replacing the contents with a
+	  black screen as soon as fbcon loads.
+
 config STI_CONSOLE
         bool "STI text console"
 	depends on PARISC && HAS_IOMEM
diff --git a/drivers/video/console/dummycon.c b/drivers/video/console/dummycon.c
index f2eafe2ed980..45ad925ad5f8 100644
--- a/drivers/video/console/dummycon.c
+++ b/drivers/video/console/dummycon.c
@@ -26,6 +26,65 @@
 #define DUMMY_ROWS	CONFIG_DUMMY_CONSOLE_ROWS
 #endif
 
+#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
+/* These are both protected by the console_lock */
+static RAW_NOTIFIER_HEAD(dummycon_output_nh);
+static bool dummycon_putc_called;
+
+void dummycon_register_output_notifier(struct notifier_block *nb)
+{
+	raw_notifier_chain_register(&dummycon_output_nh, nb);
+
+	if (dummycon_putc_called)
+		nb->notifier_call(nb, 0, NULL);
+}
+
+void dummycon_unregister_output_notifier(struct notifier_block *nb)
+{
+	raw_notifier_chain_unregister(&dummycon_output_nh, nb);
+}
+
+static void dummycon_putc(struct vc_data *vc, int c, int ypos, int xpos)
+{
+	dummycon_putc_called = true;
+	raw_notifier_call_chain(&dummycon_output_nh, 0, NULL);
+}
+
+static void dummycon_putcs(struct vc_data *vc, const unsigned short *s,
+			   int count, int ypos, int xpos)
+{
+	int i;
+
+	if (!dummycon_putc_called) {
+		/* Ignore erases */
+		for (i = 0 ; i < count; i++) {
+			if (s[i] != vc->vc_video_erase_char)
+				break;
+		}
+		if (i == count)
+			return;
+
+		dummycon_putc_called = true;
+	}
+
+	raw_notifier_call_chain(&dummycon_output_nh, 0, NULL);
+}
+
+static int dummycon_blank(struct vc_data *vc, int blank, int mode_switch)
+{
+	/* Redraw, so that we get putc(s) for output done while blanked */
+	return 1;
+}
+#else
+static void dummycon_putc(struct vc_data *vc, int c, int ypos, int xpos) { }
+static void dummycon_putcs(struct vc_data *vc, const unsigned short *s,
+			   int count, int ypos, int xpos) { }
+static int dummycon_blank(struct vc_data *vc, int blank, int mode_switch)
+{
+	return 0;
+}
+#endif
+
 static const char *dummycon_startup(void)
 {
     return "dummy device";
@@ -44,9 +103,6 @@ static void dummycon_init(struct vc_data *vc, int init)
 static void dummycon_deinit(struct vc_data *vc) { }
 static void dummycon_clear(struct vc_data *vc, int sy, int sx, int height,
 			   int width) { }
-static void dummycon_putc(struct vc_data *vc, int c, int ypos, int xpos) { }
-static void dummycon_putcs(struct vc_data *vc, const unsigned short *s,
-			   int count, int ypos, int xpos) { }
 static void dummycon_cursor(struct vc_data *vc, int mode) { }
 
 static bool dummycon_scroll(struct vc_data *vc, unsigned int top,
@@ -61,11 +117,6 @@ static int dummycon_switch(struct vc_data *vc)
 	return 0;
 }
 
-static int dummycon_blank(struct vc_data *vc, int blank, int mode_switch)
-{
-	return 0;
-}
-
 static int dummycon_font_set(struct vc_data *vc, struct console_font *font,
 			     unsigned int flags)
 {
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index cd8d52a967aa..5fb156bdcf4e 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -129,6 +129,12 @@ static inline void fbcon_map_override(void)
 }
 #endif /* CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY */
 
+#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
+static bool deferred_takeover = true;
+#else
+#define deferred_takeover false
+#endif
+
 /* font data */
 static char fontname[40];
 
@@ -499,6 +505,12 @@ static int __init fb_console_setup(char *this_opt)
 				margin_color = simple_strtoul(options, &options, 0);
 			continue;
 		}
+#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
+		if (!strcmp(options, "nodefer")) {
+			deferred_takeover = false;
+			continue;
+		}
+#endif
 	}
 	return 1;
 }
@@ -3100,6 +3112,9 @@ static int fbcon_fb_unregistered(struct fb_info *info)
 
 	WARN_CONSOLE_UNLOCKED();
 
+	if (deferred_takeover)
+		return 0;
+
 	idx = info->node;
 	for (i = first_fb_vc; i <= last_fb_vc; i++) {
 		if (con2fb_map[i] == idx)
@@ -3140,6 +3155,13 @@ static void fbcon_remap_all(int idx)
 
 	WARN_CONSOLE_UNLOCKED();
 
+	if (deferred_takeover) {
+		for (i = first_fb_vc; i <= last_fb_vc; i++)
+			con2fb_map_boot[i] = idx;
+		fbcon_map_override();
+		return;
+	}
+
 	for (i = first_fb_vc; i <= last_fb_vc; i++)
 		set_con2fb_map(i, idx, 0);
 
@@ -3191,6 +3213,11 @@ static int fbcon_fb_registered(struct fb_info *info)
 	idx = info->node;
 	fbcon_select_primary(info);
 
+	if (deferred_takeover) {
+		pr_info("fbcon: Deferring console take-over\n");
+		return 0;
+	}
+
 	if (info_idx == -1) {
 		for (i = first_fb_vc; i <= last_fb_vc; i++) {
 			if (con2fb_map_boot[i] == idx) {
@@ -3566,8 +3593,46 @@ static int fbcon_init_device(void)
 	return 0;
 }
 
+#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
+static struct notifier_block fbcon_output_nb;
+
+static int fbcon_output_notifier(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	int i;
+
+	WARN_CONSOLE_UNLOCKED();
+
+	pr_info("fbcon: Taking over console\n");
+
+	dummycon_unregister_output_notifier(&fbcon_output_nb);
+	deferred_takeover = false;
+	logo_shown = FBCON_LOGO_DONTSHOW;
+
+	for (i = 0; i < FB_MAX; i++) {
+		if (registered_fb[i])
+			fbcon_fb_registered(registered_fb[i]);
+	}
+
+	return NOTIFY_OK;
+}
+
+static void fbcon_register_output_notifier(void)
+{
+	fbcon_output_nb.notifier_call = fbcon_output_notifier;
+	dummycon_register_output_notifier(&fbcon_output_nb);
+}
+#else
+static inline void fbcon_register_output_notifier(void) {}
+#endif
+
 static void fbcon_start(void)
 {
+	if (deferred_takeover) {
+		fbcon_register_output_notifier();
+		return;
+	}
+
 	if (num_registered_fb) {
 		int i;
 
@@ -3594,6 +3659,13 @@ static void fbcon_exit(void)
 	if (fbcon_has_exited)
 		return;
 
+#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER
+	if (deferred_takeover) {
+		dummycon_unregister_output_notifier(&fbcon_output_nb);
+		deferred_takeover = false;
+	}
+#endif
+
 	kfree((void *)softback_buf);
 	softback_buf = 0UL;
 
diff --git a/include/linux/console.h b/include/linux/console.h
index dfd6b0e97855..f59f3dbca65c 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -21,6 +21,7 @@ struct console_font_op;
 struct console_font;
 struct module;
 struct tty_struct;
+struct notifier_block;
 
 /*
  * this is what the terminal answers to a ESC-Z or csi0c query.
@@ -220,4 +221,8 @@ static inline bool vgacon_text_force(void) { return false; }
 
 extern void console_init(void);
 
+/* For deferred console takeover */
+void dummycon_register_output_notifier(struct notifier_block *nb);
+void dummycon_unregister_output_notifier(struct notifier_block *nb);
+
 #endif /* _LINUX_CONSOLE_H */
-- 
cgit v1.2.3


From 4b09791ba059cc5a5ec7d69049f5d05da65b6418 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnáček <omosnace@redhat.com>
Date: Tue, 26 Jun 2018 13:04:42 +0200
Subject: cred: conditionally declare groups-related functions

The groups-related functions declared in include/linux/cred.h are
defined in kernel/groups.c, which is compiled only when
CONFIG_MULTIUSER=y. Move all these function declarations under #ifdef
CONFIG_MULTIUSER to help avoid accidental usage in contexts where
CONFIG_MULTIUSER might be disabled.

This patch also adds a fallback for groups_search(). Currently this
function is only called from kernel/groups.c itself and
security/keys/permissions.c, where the call is (by coincidence)
optimized away in case CONFIG_MULTIUSER=n. However, the audit subsystem
(which does not depend on CONFIG_MULTIUSER) calls this function in
-next, so the fallback will be needed to avoid compilation errors or
ugly workarounds.

See also:
https://lkml.org/lkml/2018/6/20/670
https://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git/commit/?h=next&id=af85d1772e31fed34165a1b3decef340cf4080c0

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/cred.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 631286535d0f..7eed6101c791 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -65,6 +65,12 @@ extern void groups_free(struct group_info *);
 
 extern int in_group_p(kgid_t);
 extern int in_egroup_p(kgid_t);
+extern int groups_search(const struct group_info *, kgid_t);
+
+extern int set_current_groups(struct group_info *);
+extern void set_groups(struct cred *, struct group_info *);
+extern bool may_setgroups(void);
+extern void groups_sort(struct group_info *);
 #else
 static inline void groups_free(struct group_info *group_info)
 {
@@ -78,12 +84,11 @@ static inline int in_egroup_p(kgid_t grp)
 {
         return 1;
 }
+static inline int groups_search(const struct group_info *group_info, kgid_t grp)
+{
+	return 1;
+}
 #endif
-extern int set_current_groups(struct group_info *);
-extern void set_groups(struct cred *, struct group_info *);
-extern int groups_search(const struct group_info *, kgid_t);
-extern bool may_setgroups(void);
-extern void groups_sort(struct group_info *);
 
 /*
  * The security context of a task
-- 
cgit v1.2.3


From cfdfc14e7fb8ae77290e9d5afaeecc0a234a3846 Mon Sep 17 00:00:00 2001
From: Doug Meyer <dmeyer@gigaio.com>
Date: Wed, 23 May 2018 13:18:05 -0700
Subject: switchtec: Use generic PCI Vendor ID and Class Code

Move the Microsemi Switchtec PCI Vendor ID (same as
PCI_VENDOR_ID_PMC_Sierra) to pci_ids.h.   Also, replace Microsemi class
constants with the standard PCI definitions.

Signed-off-by: Doug Meyer <dmeyer@gigaio.com>
[bhelgaas: restore SPDX (I assume it was removed by mistake), remove
device ID definitions]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
---
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c |  3 ++-
 drivers/pci/switch/switchtec.c         | 14 +++++++-------
 include/linux/pci_ids.h                |  1 +
 include/linux/switchtec.h              |  4 ----
 4 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index f624ae27eabe..5ee5f40b4dfc 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -19,6 +19,7 @@
 #include <linux/kthread.h>
 #include <linux/interrupt.h>
 #include <linux/ntb.h>
+#include <linux/pci.h>
 
 MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
 MODULE_VERSION("0.1");
@@ -1487,7 +1488,7 @@ static int switchtec_ntb_add(struct device *dev,
 
 	stdev->sndev = NULL;
 
-	if (stdev->pdev->class != MICROSEMI_NTB_CLASSCODE)
+	if (stdev->pdev->class != (PCI_CLASS_BRIDGE_OTHER << 8))
 		return -ENODEV;
 
 	sndev = kzalloc_node(sizeof(*sndev), GFP_KERNEL, dev_to_node(dev));
diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 47cd0c037433..9940cc70f38b 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -641,7 +641,7 @@ static int ioctl_event_summary(struct switchtec_dev *stdev,
 
 	for (i = 0; i < SWITCHTEC_MAX_PFF_CSR; i++) {
 		reg = ioread16(&stdev->mmio_pff_csr[i].vendor_id);
-		if (reg != MICROSEMI_VENDOR_ID)
+		if (reg != PCI_VENDOR_ID_MICROSEMI)
 			break;
 
 		reg = ioread32(&stdev->mmio_pff_csr[i].pff_event_summary);
@@ -1203,7 +1203,7 @@ static void init_pff(struct switchtec_dev *stdev)
 
 	for (i = 0; i < SWITCHTEC_MAX_PFF_CSR; i++) {
 		reg = ioread16(&stdev->mmio_pff_csr[i].vendor_id);
-		if (reg != MICROSEMI_VENDOR_ID)
+		if (reg != PCI_VENDOR_ID_MICROSEMI)
 			break;
 	}
 
@@ -1267,7 +1267,7 @@ static int switchtec_pci_probe(struct pci_dev *pdev,
 	struct switchtec_dev *stdev;
 	int rc;
 
-	if (pdev->class == MICROSEMI_NTB_CLASSCODE)
+	if (pdev->class == (PCI_CLASS_BRIDGE_OTHER << 8))
 		request_module_nowait("ntb_hw_switchtec");
 
 	stdev = stdev_create(pdev);
@@ -1321,19 +1321,19 @@ static void switchtec_pci_remove(struct pci_dev *pdev)
 
 #define SWITCHTEC_PCI_DEVICE(device_id) \
 	{ \
-		.vendor     = MICROSEMI_VENDOR_ID, \
+		.vendor     = PCI_VENDOR_ID_MICROSEMI, \
 		.device     = device_id, \
 		.subvendor  = PCI_ANY_ID, \
 		.subdevice  = PCI_ANY_ID, \
-		.class      = MICROSEMI_MGMT_CLASSCODE, \
+		.class      = (PCI_CLASS_MEMORY_OTHER << 8), \
 		.class_mask = 0xFFFFFFFF, \
 	}, \
 	{ \
-		.vendor     = MICROSEMI_VENDOR_ID, \
+		.vendor     = PCI_VENDOR_ID_MICROSEMI, \
 		.device     = device_id, \
 		.subvendor  = PCI_ANY_ID, \
 		.subdevice  = PCI_ANY_ID, \
-		.class      = MICROSEMI_NTB_CLASSCODE, \
+		.class      = (PCI_CLASS_BRIDGE_OTHER << 8), \
 		.class_mask = 0xFFFFFFFF, \
 	}
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 29502238e510..80aec5b9a6c1 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1668,6 +1668,7 @@
 #define PCI_DEVICE_ID_COMPEX_ENET100VG4	0x0112
 
 #define PCI_VENDOR_ID_PMC_Sierra	0x11f8
+#define PCI_VENDOR_ID_MICROSEMI		0x11f8
 
 #define PCI_VENDOR_ID_RP		0x11fe
 #define PCI_DEVICE_ID_RP32INTF		0x0001
diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index ec93e93371fa..ab400af6f0ce 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -19,10 +19,6 @@
 #include <linux/pci.h>
 #include <linux/cdev.h>
 
-#define MICROSEMI_VENDOR_ID         0x11f8
-#define MICROSEMI_NTB_CLASSCODE     0x068000
-#define MICROSEMI_MGMT_CLASSCODE    0x058000
-
 #define SWITCHTEC_MRPC_PAYLOAD_SIZE 1024
 #define SWITCHTEC_MAX_PFF_CSR 48
 
-- 
cgit v1.2.3


From 783e84961b1d7a75045383586b8c49e02b7704cd Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 29 Jun 2018 21:17:26 -0500
Subject: PCI: Make pci_get_rom_size() static

pci_get_rom_size() is called only from pci_map_rom(), so it can be static.
Make it static and remove the declaration from include/linux/pci.h.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/rom.c   | 3 ++-
 include/linux/pci.h | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c
index 3a33f5ce314a..137bf0cee897 100644
--- a/drivers/pci/rom.c
+++ b/drivers/pci/rom.c
@@ -80,7 +80,8 @@ EXPORT_SYMBOL_GPL(pci_disable_rom);
  * The PCI window size could be much larger than the
  * actual image size.
  */
-size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, size_t size)
+static size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom,
+			       size_t size)
 {
 	void __iomem *image;
 	int last_image;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 340029b2fb38..f40d3e05ccd1 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1122,7 +1122,6 @@ int pci_enable_rom(struct pci_dev *pdev);
 void pci_disable_rom(struct pci_dev *pdev);
 void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
 void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom);
-size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, size_t size);
 void __iomem __must_check *pci_platform_rom(struct pci_dev *pdev, size_t *size);
 
 /* Power management related routines */
-- 
cgit v1.2.3


From 7ce3f912ae0a79e5d738a3ae1f158b281973e849 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Sat, 30 Jun 2018 11:24:24 -0400
Subject: PCI: Enable PASID only if entire path supports End-End TLP prefixes

A PCIe endpoint carries the process address space identifier (PASID) in
the TLP prefix as part of the memory read/write transaction. The address
information in the TLP is relevant only for a given PASID context.

An IOMMU takes PASID value and the address information from the
TLP to look up the physical address in the system.

PASID is an End-End TLP Prefix (PCIe r4.0, sec 6.20).  Sec 2.2.10.2 says

  It is an error to receive a TLP with an End-End TLP Prefix by a
  Receiver that does not support End-End TLP Prefixes. A TLP in
  violation of this rule is handled as a Malformed TLP. This is a
  reported error associated with the Receiving Port (see Section 6.2).

Prevent error condition by proactively requiring End-End TLP prefix to be
supported on the entire data path between the endpoint and the root port
before enabling PASID.

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/ats.c             |  3 +++
 drivers/pci/probe.c           | 24 ++++++++++++++++++++++++
 include/linux/pci.h           |  1 +
 include/uapi/linux/pci_regs.h |  1 +
 4 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 4923a2a8e14b..5b78f3b1b918 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -273,6 +273,9 @@ int pci_enable_pasid(struct pci_dev *pdev, int features)
 	if (WARN_ON(pdev->pasid_enabled))
 		return -EBUSY;
 
+	if (!pdev->eetlp_prefix_path)
+		return -EINVAL;
+
 	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
 	if (!pos)
 		return -EINVAL;
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ac876e32de4b..4c35c2909d57 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2042,6 +2042,29 @@ static void pci_configure_ltr(struct pci_dev *dev)
 #endif
 }
 
+static void pci_configure_eetlp_prefix(struct pci_dev *dev)
+{
+#ifdef CONFIG_PCI_PASID
+	struct pci_dev *bridge;
+	u32 cap;
+
+	if (!pci_is_pcie(dev))
+		return;
+
+	pcie_capability_read_dword(dev, PCI_EXP_DEVCAP2, &cap);
+	if (!(cap & PCI_EXP_DEVCAP2_EE_PREFIX))
+		return;
+
+	if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
+		dev->eetlp_prefix_path = 1;
+	else {
+		bridge = pci_upstream_bridge(dev);
+		if (bridge && bridge->eetlp_prefix_path)
+			dev->eetlp_prefix_path = 1;
+	}
+#endif
+}
+
 static void pci_configure_device(struct pci_dev *dev)
 {
 	struct hotplug_params hpp;
@@ -2051,6 +2074,7 @@ static void pci_configure_device(struct pci_dev *dev)
 	pci_configure_extended_tags(dev, NULL);
 	pci_configure_relaxed_ordering(dev);
 	pci_configure_ltr(dev);
+	pci_configure_eetlp_prefix(dev);
 
 	memset(&hpp, 0, sizeof(hpp));
 	ret = pci_get_hp_params(dev, &hpp);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 340029b2fb38..6ba818449095 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -350,6 +350,7 @@ struct pci_dev {
 	unsigned int	ltr_path:1;	/* Latency Tolerance Reporting
 					   supported from root to here */
 #endif
+	unsigned int	eetlp_prefix_path:1;	/* End-to-End TLP Prefix */
 
 	pci_channel_state_t error_state;	/* Current connectivity state */
 	struct device	dev;			/* Generic device interface */
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 4da87e2ef8a8..04d7480db714 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -636,6 +636,7 @@
 #define  PCI_EXP_DEVCAP2_OBFF_MASK	0x000c0000 /* OBFF support mechanism */
 #define  PCI_EXP_DEVCAP2_OBFF_MSG	0x00040000 /* New message signaling */
 #define  PCI_EXP_DEVCAP2_OBFF_WAKE	0x00080000 /* Re-use WAKE# for OBFF */
+#define PCI_EXP_DEVCAP2_EE_PREFIX	0x00200000 /* End-End TLP Prefix */
 #define PCI_EXP_DEVCTL2		40	/* Device Control 2 */
 #define  PCI_EXP_DEVCTL2_COMP_TIMEOUT	0x000f	/* Completion Timeout Value */
 #define  PCI_EXP_DEVCTL2_COMP_TMOUT_DIS	0x0010	/* Completion Timeout Disable */
-- 
cgit v1.2.3


From 80d19669ecd34423e85ca04f2210b0e42a47cb16 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Fri, 29 Jun 2018 21:26:41 -0700
Subject: net: Refactor XPS for CPUs and Rx queues

Refactor XPS code to support Tx queue selection based on
CPU(s) map or Rx queue(s) map.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cpumask.h   |  11 ++-
 include/linux/netdevice.h |  98 ++++++++++++++++++++-
 net/core/dev.c            | 211 ++++++++++++++++++++++++++++++----------------
 net/core/net-sysfs.c      |   4 +-
 4 files changed, 244 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index bf53d893ad02..57f20a0a7794 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask;
 #define cpu_active(cpu)		((cpu) == 0)
 #endif
 
-/* verify cpu argument to cpumask_* operators */
-static inline unsigned int cpumask_check(unsigned int cpu)
+static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
 {
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
-	WARN_ON_ONCE(cpu >= nr_cpumask_bits);
+	WARN_ON_ONCE(cpu >= bits);
 #endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+}
+
+/* verify cpu argument to cpumask_* operators */
+static inline unsigned int cpumask_check(unsigned int cpu)
+{
+	cpu_max_bits_warn(cpu, nr_cpumask_bits);
 	return cpu;
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c6b377a15869..8bf8d6149f79 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -731,10 +731,15 @@ struct xps_map {
  */
 struct xps_dev_maps {
 	struct rcu_head rcu;
-	struct xps_map __rcu *cpu_map[0];
+	struct xps_map __rcu *attr_map[0]; /* Either CPUs map or RXQs map */
 };
-#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +		\
+
+#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +	\
 	(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
+
+#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
+	(_rxqs * (_tcs) * sizeof(struct xps_map *)))
+
 #endif /* CONFIG_XPS */
 
 #define TC_MAX_QUEUE	16
@@ -1910,7 +1915,8 @@ struct net_device {
 	int			watchdog_timeo;
 
 #ifdef CONFIG_XPS
-	struct xps_dev_maps __rcu *xps_maps;
+	struct xps_dev_maps __rcu *xps_cpus_map;
+	struct xps_dev_maps __rcu *xps_rxqs_map;
 #endif
 #ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc __rcu	*miniq_egress;
@@ -3259,6 +3265,92 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
 #ifdef CONFIG_XPS
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 			u16 index);
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+			  u16 index, bool is_rxqs_map);
+
+/**
+ *	netif_attr_test_mask - Test a CPU or Rx queue set in a mask
+ *	@j: CPU/Rx queue index
+ *	@mask: bitmask of all cpus/rx queues
+ *	@nr_bits: number of bits in the bitmask
+ *
+ * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.
+ */
+static inline bool netif_attr_test_mask(unsigned long j,
+					const unsigned long *mask,
+					unsigned int nr_bits)
+{
+	cpu_max_bits_warn(j, nr_bits);
+	return test_bit(j, mask);
+}
+
+/**
+ *	netif_attr_test_online - Test for online CPU/Rx queue
+ *	@j: CPU/Rx queue index
+ *	@online_mask: bitmask for CPUs/Rx queues that are online
+ *	@nr_bits: number of bits in the bitmask
+ *
+ * Returns true if a CPU/Rx queue is online.
+ */
+static inline bool netif_attr_test_online(unsigned long j,
+					  const unsigned long *online_mask,
+					  unsigned int nr_bits)
+{
+	cpu_max_bits_warn(j, nr_bits);
+
+	if (online_mask)
+		return test_bit(j, online_mask);
+
+	return (j < nr_bits);
+}
+
+/**
+ *	netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask
+ *	@n: CPU/Rx queue index
+ *	@srcp: the cpumask/Rx queue mask pointer
+ *	@nr_bits: number of bits in the bitmask
+ *
+ * Returns >= nr_bits if no further CPUs/Rx queues set.
+ */
+static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp,
+					       unsigned int nr_bits)
+{
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpu_max_bits_warn(n, nr_bits);
+
+	if (srcp)
+		return find_next_bit(srcp, nr_bits, n + 1);
+
+	return n + 1;
+}
+
+/**
+ *	netif_attrmask_next_and - get the next CPU/Rx queue in *src1p & *src2p
+ *	@n: CPU/Rx queue index
+ *	@src1p: the first CPUs/Rx queues mask pointer
+ *	@src2p: the second CPUs/Rx queues mask pointer
+ *	@nr_bits: number of bits in the bitmask
+ *
+ * Returns >= nr_bits if no further CPUs/Rx queues set in both.
+ */
+static inline int netif_attrmask_next_and(int n, const unsigned long *src1p,
+					  const unsigned long *src2p,
+					  unsigned int nr_bits)
+{
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpu_max_bits_warn(n, nr_bits);
+
+	if (src1p && src2p)
+		return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
+	else if (src1p)
+		return find_next_bit(src1p, nr_bits, n + 1);
+	else if (src2p)
+		return find_next_bit(src2p, nr_bits, n + 1);
+
+	return n + 1;
+}
 #else
 static inline int netif_set_xps_queue(struct net_device *dev,
 				      const struct cpumask *mask,
diff --git a/net/core/dev.c b/net/core/dev.c
index dffed642e686..71059558dc39 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 	int pos;
 
 	if (dev_maps)
-		map = xmap_dereference(dev_maps->cpu_map[tci]);
+		map = xmap_dereference(dev_maps->attr_map[tci]);
 	if (!map)
 		return false;
 
@@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 			break;
 		}
 
-		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
+		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 		kfree_rcu(map, rcu);
 		return false;
 	}
@@ -2135,31 +2135,58 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
 	return active;
 }
 
+static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
+			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
+			   u16 offset, u16 count, bool is_rxqs_map)
+{
+	bool active = false;
+	int i, j;
+
+	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
+	     j < nr_ids;)
+		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
+					       count);
+	if (!active) {
+		if (is_rxqs_map) {
+			RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
+		} else {
+			RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
+
+			for (i = offset + (count - 1); count--; i--)
+				netdev_queue_numa_node_write(
+					netdev_get_tx_queue(dev, i),
+							NUMA_NO_NODE);
+		}
+		kfree_rcu(dev_maps, rcu);
+	}
+}
+
 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 				   u16 count)
 {
+	const unsigned long *possible_mask = NULL;
 	struct xps_dev_maps *dev_maps;
-	int cpu, i;
-	bool active = false;
+	unsigned int nr_ids;
 
 	mutex_lock(&xps_map_mutex);
-	dev_maps = xmap_dereference(dev->xps_maps);
 
-	if (!dev_maps)
-		goto out_no_maps;
-
-	for_each_possible_cpu(cpu)
-		active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
-					       offset, count);
+	dev_maps = xmap_dereference(dev->xps_rxqs_map);
+	if (dev_maps) {
+		nr_ids = dev->num_rx_queues;
+		clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset,
+			       count, true);
 
-	if (!active) {
-		RCU_INIT_POINTER(dev->xps_maps, NULL);
-		kfree_rcu(dev_maps, rcu);
 	}
 
-	for (i = offset + (count - 1); count--; i--)
-		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
-					     NUMA_NO_NODE);
+	dev_maps = xmap_dereference(dev->xps_cpus_map);
+	if (!dev_maps)
+		goto out_no_maps;
+
+	if (num_possible_cpus() > 1)
+		possible_mask = cpumask_bits(cpu_possible_mask);
+	nr_ids = nr_cpu_ids;
+	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
+		       false);
 
 out_no_maps:
 	mutex_unlock(&xps_map_mutex);
@@ -2170,8 +2197,8 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 }
 
-static struct xps_map *expand_xps_map(struct xps_map *map,
-				      int cpu, u16 index)
+static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
+				      u16 index, bool is_rxqs_map)
 {
 	struct xps_map *new_map;
 	int alloc_len = XPS_MIN_MAP_ALLOC;
@@ -2183,7 +2210,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
 		return map;
 	}
 
-	/* Need to add queue to this CPU's existing map */
+	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 	if (map) {
 		if (pos < map->alloc_len)
 			return map;
@@ -2191,9 +2218,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
 		alloc_len = map->alloc_len * 2;
 	}
 
-	/* Need to allocate new map to store queue on this CPU's map */
-	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
-			       cpu_to_node(cpu));
+	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
+	 *  map
+	 */
+	if (is_rxqs_map)
+		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
+	else
+		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+				       cpu_to_node(attr_index));
 	if (!new_map)
 		return NULL;
 
@@ -2205,14 +2237,16 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
 	return new_map;
 }
 
-int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
-			u16 index)
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+			  u16 index, bool is_rxqs_map)
 {
+	const unsigned long *online_mask = NULL, *possible_mask = NULL;
 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
-	int i, cpu, tci, numa_node_id = -2;
+	int i, j, tci, numa_node_id = -2;
 	int maps_sz, num_tc = 1, tc = 0;
 	struct xps_map *map, *new_map;
 	bool active = false;
+	unsigned int nr_ids;
 
 	if (dev->num_tc) {
 		num_tc = dev->num_tc;
@@ -2221,16 +2255,27 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 			return -EINVAL;
 	}
 
-	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
-	if (maps_sz < L1_CACHE_BYTES)
-		maps_sz = L1_CACHE_BYTES;
-
 	mutex_lock(&xps_map_mutex);
+	if (is_rxqs_map) {
+		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
+		dev_maps = xmap_dereference(dev->xps_rxqs_map);
+		nr_ids = dev->num_rx_queues;
+	} else {
+		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
+		if (num_possible_cpus() > 1) {
+			online_mask = cpumask_bits(cpu_online_mask);
+			possible_mask = cpumask_bits(cpu_possible_mask);
+		}
+		dev_maps = xmap_dereference(dev->xps_cpus_map);
+		nr_ids = nr_cpu_ids;
+	}
 
-	dev_maps = xmap_dereference(dev->xps_maps);
+	if (maps_sz < L1_CACHE_BYTES)
+		maps_sz = L1_CACHE_BYTES;
 
 	/* allocate memory for queue storage */
-	for_each_cpu_and(cpu, cpu_online_mask, mask) {
+	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
+	     j < nr_ids;) {
 		if (!new_dev_maps)
 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 		if (!new_dev_maps) {
@@ -2238,73 +2283,81 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 			return -ENOMEM;
 		}
 
-		tci = cpu * num_tc + tc;
-		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
+		tci = j * num_tc + tc;
+		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
 				 NULL;
 
-		map = expand_xps_map(map, cpu, index);
+		map = expand_xps_map(map, j, index, is_rxqs_map);
 		if (!map)
 			goto error;
 
-		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 	}
 
 	if (!new_dev_maps)
 		goto out_no_new_maps;
 
-	for_each_possible_cpu(cpu) {
+	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+	     j < nr_ids;) {
 		/* copy maps belonging to foreign traffic classes */
-		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
+		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
 			/* fill in the new device map from the old device map */
-			map = xmap_dereference(dev_maps->cpu_map[tci]);
-			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+			map = xmap_dereference(dev_maps->attr_map[tci]);
+			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 		}
 
 		/* We need to explicitly update tci as prevous loop
 		 * could break out early if dev_maps is NULL.
 		 */
-		tci = cpu * num_tc + tc;
+		tci = j * num_tc + tc;
 
-		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
-			/* add queue to CPU maps */
+		if (netif_attr_test_mask(j, mask, nr_ids) &&
+		    netif_attr_test_online(j, online_mask, nr_ids)) {
+			/* add tx-queue to CPU/rx-queue maps */
 			int pos = 0;
 
-			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+			map = xmap_dereference(new_dev_maps->attr_map[tci]);
 			while ((pos < map->len) && (map->queues[pos] != index))
 				pos++;
 
 			if (pos == map->len)
 				map->queues[map->len++] = index;
 #ifdef CONFIG_NUMA
-			if (numa_node_id == -2)
-				numa_node_id = cpu_to_node(cpu);
-			else if (numa_node_id != cpu_to_node(cpu))
-				numa_node_id = -1;
+			if (!is_rxqs_map) {
+				if (numa_node_id == -2)
+					numa_node_id = cpu_to_node(j);
+				else if (numa_node_id != cpu_to_node(j))
+					numa_node_id = -1;
+			}
 #endif
 		} else if (dev_maps) {
 			/* fill in the new device map from the old device map */
-			map = xmap_dereference(dev_maps->cpu_map[tci]);
-			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+			map = xmap_dereference(dev_maps->attr_map[tci]);
+			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 		}
 
 		/* copy maps belonging to foreign traffic classes */
 		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
 			/* fill in the new device map from the old device map */
-			map = xmap_dereference(dev_maps->cpu_map[tci]);
-			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+			map = xmap_dereference(dev_maps->attr_map[tci]);
+			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 		}
 	}
 
-	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+	if (is_rxqs_map)
+		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
+	else
+		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
 
 	/* Cleanup old maps */
 	if (!dev_maps)
 		goto out_no_old_maps;
 
-	for_each_possible_cpu(cpu) {
-		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
-			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
-			map = xmap_dereference(dev_maps->cpu_map[tci]);
+	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+	     j < nr_ids;) {
+		for (i = num_tc, tci = j * num_tc; i--; tci++) {
+			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+			map = xmap_dereference(dev_maps->attr_map[tci]);
 			if (map && map != new_map)
 				kfree_rcu(map, rcu);
 		}
@@ -2317,19 +2370,23 @@ out_no_old_maps:
 	active = true;
 
 out_no_new_maps:
-	/* update Tx queue numa node */
-	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
-				     (numa_node_id >= 0) ? numa_node_id :
-				     NUMA_NO_NODE);
+	if (!is_rxqs_map) {
+		/* update Tx queue numa node */
+		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+					     (numa_node_id >= 0) ?
+					     numa_node_id : NUMA_NO_NODE);
+	}
 
 	if (!dev_maps)
 		goto out_no_maps;
 
-	/* removes queue from unused CPUs */
-	for_each_possible_cpu(cpu) {
-		for (i = tc, tci = cpu * num_tc; i--; tci++)
+	/* removes tx-queue from unused CPUs/rx-queues */
+	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+	     j < nr_ids;) {
+		for (i = tc, tci = j * num_tc; i--; tci++)
 			active |= remove_xps_queue(dev_maps, tci, index);
-		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
+		if (!netif_attr_test_mask(j, mask, nr_ids) ||
+		    !netif_attr_test_online(j, online_mask, nr_ids))
 			active |= remove_xps_queue(dev_maps, tci, index);
 		for (i = num_tc - tc, tci++; --i; tci++)
 			active |= remove_xps_queue(dev_maps, tci, index);
@@ -2337,7 +2394,10 @@ out_no_new_maps:
 
 	/* free map if not active */
 	if (!active) {
-		RCU_INIT_POINTER(dev->xps_maps, NULL);
+		if (is_rxqs_map)
+			RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
+		else
+			RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
 		kfree_rcu(dev_maps, rcu);
 	}
 
@@ -2347,11 +2407,12 @@ out_no_maps:
 	return 0;
 error:
 	/* remove any maps that we added */
-	for_each_possible_cpu(cpu) {
-		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
-			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+	     j < nr_ids;) {
+		for (i = num_tc, tci = j * num_tc; i--; tci++) {
+			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 			map = dev_maps ?
-			      xmap_dereference(dev_maps->cpu_map[tci]) :
+			      xmap_dereference(dev_maps->attr_map[tci]) :
 			      NULL;
 			if (new_map && new_map != map)
 				kfree(new_map);
@@ -2363,6 +2424,12 @@ error:
 	kfree(new_dev_maps);
 	return -ENOMEM;
 }
+
+int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
+			u16 index)
+{
+	return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+}
 EXPORT_SYMBOL(netif_set_xps_queue);
 
 #endif
@@ -3384,7 +3451,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 	int queue_index = -1;
 
 	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_maps);
+	dev_maps = rcu_dereference(dev->xps_cpus_map);
 	if (dev_maps) {
 		unsigned int tci = skb->sender_cpu - 1;
 
@@ -3393,7 +3460,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 			tci += netdev_get_prio_tc_map(dev, skb->priority);
 		}
 
-		map = rcu_dereference(dev_maps->cpu_map[tci]);
+		map = rcu_dereference(dev_maps->attr_map[tci]);
 		if (map) {
 			if (map->len == 1)
 				queue_index = map->queues[0];
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index bb7e80f4ced3..b39987c81d53 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
 		return -ENOMEM;
 
 	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_maps);
+	dev_maps = rcu_dereference(dev->xps_cpus_map);
 	if (dev_maps) {
 		for_each_possible_cpu(cpu) {
 			int i, tci = cpu * num_tc + tc;
 			struct xps_map *map;
 
-			map = rcu_dereference(dev_maps->cpu_map[tci]);
+			map = rcu_dereference(dev_maps->attr_map[tci]);
 			if (!map)
 				continue;
 
-- 
cgit v1.2.3


From f308d7353d1f5c3ddedc784a367edc72fc51bbaa Mon Sep 17 00:00:00 2001
From: Stefan Agner <stefan@agner.ch>
Date: Sun, 24 Jun 2018 23:27:22 +0200
Subject: mtd: rawnand: add Reed-Solomon error correction algorithm

Add Reed-Solomon (RS) to the enumeration of ECC algorithms.

Signed-off-by: Stefan Agner <stefan@agner.ch>
Reviewed-by: Boris Brezillon <boris.brezillon@bootlin.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 Documentation/devicetree/bindings/mtd/nand.txt | 2 +-
 drivers/mtd/nand/raw/nand_base.c               | 1 +
 include/linux/mtd/rawnand.h                    | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/mtd/nand.txt b/Documentation/devicetree/bindings/mtd/nand.txt
index 8bb11d809429..eaef8c657aa5 100644
--- a/Documentation/devicetree/bindings/mtd/nand.txt
+++ b/Documentation/devicetree/bindings/mtd/nand.txt
@@ -25,7 +25,7 @@ Optional NAND chip properties:
 		  Deprecated values:
 		  "soft_bch": use "soft" and nand-ecc-algo instead
 - nand-ecc-algo: string, algorithm of NAND ECC.
-		 Supported values are: "hamming", "bch".
+		 Valid values are: "hamming", "bch", "rs".
 - nand-bus-width : 8 or 16 bus width if not present 8
 - nand-on-flash-bbt: boolean to enable on flash bbt option if not present false
 
diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index b01d15ec4c56..d0af5347f89d 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -5777,6 +5777,7 @@ static int of_get_nand_ecc_mode(struct device_node *np)
 static const char * const nand_ecc_algos[] = {
 	[NAND_ECC_HAMMING]	= "hamming",
 	[NAND_ECC_BCH]		= "bch",
+	[NAND_ECC_RS]		= "rs",
 };
 
 static int of_get_nand_ecc_algo(struct device_node *np)
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 3e8ec3b8a39c..2d9cb7acbc3d 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -121,6 +121,7 @@ enum nand_ecc_algo {
 	NAND_ECC_UNKNOWN,
 	NAND_ECC_HAMMING,
 	NAND_ECC_BCH,
+	NAND_ECC_RS,
 };
 
 /*
-- 
cgit v1.2.3


From f922bd798bb94e0adcce26ddd811245443173268 Mon Sep 17 00:00:00 2001
From: Stefan Agner <stefan@agner.ch>
Date: Sun, 24 Jun 2018 23:27:23 +0200
Subject: mtd: rawnand: add an option to specify NAND chip as a boot device

Allow to define a NAND chip as a boot device. This can be helpful
for the selection of the ECC algorithm and strength in case the boot
ROM supports only a subset of controller provided options.

Signed-off-by: Stefan Agner <stefan@agner.ch>
Reviewed-by: Boris Brezillon <boris.brezillon@bootlin.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 Documentation/devicetree/bindings/mtd/nand.txt | 4 ++++
 drivers/mtd/nand/raw/nand_base.c               | 3 +++
 include/linux/mtd/rawnand.h                    | 6 ++++++
 3 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/mtd/nand.txt b/Documentation/devicetree/bindings/mtd/nand.txt
index eaef8c657aa5..e949c778e983 100644
--- a/Documentation/devicetree/bindings/mtd/nand.txt
+++ b/Documentation/devicetree/bindings/mtd/nand.txt
@@ -43,6 +43,10 @@ Optional NAND chip properties:
 		     This is particularly useful when only the in-band area is
 		     used by the upper layers, and you want to make your NAND
 		     as reliable as possible.
+- nand-is-boot-medium: Whether the NAND chip is a boot medium. Drivers might use
+		       this information to select ECC algorithms supported by
+		       the boot ROM or similar restrictions.
+
 - nand-rb: shall contain the native Ready/Busy ids.
 
 The ECC strength and ECC step size properties define the correction capability
diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index d0af5347f89d..1c633f80c3e1 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -5859,6 +5859,9 @@ static int nand_dt_init(struct nand_chip *chip)
 	if (of_get_nand_bus_width(dn) == 16)
 		chip->options |= NAND_BUSWIDTH_16;
 
+	if (of_property_read_bool(dn, "nand-is-boot-medium"))
+		chip->options |= NAND_IS_BOOT_MEDIUM;
+
 	if (of_get_nand_on_flash_bbt(dn))
 		chip->bbt_options |= NAND_BBT_USE_FLASH;
 
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 2d9cb7acbc3d..80aeeca03f36 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -219,6 +219,12 @@ enum nand_ecc_algo {
  */
 #define NAND_WAIT_TCCS		0x00200000
 
+/*
+ * Whether the NAND chip is a boot medium. Drivers might use this information
+ * to select ECC algorithms supported by the boot ROM or similar restrictions.
+ */
+#define NAND_IS_BOOT_MEDIUM	0x00400000
+
 /* Options set by nand scan */
 /* Nand scan has allocated controller struct */
 #define NAND_CONTROLLER_ALLOC	0x80000000
-- 
cgit v1.2.3


From 00ce4e039ad5bded462931606c3063ff691964b7 Mon Sep 17 00:00:00 2001
From: Chris Packham <chris.packham@alliedtelesis.co.nz>
Date: Mon, 25 Jun 2018 10:44:44 +1200
Subject: mtd: rawnand: add manufacturer fixup for ONFI parameter page

This is called after the ONFI parameter page checksum is verified
and allows us to override the contents of the parameter page.

Suggested-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Reviewed-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c | 4 ++++
 include/linux/mtd/rawnand.h      | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 1c633f80c3e1..f362c09a71d7 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -5168,6 +5168,10 @@ static int nand_flash_detect_onfi(struct nand_chip *chip)
 		}
 	}
 
+	if (chip->manufacturer.desc && chip->manufacturer.desc->ops &&
+	    chip->manufacturer.desc->ops->fixup_onfi_param_page)
+		chip->manufacturer.desc->ops->fixup_onfi_param_page(chip, p);
+
 	/* Check version */
 	val = le16_to_cpu(p->revision);
 	if (val & (1 << 5))
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 80aeeca03f36..dc4538b58b84 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -785,11 +785,15 @@ nand_get_sdr_timings(const struct nand_data_interface *conf)
  *	  implementation) if any.
  * @cleanup: the ->init() function may have allocated resources, ->cleanup()
  *	     is here to let vendor specific code release those resources.
+ * @fixup_onfi_param_page: apply vendor specific fixups to the ONFI parameter
+ *			   page. This is called after the checksum is verified.
  */
 struct nand_manufacturer_ops {
 	void (*detect)(struct nand_chip *chip);
 	int (*init)(struct nand_chip *chip);
 	void (*cleanup)(struct nand_chip *chip);
+	void (*fixup_onfi_param_page)(struct nand_chip *chip,
+				      struct nand_onfi_params *p);
 };
 
 /**
-- 
cgit v1.2.3


From 872b71ff084ab125c68073d9e69acfd7095f2015 Mon Sep 17 00:00:00 2001
From: Chris Packham <chris.packham@alliedtelesis.co.nz>
Date: Mon, 25 Jun 2018 10:44:45 +1200
Subject: mtd: rawnand: add defines for ONFI version bits

Add defines for the ONFI version bits and use them in
nand_flash_detect_onfi().

Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Reviewed-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c | 10 +++++-----
 include/linux/mtd/rawnand.h      | 11 +++++++++++
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index f362c09a71d7..dbf6e80e9ab5 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -5174,15 +5174,15 @@ static int nand_flash_detect_onfi(struct nand_chip *chip)
 
 	/* Check version */
 	val = le16_to_cpu(p->revision);
-	if (val & (1 << 5))
+	if (val & ONFI_VERSION_2_3)
 		chip->parameters.onfi.version = 23;
-	else if (val & (1 << 4))
+	else if (val & ONFI_VERSION_2_2)
 		chip->parameters.onfi.version = 22;
-	else if (val & (1 << 3))
+	else if (val & ONFI_VERSION_2_1)
 		chip->parameters.onfi.version = 21;
-	else if (val & (1 << 2))
+	else if (val & ONFI_VERSION_2_0)
 		chip->parameters.onfi.version = 20;
-	else if (val & (1 << 1))
+	else if (val & ONFI_VERSION_1_0)
 		chip->parameters.onfi.version = 10;
 
 	if (!chip->parameters.onfi.version) {
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index dc4538b58b84..fbac4741da52 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -237,6 +237,17 @@ enum nand_ecc_algo {
 /* Keep gcc happy */
 struct nand_chip;
 
+/* ONFI version bits */
+#define ONFI_VERSION_1_0		BIT(1)
+#define ONFI_VERSION_2_0		BIT(2)
+#define ONFI_VERSION_2_1		BIT(3)
+#define ONFI_VERSION_2_2		BIT(4)
+#define ONFI_VERSION_2_3		BIT(5)
+#define ONFI_VERSION_3_0		BIT(6)
+#define ONFI_VERSION_3_1		BIT(7)
+#define ONFI_VERSION_3_2		BIT(8)
+#define ONFI_VERSION_4_0		BIT(9)
+
 /* ONFI features */
 #define ONFI_FEATURE_16_BIT_BUS		(1 << 0)
 #define ONFI_FEATURE_EXT_PARAM_PAGE	(1 << 7)
-- 
cgit v1.2.3


From 1b48b7202c43e41c9f05d39901567dbcad5dc307 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Thu, 3 May 2018 16:25:49 +0200
Subject: dma-fence: remove fill_driver_data callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Noticed while I was typing docs. Entirely unused.

v2: Remove reference in @timeline_value_str too. While at it clarify
why timeline_value_str has a fence parameter - we don't have an
explicit timeline structure unfortunately.

Cc: Eric Anholt <eric@anholt.net>
Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Christian König <christian.koenig@amd.com> (v1)
Cc: Christian König <christian.koenig@amd.com> (v1)
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20180503142603.28513-2-daniel.vetter@ffwll.ch
---
 include/linux/dma-fence.h | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index eb9b05aa5aea..111aefe1c956 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -217,17 +217,6 @@ struct dma_fence_ops {
 	 */
 	void (*release)(struct dma_fence *fence);
 
-	/**
-	 * @fill_driver_data:
-	 *
-	 * Callback to fill in free-form debug info.
-	 *
-	 * Returns amount of bytes filled, or negative error on failure.
-	 *
-	 * This callback is optional.
-	 */
-	int (*fill_driver_data)(struct dma_fence *fence, void *data, int size);
-
 	/**
 	 * @fence_value_str:
 	 *
@@ -242,8 +231,9 @@ struct dma_fence_ops {
 	 * @timeline_value_str:
 	 *
 	 * Fills in the current value of the timeline as a string, like the
-	 * sequence number. This should match what @fill_driver_data prints for
-	 * the most recently signalled fence (assuming no delayed signalling).
+	 * sequence number. Note that the specific fence passed to this function
+	 * should not matter, drivers should only use it to look up the
+	 * corresponding timeline structures.
 	 */
 	void (*timeline_value_str)(struct dma_fence *fence,
 				   char *str, int size);
-- 
cgit v1.2.3


From c701317a3eb8c012364a8d468f20eabf6df1ad77 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 4 May 2018 16:10:34 +0200
Subject: dma-fence: Make ->enable_signaling optional
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Many drivers have a trivial implementation for ->enable_signaling.
Let's make it optional by assuming that signalling is already
available when the callback isn't present.

v2: Don't do the trick to set the ENABLE_SIGNAL_BIT
unconditionally, it results in an expensive spinlock take for
everyone. Instead just check if the callback is present. Suggested by
Maarten.

Also move misplaced kerneldoc hunk to the right patch.

Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Reviewed-by: Christian König <christian.koenig@amd.com> (v1)
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Gustavo Padovan <gustavo@padovan.org>
Cc: linux-media@vger.kernel.org
Cc: linaro-mm-sig@lists.linaro.org
Link: https://patchwork.freedesktop.org/patch/msgid/20180504141034.27727-1-daniel.vetter@ffwll.ch
---
 drivers/dma-buf/dma-fence.c | 9 +++++----
 include/linux/dma-fence.h   | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index 4edb9fd3cf47..dd01a1720be9 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -200,7 +200,8 @@ void dma_fence_enable_sw_signaling(struct dma_fence *fence)
 
 	if (!test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
 			      &fence->flags) &&
-	    !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
+	    !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) &&
+	    fence->ops->enable_signaling) {
 		trace_dma_fence_enable_signal(fence);
 
 		spin_lock_irqsave(fence->lock, flags);
@@ -260,7 +261,7 @@ int dma_fence_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb,
 
 	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
 		ret = -ENOENT;
-	else if (!was_set) {
+	else if (!was_set && fence->ops->enable_signaling) {
 		trace_dma_fence_enable_signal(fence);
 
 		if (!fence->ops->enable_signaling(fence)) {
@@ -388,7 +389,7 @@ dma_fence_default_wait(struct dma_fence *fence, bool intr, signed long timeout)
 	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
 		goto out;
 
-	if (!was_set) {
+	if (!was_set && fence->ops->enable_signaling) {
 		trace_dma_fence_enable_signal(fence);
 
 		if (!fence->ops->enable_signaling(fence)) {
@@ -560,7 +561,7 @@ dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops,
 	       spinlock_t *lock, u64 context, unsigned seqno)
 {
 	BUG_ON(!lock);
-	BUG_ON(!ops || !ops->wait || !ops->enable_signaling ||
+	BUG_ON(!ops || !ops->wait ||
 	       !ops->get_driver_name || !ops->get_timeline_name);
 
 	kref_init(&fence->refcount);
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 111aefe1c956..c053d19e1e24 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -166,7 +166,8 @@ struct dma_fence_ops {
 	 * released when the fence is signalled (through e.g. the interrupt
 	 * handler).
 	 *
-	 * This callback is mandatory.
+	 * This callback is optional. If this callback is not present, then the
+	 * driver must always have signaling enabled.
 	 */
 	bool (*enable_signaling)(struct dma_fence *fence);
 
-- 
cgit v1.2.3


From 78c9c4dfbf8c04883941445a195276bb4bb92c76 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 26 Jun 2018 15:21:32 +0200
Subject: posix-timers: Sanitize overrun handling

The posix timer overrun handling is broken because the forwarding functions
can return a huge number of overruns which does not fit in an int. As a
consequence timer_getoverrun(2) and siginfo::si_overrun can turn into
random number generators.

The k_clock::timer_forward() callbacks return a 64 bit value now. Make
k_itimer::ti_overrun[_last] 64bit as well, so the kernel internal
accounting is correct. 3Remove the temporary (int) casts.

Add a helper function which clamps the overrun value returned to user space
via timer_getoverrun(2) or siginfo::si_overrun limited to a positive value
between 0 and INT_MAX. INT_MAX is an indicator for user space that the
overrun value has been clamped.

Reported-by: Team OWL337 <icytxw@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Link: https://lkml.kernel.org/r/20180626132705.018623573@linutronix.de
---
 include/linux/posix-timers.h   |  4 ++--
 kernel/time/posix-cpu-timers.c |  2 +-
 kernel/time/posix-timers.c     | 31 ++++++++++++++++++++-----------
 3 files changed, 23 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index c85704fcdbd2..ee7e987ea1b4 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -95,8 +95,8 @@ struct k_itimer {
 	clockid_t		it_clock;
 	timer_t			it_id;
 	int			it_active;
-	int			it_overrun;
-	int			it_overrun_last;
+	s64			it_overrun;
+	s64			it_overrun_last;
 	int			it_requeue_pending;
 	int			it_sigev_notify;
 	ktime_t			it_interval;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 5a6251ac6f7a..562cc3891b57 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -85,7 +85,7 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now)
 			continue;
 
 		timer->it.cpu.expires += incr;
-		timer->it_overrun += 1 << i;
+		timer->it_overrun += 1LL << i;
 		delta -= incr;
 	}
 }
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index db1d65963a57..3ac7295306dc 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -283,6 +283,17 @@ static __init int init_posix_timers(void)
 }
 __initcall(init_posix_timers);
 
+/*
+ * The siginfo si_overrun field and the return value of timer_getoverrun(2)
+ * are of type int. Clamp the overrun value to INT_MAX
+ */
+static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval)
+{
+	s64 sum = timr->it_overrun_last + (s64)baseval;
+
+	return sum > (s64)INT_MAX ? INT_MAX : (int)sum;
+}
+
 static void common_hrtimer_rearm(struct k_itimer *timr)
 {
 	struct hrtimer *timer = &timr->it.real.timer;
@@ -290,9 +301,8 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
 	if (!timr->it_interval)
 		return;
 
-	timr->it_overrun += (unsigned int) hrtimer_forward(timer,
-						timer->base->get_time(),
-						timr->it_interval);
+	timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
+					    timr->it_interval);
 	hrtimer_restart(timer);
 }
 
@@ -321,10 +331,10 @@ void posixtimer_rearm(struct siginfo *info)
 
 		timr->it_active = 1;
 		timr->it_overrun_last = timr->it_overrun;
-		timr->it_overrun = -1;
+		timr->it_overrun = -1LL;
 		++timr->it_requeue_pending;
 
-		info->si_overrun += timr->it_overrun_last;
+		info->si_overrun = timer_overrun_to_int(timr, info->si_overrun);
 	}
 
 	unlock_timer(timr, flags);
@@ -418,9 +428,8 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 					now = ktime_add(now, kj);
 			}
 #endif
-			timr->it_overrun += (unsigned int)
-				hrtimer_forward(timer, now,
-						timr->it_interval);
+			timr->it_overrun += hrtimer_forward(timer, now,
+							    timr->it_interval);
 			ret = HRTIMER_RESTART;
 			++timr->it_requeue_pending;
 			timr->it_active = 1;
@@ -524,7 +533,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
 	new_timer->it_id = (timer_t) new_timer_id;
 	new_timer->it_clock = which_clock;
 	new_timer->kclock = kc;
-	new_timer->it_overrun = -1;
+	new_timer->it_overrun = -1LL;
 
 	if (event) {
 		rcu_read_lock();
@@ -702,7 +711,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 	 * expiry time forward by intervals, so expiry is > now.
 	 */
 	if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none))
-		timr->it_overrun += (int)kc->timer_forward(timr, now);
+		timr->it_overrun += kc->timer_forward(timr, now);
 
 	remaining = kc->timer_remaining(timr, now);
 	/* Return 0 only, when the timer is expired and not pending */
@@ -791,7 +800,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
 	if (!timr)
 		return -EINVAL;
 
-	overrun = timr->it_overrun_last;
+	overrun = timer_overrun_to_int(timr, 0);
 	unlock_timer(timr, flags);
 
 	return overrun;
-- 
cgit v1.2.3


From 05739375f1c0a1048fea8b9c4cb54d9e4a891938 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Fri, 29 Jun 2018 14:59:40 +0200
Subject: ASoC: pxa-ssp: remove .set_pll() and .set_clkdiv() callbacks

The .set_pll() and .set_clkdiv() callbacks are considered legacy and should
not be used anymore. In order to support PXA boards on DT platforms, remove
them and let the code figure out the correct dividers and PLL base
frequencies itself.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/pxa2xx_ssp.h |   8 +++
 sound/soc/pxa/pxa-ssp.c    | 146 ++++++++++++++++++++++-----------------------
 2 files changed, 81 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h
index 03a7ca46735b..13b4244d44c1 100644
--- a/include/linux/pxa2xx_ssp.h
+++ b/include/linux/pxa2xx_ssp.h
@@ -171,6 +171,14 @@
 #define SSACD_SCDB		(1 << 3)	/* SSPSYSCLK Divider Bypass */
 #define SSACD_ACPS(x)		((x) << 4)	/* Audio clock PLL select */
 #define SSACD_ACDS(x)		((x) << 0)	/* Audio clock divider select */
+#define SSACD_ACDS_1		(0)
+#define SSACD_ACDS_2		(1)
+#define SSACD_ACDS_4		(2)
+#define SSACD_ACDS_8		(3)
+#define SSACD_ACDS_16		(4)
+#define SSACD_ACDS_32		(5)
+#define SSACD_SCDB_4X		(0)
+#define SSACD_SCDB_1X		(1)
 #define SSACD_SCDX8		(1 << 7)	/* SYSCLK division ratio select */
 
 /* LPSS SSP */
diff --git a/sound/soc/pxa/pxa-ssp.c b/sound/soc/pxa/pxa-ssp.c
index 01d54697ede4..f8339bb01251 100644
--- a/sound/soc/pxa/pxa-ssp.c
+++ b/sound/soc/pxa/pxa-ssp.c
@@ -41,6 +41,7 @@
  */
 struct ssp_priv {
 	struct ssp_device *ssp;
+	unsigned long ssp_clk;
 	unsigned int sysclk;
 	unsigned int dai_fmt;
 	unsigned int configured_dai_fmt;
@@ -192,21 +193,6 @@ static void pxa_ssp_set_scr(struct ssp_device *ssp, u32 div)
 	pxa_ssp_write_reg(ssp, SSCR0, sscr0);
 }
 
-/**
- * pxa_ssp_get_clkdiv - get SSP clock divider
- */
-static u32 pxa_ssp_get_scr(struct ssp_device *ssp)
-{
-	u32 sscr0 = pxa_ssp_read_reg(ssp, SSCR0);
-	u32 div;
-
-	if (ssp->type == PXA25x_SSP)
-		div = ((sscr0 >> 8) & 0xff) * 2 + 2;
-	else
-		div = ((sscr0 >> 8) & 0xfff) + 1;
-	return div;
-}
-
 /*
  * Set the SSP ports SYSCLK.
  */
@@ -262,67 +248,18 @@ static int pxa_ssp_set_dai_sysclk(struct snd_soc_dai *cpu_dai,
 	return 0;
 }
 
-/*
- * Set the SSP clock dividers.
- */
-static int pxa_ssp_set_dai_clkdiv(struct snd_soc_dai *cpu_dai,
-	int div_id, int div)
-{
-	struct ssp_priv *priv = snd_soc_dai_get_drvdata(cpu_dai);
-	struct ssp_device *ssp = priv->ssp;
-	int val;
-
-	switch (div_id) {
-	case PXA_SSP_AUDIO_DIV_ACDS:
-		val = (pxa_ssp_read_reg(ssp, SSACD) & ~0x7) | SSACD_ACDS(div);
-		pxa_ssp_write_reg(ssp, SSACD, val);
-		break;
-	case PXA_SSP_AUDIO_DIV_SCDB:
-		val = pxa_ssp_read_reg(ssp, SSACD);
-		val &= ~SSACD_SCDB;
-		if (ssp->type == PXA3xx_SSP)
-			val &= ~SSACD_SCDX8;
-		switch (div) {
-		case PXA_SSP_CLK_SCDB_1:
-			val |= SSACD_SCDB;
-			break;
-		case PXA_SSP_CLK_SCDB_4:
-			break;
-		case PXA_SSP_CLK_SCDB_8:
-			if (ssp->type == PXA3xx_SSP)
-				val |= SSACD_SCDX8;
-			else
-				return -EINVAL;
-			break;
-		default:
-			return -EINVAL;
-		}
-		pxa_ssp_write_reg(ssp, SSACD, val);
-		break;
-	case PXA_SSP_DIV_SCR:
-		pxa_ssp_set_scr(ssp, div);
-		break;
-	default:
-		return -ENODEV;
-	}
-
-	return 0;
-}
-
 /*
  * Configure the PLL frequency pxa27x and (afaik - pxa320 only)
  */
-static int pxa_ssp_set_dai_pll(struct snd_soc_dai *cpu_dai, int pll_id,
-	int source, unsigned int freq_in, unsigned int freq_out)
+static int pxa_ssp_set_pll(struct ssp_priv *priv, unsigned int freq)
 {
-	struct ssp_priv *priv = snd_soc_dai_get_drvdata(cpu_dai);
 	struct ssp_device *ssp = priv->ssp;
 	u32 ssacd = pxa_ssp_read_reg(ssp, SSACD) & ~0x70;
 
 	if (ssp->type == PXA3xx_SSP)
 		pxa_ssp_write_reg(ssp, SSACDD, 0);
 
-	switch (freq_out) {
+	switch (freq) {
 	case 5622000:
 		break;
 	case 11345000:
@@ -353,7 +290,7 @@ static int pxa_ssp_set_dai_pll(struct snd_soc_dai *cpu_dai, int pll_id,
 			u64 tmp = 19968;
 
 			tmp *= 1000000;
-			do_div(tmp, freq_out);
+			do_div(tmp, freq);
 			val = tmp;
 
 			val = (val << 16) | 64;
@@ -363,7 +300,7 @@ static int pxa_ssp_set_dai_pll(struct snd_soc_dai *cpu_dai, int pll_id,
 
 			dev_dbg(&ssp->pdev->dev,
 				"Using SSACDD %x to supply %uHz\n",
-				val, freq_out);
+				val, freq);
 			break;
 		}
 
@@ -568,6 +505,24 @@ static int pxa_ssp_configure_dai_fmt(struct ssp_priv *priv)
 	return 0;
 }
 
+struct pxa_ssp_clock_mode {
+	int rate;
+	int pll;
+	u8 acds;
+	u8 scdb;
+};
+
+static const struct pxa_ssp_clock_mode pxa_ssp_clock_modes[] = {
+	{ .rate =  8000, .pll = 32842000, .acds = SSACD_ACDS_32, .scdb = SSACD_SCDB_4X },
+	{ .rate = 11025, .pll =  5622000, .acds = SSACD_ACDS_4,  .scdb = SSACD_SCDB_4X },
+	{ .rate = 16000, .pll = 32842000, .acds = SSACD_ACDS_16, .scdb = SSACD_SCDB_4X },
+	{ .rate = 22050, .pll =  5622000, .acds = SSACD_ACDS_2,  .scdb = SSACD_SCDB_4X },
+	{ .rate = 44100, .pll = 11345000, .acds = SSACD_ACDS_2,  .scdb = SSACD_SCDB_4X },
+	{ .rate = 48000, .pll = 12235000, .acds = SSACD_ACDS_2,  .scdb = SSACD_SCDB_4X },
+	{ .rate = 96000, .pll = 12235000, .acds = SSACD_ACDS_4,  .scdb = SSACD_SCDB_1X },
+	{}
+};
+
 /*
  * Set the SSP audio DMA parameters and sample size.
  * Can be called multiple times by oss emulation.
@@ -579,11 +534,12 @@ static int pxa_ssp_hw_params(struct snd_pcm_substream *substream,
 	struct ssp_priv *priv = snd_soc_dai_get_drvdata(cpu_dai);
 	struct ssp_device *ssp = priv->ssp;
 	int chn = params_channels(params);
-	u32 sscr0;
-	u32 sspsp;
+	u32 sscr0, sspsp;
 	int width = snd_pcm_format_physical_width(params_format(params));
 	int ttsa = pxa_ssp_read_reg(ssp, SSTSA) & 0xf;
 	struct snd_dmaengine_dai_dma_data *dma_data;
+	int rate = params_rate(params);
+	int bclk = rate * chn * (width / 8);
 	int ret;
 
 	dma_data = snd_soc_dai_get_dma_data(cpu_dai, substream);
@@ -623,11 +579,57 @@ static int pxa_ssp_hw_params(struct snd_pcm_substream *substream,
 	}
 	pxa_ssp_write_reg(ssp, SSCR0, sscr0);
 
+	if (sscr0 & SSCR0_ACS) {
+		ret = pxa_ssp_set_pll(priv, bclk);
+
+		/*
+		 * If we were able to generate the bclk directly,
+		 * all is fine. Otherwise, look up the closest rate
+		 * from the table and also set the dividers.
+		 */
+
+		if (ret < 0) {
+			const struct pxa_ssp_clock_mode *m;
+			int ssacd, acds;
+
+			for (m = pxa_ssp_clock_modes; m->rate; m++) {
+				if (m->rate == rate)
+					break;
+			}
+
+			if (!m->rate)
+				return -EINVAL;
+
+			acds = m->acds;
+
+			/* The values in the table are for 16 bits */
+			if (width == 32)
+				acds--;
+
+			ret = pxa_ssp_set_pll(priv, bclk);
+			if (ret < 0)
+				return ret;
+
+			ssacd = pxa_ssp_read_reg(ssp, SSACD);
+			ssacd &= ~(SSACD_ACDS(7) | SSACD_SCDB_1X);
+			ssacd |= SSACD_ACDS(m->acds);
+			ssacd |= m->scdb;
+			pxa_ssp_write_reg(ssp, SSACD, ssacd);
+		}
+	} else if (sscr0 & SSCR0_ECS) {
+		/*
+		 * For setups with external clocking, the PLL and its diviers
+		 * are not active. Instead, the SCR bits in SSCR0 can be used
+		 * to divide the clock.
+		 */
+		pxa_ssp_set_scr(ssp, bclk / rate);
+	}
+
 	switch (priv->dai_fmt & SND_SOC_DAIFMT_FORMAT_MASK) {
 	case SND_SOC_DAIFMT_I2S:
 	       sspsp = pxa_ssp_read_reg(ssp, SSPSP);
 
-		if ((pxa_ssp_get_scr(ssp) == 4) && (width == 16)) {
+		if (((priv->sysclk / bclk) == 64) && (width == 16)) {
 			/* This is a special case where the bitclk is 64fs
 			 * and we're not dealing with 2*32 bits of audio
 			 * samples.
@@ -812,8 +814,6 @@ static const struct snd_soc_dai_ops pxa_ssp_dai_ops = {
 	.trigger	= pxa_ssp_trigger,
 	.hw_params	= pxa_ssp_hw_params,
 	.set_sysclk	= pxa_ssp_set_dai_sysclk,
-	.set_clkdiv	= pxa_ssp_set_dai_clkdiv,
-	.set_pll	= pxa_ssp_set_dai_pll,
 	.set_fmt	= pxa_ssp_set_dai_fmt,
 	.set_tdm_slot	= pxa_ssp_set_dai_tdm_slot,
 	.set_tristate	= pxa_ssp_set_dai_tristate,
-- 
cgit v1.2.3


From 88763a5cf80ca59a7c3bea32681ce8f697d9995f Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 26 Jun 2018 12:53:29 +0200
Subject: powercap / idle_inject: Add an idle injection framework

Initially, the cpu_cooling device for ARM was changed by adding a new
policy inserting idle cycles. The intel_powerclamp driver does a
similar action.

Instead of implementing idle injections privately in the cpu_cooling
device, move the idle injection code in a dedicated framework and give
the opportunity to other frameworks to make use of it.

The framework relies on the smpboot kthreads which handles via its
main loop the common code for hotplugging and [un]parking.

This code was previously tested with the cpu cooling device and went
through several iterations. It results now in split code and API
exported in the header file. It was tested with the cpu cooling device
with success.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
[ rjw: Rewrite of all comments ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/Kconfig       |  10 ++
 drivers/powercap/Makefile      |   1 +
 drivers/powercap/idle_inject.c | 356 +++++++++++++++++++++++++++++++++++++++++
 include/linux/idle_inject.h    |  29 ++++
 4 files changed, 396 insertions(+)
 create mode 100644 drivers/powercap/idle_inject.c
 create mode 100644 include/linux/idle_inject.h

(limited to 'include/linux')

diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig
index 85727ef6ce8e..6ac27e5908f5 100644
--- a/drivers/powercap/Kconfig
+++ b/drivers/powercap/Kconfig
@@ -29,4 +29,14 @@ config INTEL_RAPL
 	  controller, CPU core (Power Plance 0), graphics uncore (Power Plane
 	  1), etc.
 
+config IDLE_INJECT
+	bool "Idle injection framework"
+	depends on CPU_IDLE
+	default n
+	help
+	  This enables support for the idle injection framework. It
+	  provides a way to force idle periods on a set of specified
+	  CPUs for power capping. Idle period can be injected
+	  synchronously on a set of specified CPUs or alternatively
+	  on a per CPU basis.
 endif
diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile
index 0a21ef31372b..1b328854b36e 100644
--- a/drivers/powercap/Makefile
+++ b/drivers/powercap/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_POWERCAP)	+= powercap_sys.o
 obj-$(CONFIG_INTEL_RAPL) += intel_rapl.o
+obj-$(CONFIG_IDLE_INJECT) += idle_inject.o
diff --git a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c
new file mode 100644
index 000000000000..24ff2a068978
--- /dev/null
+++ b/drivers/powercap/idle_inject.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018 Linaro Limited
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * The idle injection framework provides a way to force CPUs to enter idle
+ * states for a specified fraction of time over a specified period.
+ *
+ * It relies on the smpboot kthreads feature providing common code for CPU
+ * hotplug and thread [un]parking.
+ *
+ * All of the kthreads used for idle injection are created at init time.
+ *
+ * Next, the users of the the idle injection framework provide a cpumask via
+ * its register function. The kthreads will be synchronized with respect to
+ * this cpumask.
+ *
+ * The idle + run duration is specified via separate helpers and that allows
+ * idle injection to be started.
+ *
+ * The idle injection kthreads will call play_idle() with the idle duration
+ * specified as per the above.
+ *
+ * After all of them have been woken up, a timer is set to start the next idle
+ * injection cycle.
+ *
+ * The timer interrupt handler will wake up the idle injection kthreads for
+ * all of the CPUs in the cpumask provided by the user.
+ *
+ * Idle injection is stopped synchronously and no leftover idle injection
+ * kthread activity after its completion is guaranteed.
+ *
+ * It is up to the user of this framework to provide a lock for higher-level
+ * synchronization to prevent race conditions like starting idle injection
+ * while unregistering from the framework.
+ */
+#define pr_fmt(fmt) "ii_dev: " fmt
+
+#include <linux/cpu.h>
+#include <linux/hrtimer.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/smpboot.h>
+
+#include <uapi/linux/sched/types.h>
+
+/**
+ * struct idle_inject_thread - task on/off switch structure
+ * @tsk: task injecting the idle cycles
+ * @should_run: whether or not to run the task (for the smpboot kthread API)
+ */
+struct idle_inject_thread {
+	struct task_struct *tsk;
+	int should_run;
+};
+
+/**
+ * struct idle_inject_device - idle injection data
+ * @timer: idle injection period timer
+ * @idle_duration_ms: duration of CPU idle time to inject
+ * @run_duration_ms: duration of CPU run time to allow
+ * @cpumask: mask of CPUs affected by idle injection
+ */
+struct idle_inject_device {
+	struct hrtimer timer;
+	unsigned int idle_duration_ms;
+	unsigned int run_duration_ms;
+	unsigned long int cpumask[0];
+};
+
+static DEFINE_PER_CPU(struct idle_inject_thread, idle_inject_thread);
+static DEFINE_PER_CPU(struct idle_inject_device *, idle_inject_device);
+
+/**
+ * idle_inject_wakeup - Wake up idle injection threads
+ * @ii_dev: target idle injection device
+ *
+ * Every idle injection task associated with the given idle injection device
+ * and running on an online CPU will be woken up.
+ */
+static void idle_inject_wakeup(struct idle_inject_device *ii_dev)
+{
+	struct idle_inject_thread *iit;
+	unsigned int cpu;
+
+	for_each_cpu_and(cpu, to_cpumask(ii_dev->cpumask), cpu_online_mask) {
+		iit = per_cpu_ptr(&idle_inject_thread, cpu);
+		iit->should_run = 1;
+		wake_up_process(iit->tsk);
+	}
+}
+
+/**
+ * idle_inject_timer_fn - idle injection timer function
+ * @timer: idle injection hrtimer
+ *
+ * This function is called when the idle injection timer expires.  It wakes up
+ * idle injection tasks associated with the timer and they, in turn, invoke
+ * play_idle() to inject a specified amount of CPU idle time.
+ *
+ * Return: HRTIMER_RESTART.
+ */
+static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
+{
+	unsigned int duration_ms;
+	struct idle_inject_device *ii_dev =
+		container_of(timer, struct idle_inject_device, timer);
+
+	duration_ms = READ_ONCE(ii_dev->run_duration_ms);
+	duration_ms += READ_ONCE(ii_dev->idle_duration_ms);
+
+	idle_inject_wakeup(ii_dev);
+
+	hrtimer_forward_now(timer, ms_to_ktime(duration_ms));
+
+	return HRTIMER_RESTART;
+}
+
+/**
+ * idle_inject_fn - idle injection work function
+ * @cpu: the CPU owning the task
+ *
+ * This function calls play_idle() to inject a specified amount of CPU idle
+ * time.
+ */
+static void idle_inject_fn(unsigned int cpu)
+{
+	struct idle_inject_device *ii_dev;
+	struct idle_inject_thread *iit;
+
+	ii_dev = per_cpu(idle_inject_device, cpu);
+	iit = per_cpu_ptr(&idle_inject_thread, cpu);
+
+	/*
+	 * Let the smpboot main loop know that the task should not run again.
+	 */
+	iit->should_run = 0;
+
+	play_idle(READ_ONCE(ii_dev->idle_duration_ms));
+}
+
+/**
+ * idle_inject_set_duration - idle and run duration update helper
+ * @run_duration_ms: CPU run time to allow in milliseconds
+ * @idle_duration_ms: CPU idle time to inject in milliseconds
+ */
+void idle_inject_set_duration(struct idle_inject_device *ii_dev,
+			      unsigned int run_duration_ms,
+			      unsigned int idle_duration_ms)
+{
+	if (run_duration_ms && idle_duration_ms) {
+		WRITE_ONCE(ii_dev->run_duration_ms, run_duration_ms);
+		WRITE_ONCE(ii_dev->idle_duration_ms, idle_duration_ms);
+	}
+}
+
+/**
+ * idle_inject_get_duration - idle and run duration retrieval helper
+ * @run_duration_ms: memory location to store the current CPU run time
+ * @idle_duration_ms: memory location to store the current CPU idle time
+ */
+void idle_inject_get_duration(struct idle_inject_device *ii_dev,
+			      unsigned int *run_duration_ms,
+			      unsigned int *idle_duration_ms)
+{
+	*run_duration_ms = READ_ONCE(ii_dev->run_duration_ms);
+	*idle_duration_ms = READ_ONCE(ii_dev->idle_duration_ms);
+}
+
+/**
+ * idle_inject_start - start idle injections
+ * @ii_dev: idle injection control device structure
+ *
+ * The function starts idle injection by first waking up all of the idle
+ * injection kthreads associated with @ii_dev to let them inject CPU idle time
+ * sets up a timer to start the next idle injection period.
+ *
+ * Return: -EINVAL if the CPU idle or CPU run time is not set or 0 on success.
+ */
+int idle_inject_start(struct idle_inject_device *ii_dev)
+{
+	unsigned int idle_duration_ms = READ_ONCE(ii_dev->idle_duration_ms);
+	unsigned int run_duration_ms = READ_ONCE(ii_dev->run_duration_ms);
+
+	if (!idle_duration_ms || !run_duration_ms)
+		return -EINVAL;
+
+	pr_debug("Starting injecting idle cycles on CPUs '%*pbl'\n",
+		 cpumask_pr_args(to_cpumask(ii_dev->cpumask)));
+
+	idle_inject_wakeup(ii_dev);
+
+	hrtimer_start(&ii_dev->timer,
+		      ms_to_ktime(idle_duration_ms + run_duration_ms),
+		      HRTIMER_MODE_REL);
+
+	return 0;
+}
+
+/**
+ * idle_inject_stop - stops idle injections
+ * @ii_dev: idle injection control device structure
+ *
+ * The function stops idle injection and waits for the threads to finish work.
+ * If CPU idle time is being injected when this function runs, then it will
+ * wait until the end of the cycle.
+ *
+ * When it returns, there is no more idle injection kthread activity.  The
+ * kthreads are scheduled out and the periodic timer is off.
+ */
+void idle_inject_stop(struct idle_inject_device *ii_dev)
+{
+	struct idle_inject_thread *iit;
+	unsigned int cpu;
+
+	pr_debug("Stopping idle injection on CPUs '%*pbl'\n",
+		 cpumask_pr_args(to_cpumask(ii_dev->cpumask)));
+
+	hrtimer_cancel(&ii_dev->timer);
+
+	/*
+	 * Stopping idle injection requires all of the idle injection kthreads
+	 * associated with the given cpumask to be parked and stay that way, so
+	 * prevent CPUs from going online at this point.  Any CPUs going online
+	 * after the loop below will be covered by clearing the should_run flag
+	 * that will cause the smpboot main loop to schedule them out.
+	 */
+	cpu_hotplug_disable();
+
+	/*
+	 * Iterate over all (online + offline) CPUs here in case one of them
+	 * goes offline with the should_run flag set so as to prevent its idle
+	 * injection kthread from running when the CPU goes online again after
+	 * the ii_dev has been freed.
+	 */
+	for_each_cpu(cpu, to_cpumask(ii_dev->cpumask)) {
+		iit = per_cpu_ptr(&idle_inject_thread, cpu);
+		iit->should_run = 0;
+
+		wait_task_inactive(iit->tsk, 0);
+	}
+
+	cpu_hotplug_enable();
+}
+
+/**
+ * idle_inject_setup - prepare the current task for idle injection
+ * @cpu: not used
+ *
+ * Called once, this function is in charge of setting the current task's
+ * scheduler parameters to make it an RT task.
+ */
+static void idle_inject_setup(unsigned int cpu)
+{
+	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+}
+
+/**
+ * idle_inject_should_run - function helper for the smpboot API
+ * @cpu: CPU the kthread is running on
+ *
+ * Return: whether or not the thread can run.
+ */
+static int idle_inject_should_run(unsigned int cpu)
+{
+	struct idle_inject_thread *iit =
+		per_cpu_ptr(&idle_inject_thread, cpu);
+
+	return iit->should_run;
+}
+
+/**
+ * idle_inject_register - initialize idle injection on a set of CPUs
+ * @cpumask: CPUs to be affected by idle injection
+ *
+ * This function creates an idle injection control device structure for the
+ * given set of CPUs and initializes the timer associated with it.  It does not
+ * start any injection cycles.
+ *
+ * Return: NULL if memory allocation fails, idle injection control device
+ * pointer on success.
+ */
+struct idle_inject_device *idle_inject_register(struct cpumask *cpumask)
+{
+	struct idle_inject_device *ii_dev;
+	int cpu, cpu_rb;
+
+	ii_dev = kzalloc(sizeof(*ii_dev) + cpumask_size(), GFP_KERNEL);
+	if (!ii_dev)
+		return NULL;
+
+	cpumask_copy(to_cpumask(ii_dev->cpumask), cpumask);
+	hrtimer_init(&ii_dev->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	ii_dev->timer.function = idle_inject_timer_fn;
+
+	for_each_cpu(cpu, to_cpumask(ii_dev->cpumask)) {
+
+		if (per_cpu(idle_inject_device, cpu)) {
+			pr_err("cpu%d is already registered\n", cpu);
+			goto out_rollback;
+		}
+
+		per_cpu(idle_inject_device, cpu) = ii_dev;
+	}
+
+	return ii_dev;
+
+out_rollback:
+	for_each_cpu(cpu_rb, to_cpumask(ii_dev->cpumask)) {
+		if (cpu == cpu_rb)
+			break;
+		per_cpu(idle_inject_device, cpu_rb) = NULL;
+	}
+
+	kfree(ii_dev);
+
+	return NULL;
+}
+
+/**
+ * idle_inject_unregister - unregister idle injection control device
+ * @ii_dev: idle injection control device to unregister
+ *
+ * The function stops idle injection for the given control device,
+ * unregisters its kthreads and frees memory allocated when that device was
+ * created.
+ */
+void idle_inject_unregister(struct idle_inject_device *ii_dev)
+{
+	unsigned int cpu;
+
+	idle_inject_stop(ii_dev);
+
+	for_each_cpu(cpu, to_cpumask(ii_dev->cpumask))
+		per_cpu(idle_inject_device, cpu) = NULL;
+
+	kfree(ii_dev);
+}
+
+static struct smp_hotplug_thread idle_inject_threads = {
+	.store = &idle_inject_thread.tsk,
+	.setup = idle_inject_setup,
+	.thread_fn = idle_inject_fn,
+	.thread_comm = "idle_inject/%u",
+	.thread_should_run = idle_inject_should_run,
+};
+
+static int __init idle_inject_init(void)
+{
+	return smpboot_register_percpu_thread(&idle_inject_threads);
+}
+early_initcall(idle_inject_init);
diff --git a/include/linux/idle_inject.h b/include/linux/idle_inject.h
new file mode 100644
index 000000000000..bdc0293fb6cb
--- /dev/null
+++ b/include/linux/idle_inject.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Linaro Ltd
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ */
+#ifndef __IDLE_INJECT_H__
+#define __IDLE_INJECT_H__
+
+/* private idle injection device structure */
+struct idle_inject_device;
+
+struct idle_inject_device *idle_inject_register(struct cpumask *cpumask);
+
+void idle_inject_unregister(struct idle_inject_device *ii_dev);
+
+int idle_inject_start(struct idle_inject_device *ii_dev);
+
+void idle_inject_stop(struct idle_inject_device *ii_dev);
+
+void idle_inject_set_duration(struct idle_inject_device *ii_dev,
+				 unsigned int run_duration_ms,
+				 unsigned int idle_duration_ms);
+
+void idle_inject_get_duration(struct idle_inject_device *ii_dev,
+				 unsigned int *run_duration_ms,
+				 unsigned int *idle_duration_ms);
+#endif /* __IDLE_INJECT_H__ */
-- 
cgit v1.2.3


From a7ca13826e478f9b201eb2f9f20de0b978a82ad9 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 29 Jun 2018 14:11:19 +1000
Subject: gpio: aspeed: Add interfaces for co-processor to grab GPIOs

On the Aspeed chip, the GPIOs can be under control of the ARM
chip or of the ColdFire coprocessor. (There's a third command
source, the LPC bus, which we don't use or support yet).

The control of which master is allowed to modify a given
GPIO is per-bank (8 GPIOs).

Unfortunately, systems already exist for which we want to
use GPIOs of both sources in the same bank.

This provides an API exported by the gpio-aspeed driver
that an aspeed coprocessor driver can use to "grab" some
GPIOs for use by the coprocessor, and allow the coprocessor
driver to provide callbacks for arbitrating access.

Once at least one GPIO of a given bank has been "grabbed"
by the coprocessor, the entire bank is marked as being
under coprocessor control. It's command source is switched
to the coprocessor.

If the ARM then tries to write to a GPIO in such a marked bank,
the provided callbacks are used to request access from the
coprocessor driver, which is responsible to doing whatever
is necessary to "pause" the coprocessor or prevent it from
trying to use the GPIOs while the ARM is doing its accesses.

During that time, the command source for the bank is temporarily
switched back to the ARM.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Reviewed-by: Andrew Jeffery <andrew@aj.id.au>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-aspeed.c  | 251 ++++++++++++++++++++++++++++++++++++++++----
 include/linux/gpio/aspeed.h |  15 +++
 2 files changed, 246 insertions(+), 20 deletions(-)
 create mode 100644 include/linux/gpio/aspeed.h

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c
index b3968f66b1d2..1e00f4045f9d 100644
--- a/drivers/gpio/gpio-aspeed.c
+++ b/drivers/gpio/gpio-aspeed.c
@@ -12,6 +12,7 @@
 #include <asm/div64.h>
 #include <linux/clk.h>
 #include <linux/gpio/driver.h>
+#include <linux/gpio/aspeed.h>
 #include <linux/hashtable.h>
 #include <linux/init.h>
 #include <linux/io.h>
@@ -22,6 +23,15 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 
+/*
+ * These two headers aren't meant to be used by GPIO drivers. We need
+ * them in order to access gpio_chip_hwgpio() which we need to implement
+ * the aspeed specific API which allows the coprocessor to request
+ * access to some GPIOs and to arbitrate between coprocessor and ARM.
+ */
+#include <linux/gpio/consumer.h>
+#include "gpiolib.h"
+
 struct aspeed_bank_props {
 	unsigned int bank;
 	u32 input;
@@ -56,6 +66,7 @@ struct aspeed_gpio {
 	struct clk *clk;
 
 	u32 *dcache;
+	u8 *cf_copro_bankmap;
 };
 
 struct aspeed_gpio_bank {
@@ -83,6 +94,9 @@ struct aspeed_gpio_bank {
 
 static const int debounce_timers[4] = { 0x00, 0x50, 0x54, 0x58 };
 
+static const struct aspeed_gpio_copro_ops *copro_ops;
+static void *copro_data;
+
 static const struct aspeed_gpio_bank aspeed_gpio_banks[] = {
 	{
 		.val_regs = 0x0000,
@@ -323,6 +337,50 @@ static void aspeed_gpio_change_cmd_source(struct aspeed_gpio *gpio,
 	iowrite32(reg, c0);
 }
 
+static bool aspeed_gpio_copro_request(struct aspeed_gpio *gpio,
+				      unsigned int offset)
+{
+	const struct aspeed_gpio_bank *bank = to_bank(offset);
+
+	if (!copro_ops || !gpio->cf_copro_bankmap)
+		return false;
+	if (!gpio->cf_copro_bankmap[offset >> 3])
+		return false;
+	if (!copro_ops->request_access)
+		return false;
+
+	/* Pause the coprocessor */
+	copro_ops->request_access(copro_data);
+
+	/* Change command source back to ARM */
+	aspeed_gpio_change_cmd_source(gpio, bank, offset >> 3, GPIO_CMDSRC_ARM);
+
+	/* Update cache */
+	gpio->dcache[GPIO_BANK(offset)] = ioread32(bank_reg(gpio, bank, reg_rdata));
+
+	return true;
+}
+
+static void aspeed_gpio_copro_release(struct aspeed_gpio *gpio,
+				      unsigned int offset)
+{
+	const struct aspeed_gpio_bank *bank = to_bank(offset);
+
+	if (!copro_ops || !gpio->cf_copro_bankmap)
+		return;
+	if (!gpio->cf_copro_bankmap[offset >> 3])
+		return;
+	if (!copro_ops->release_access)
+		return;
+
+	/* Change command source back to ColdFire */
+	aspeed_gpio_change_cmd_source(gpio, bank, offset >> 3,
+				      GPIO_CMDSRC_COLDFIRE);
+
+	/* Restart the coprocessor */
+	copro_ops->release_access(copro_data);
+}
+
 static int aspeed_gpio_get(struct gpio_chip *gc, unsigned int offset)
 {
 	struct aspeed_gpio *gpio = gpiochip_get_data(gc);
@@ -356,11 +414,15 @@ static void aspeed_gpio_set(struct gpio_chip *gc, unsigned int offset,
 {
 	struct aspeed_gpio *gpio = gpiochip_get_data(gc);
 	unsigned long flags;
+	bool copro;
 
 	spin_lock_irqsave(&gpio->lock, flags);
+	copro = aspeed_gpio_copro_request(gpio, offset);
 
 	__aspeed_gpio_set(gc, offset, val);
 
+	if (copro)
+		aspeed_gpio_copro_release(gpio, offset);
 	spin_unlock_irqrestore(&gpio->lock, flags);
 }
 
@@ -368,7 +430,9 @@ static int aspeed_gpio_dir_in(struct gpio_chip *gc, unsigned int offset)
 {
 	struct aspeed_gpio *gpio = gpiochip_get_data(gc);
 	const struct aspeed_gpio_bank *bank = to_bank(offset);
+	void __iomem *addr = bank_reg(gpio, bank, reg_dir);
 	unsigned long flags;
+	bool copro;
 	u32 reg;
 
 	if (!have_input(gpio, offset))
@@ -376,8 +440,13 @@ static int aspeed_gpio_dir_in(struct gpio_chip *gc, unsigned int offset)
 
 	spin_lock_irqsave(&gpio->lock, flags);
 
-	reg = ioread32(bank_reg(gpio, bank, reg_dir));
-	iowrite32(reg & ~GPIO_BIT(offset), bank_reg(gpio, bank, reg_dir));
+	reg = ioread32(addr);
+	reg &= ~GPIO_BIT(offset);
+
+	copro = aspeed_gpio_copro_request(gpio, offset);
+	iowrite32(reg, addr);
+	if (copro)
+		aspeed_gpio_copro_release(gpio, offset);
 
 	spin_unlock_irqrestore(&gpio->lock, flags);
 
@@ -389,7 +458,9 @@ static int aspeed_gpio_dir_out(struct gpio_chip *gc,
 {
 	struct aspeed_gpio *gpio = gpiochip_get_data(gc);
 	const struct aspeed_gpio_bank *bank = to_bank(offset);
+	void __iomem *addr = bank_reg(gpio, bank, reg_dir);
 	unsigned long flags;
+	bool copro;
 	u32 reg;
 
 	if (!have_output(gpio, offset))
@@ -397,10 +468,15 @@ static int aspeed_gpio_dir_out(struct gpio_chip *gc,
 
 	spin_lock_irqsave(&gpio->lock, flags);
 
+	reg = ioread32(addr);
+	reg |= GPIO_BIT(offset);
+
+	copro = aspeed_gpio_copro_request(gpio, offset);
 	__aspeed_gpio_set(gc, offset, val);
-	reg = ioread32(bank_reg(gpio, bank, reg_dir));
-	iowrite32(reg | GPIO_BIT(offset), bank_reg(gpio, bank, reg_dir));
+	iowrite32(reg, addr);
 
+	if (copro)
+		aspeed_gpio_copro_release(gpio, offset);
 	spin_unlock_irqrestore(&gpio->lock, flags);
 
 	return 0;
@@ -430,24 +506,23 @@ static int aspeed_gpio_get_direction(struct gpio_chip *gc, unsigned int offset)
 }
 
 static inline int irqd_to_aspeed_gpio_data(struct irq_data *d,
-		struct aspeed_gpio **gpio,
-		const struct aspeed_gpio_bank **bank,
-		u32 *bit)
+					   struct aspeed_gpio **gpio,
+					   const struct aspeed_gpio_bank **bank,
+					   u32 *bit, int *offset)
 {
-	int offset;
 	struct aspeed_gpio *internal;
 
-	offset = irqd_to_hwirq(d);
+	*offset = irqd_to_hwirq(d);
 
 	internal = irq_data_get_irq_chip_data(d);
 
 	/* This might be a bit of a questionable place to check */
-	if (!have_irq(internal, offset))
+	if (!have_irq(internal, *offset))
 		return -ENOTSUPP;
 
 	*gpio = internal;
-	*bank = to_bank(offset);
-	*bit = GPIO_BIT(offset);
+	*bank = to_bank(*offset);
+	*bit = GPIO_BIT(*offset);
 
 	return 0;
 }
@@ -458,17 +533,23 @@ static void aspeed_gpio_irq_ack(struct irq_data *d)
 	struct aspeed_gpio *gpio;
 	unsigned long flags;
 	void __iomem *status_addr;
+	int rc, offset;
+	bool copro;
 	u32 bit;
-	int rc;
 
-	rc = irqd_to_aspeed_gpio_data(d, &gpio, &bank, &bit);
+	rc = irqd_to_aspeed_gpio_data(d, &gpio, &bank, &bit, &offset);
 	if (rc)
 		return;
 
 	status_addr = bank_reg(gpio, bank, reg_irq_status);
 
 	spin_lock_irqsave(&gpio->lock, flags);
+	copro = aspeed_gpio_copro_request(gpio, offset);
+
 	iowrite32(bit, status_addr);
+
+	if (copro)
+		aspeed_gpio_copro_release(gpio, offset);
 	spin_unlock_irqrestore(&gpio->lock, flags);
 }
 
@@ -479,15 +560,17 @@ static void aspeed_gpio_irq_set_mask(struct irq_data *d, bool set)
 	unsigned long flags;
 	u32 reg, bit;
 	void __iomem *addr;
-	int rc;
+	int rc, offset;
+	bool copro;
 
-	rc = irqd_to_aspeed_gpio_data(d, &gpio, &bank, &bit);
+	rc = irqd_to_aspeed_gpio_data(d, &gpio, &bank, &bit, &offset);
 	if (rc)
 		return;
 
 	addr = bank_reg(gpio, bank, reg_irq_enable);
 
 	spin_lock_irqsave(&gpio->lock, flags);
+	copro = aspeed_gpio_copro_request(gpio, offset);
 
 	reg = ioread32(addr);
 	if (set)
@@ -496,6 +579,8 @@ static void aspeed_gpio_irq_set_mask(struct irq_data *d, bool set)
 		reg &= ~bit;
 	iowrite32(reg, addr);
 
+	if (copro)
+		aspeed_gpio_copro_release(gpio, offset);
 	spin_unlock_irqrestore(&gpio->lock, flags);
 }
 
@@ -520,9 +605,10 @@ static int aspeed_gpio_set_type(struct irq_data *d, unsigned int type)
 	struct aspeed_gpio *gpio;
 	unsigned long flags;
 	void __iomem *addr;
-	int rc;
+	int rc, offset;
+	bool copro;
 
-	rc = irqd_to_aspeed_gpio_data(d, &gpio, &bank, &bit);
+	rc = irqd_to_aspeed_gpio_data(d, &gpio, &bank, &bit, &offset);
 	if (rc)
 		return -EINVAL;
 
@@ -548,6 +634,7 @@ static int aspeed_gpio_set_type(struct irq_data *d, unsigned int type)
 	}
 
 	spin_lock_irqsave(&gpio->lock, flags);
+	copro = aspeed_gpio_copro_request(gpio, offset);
 
 	addr = bank_reg(gpio, bank, reg_irq_type0);
 	reg = ioread32(addr);
@@ -564,6 +651,8 @@ static int aspeed_gpio_set_type(struct irq_data *d, unsigned int type)
 	reg = (reg & ~bit) | type2;
 	iowrite32(reg, addr);
 
+	if (copro)
+		aspeed_gpio_copro_release(gpio, offset);
 	spin_unlock_irqrestore(&gpio->lock, flags);
 
 	irq_set_handler_locked(d, handler);
@@ -658,11 +747,14 @@ static int aspeed_gpio_reset_tolerance(struct gpio_chip *chip,
 	struct aspeed_gpio *gpio = gpiochip_get_data(chip);
 	unsigned long flags;
 	void __iomem *treg;
+	bool copro;
 	u32 val;
 
 	treg = bank_reg(gpio, to_bank(offset), reg_tolerance);
 
 	spin_lock_irqsave(&gpio->lock, flags);
+	copro = aspeed_gpio_copro_request(gpio, offset);
+
 	val = readl(treg);
 
 	if (enable)
@@ -671,6 +763,9 @@ static int aspeed_gpio_reset_tolerance(struct gpio_chip *chip,
 		val &= ~GPIO_BIT(offset);
 
 	writel(val, treg);
+
+	if (copro)
+		aspeed_gpio_copro_release(gpio, offset);
 	spin_unlock_irqrestore(&gpio->lock, flags);
 
 	return 0;
@@ -766,6 +861,9 @@ static void configure_timer(struct aspeed_gpio *gpio, unsigned int offset,
 	void __iomem *addr;
 	u32 val;
 
+	/* Note: Debounce timer isn't under control of the command
+	 * source registers, so no need to sync with the coprocessor
+	 */
 	addr = bank_reg(gpio, bank, reg_debounce_sel1);
 	val = ioread32(addr);
 	iowrite32((val & ~mask) | GPIO_SET_DEBOUNCE1(timer, offset), addr);
@@ -912,6 +1010,111 @@ static int aspeed_gpio_set_config(struct gpio_chip *chip, unsigned int offset,
 	return -ENOTSUPP;
 }
 
+/**
+ * aspeed_gpio_copro_set_ops - Sets the callbacks used for handhsaking with
+ *                             the coprocessor for shared GPIO banks
+ * @ops: The callbacks
+ * @data: Pointer passed back to the callbacks
+ */
+int aspeed_gpio_copro_set_ops(const struct aspeed_gpio_copro_ops *ops, void *data)
+{
+	copro_data = data;
+	copro_ops = ops;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(aspeed_gpio_copro_set_ops);
+
+/**
+ * aspeed_gpio_copro_grab_gpio - Mark a GPIO used by the coprocessor. The entire
+ *                               bank gets marked and any access from the ARM will
+ *                               result in handshaking via callbacks.
+ * @desc: The GPIO to be marked
+ * @vreg_offset: If non-NULL, returns the value register offset in the GPIO space
+ * @dreg_offset: If non-NULL, returns the data latch register offset in the GPIO space
+ * @bit: If non-NULL, returns the bit number of the GPIO in the registers
+ */
+int aspeed_gpio_copro_grab_gpio(struct gpio_desc *desc,
+				u16 *vreg_offset, u16 *dreg_offset, u8 *bit)
+{
+	struct gpio_chip *chip = gpiod_to_chip(desc);
+	struct aspeed_gpio *gpio = gpiochip_get_data(chip);
+	int rc = 0, bindex, offset = gpio_chip_hwgpio(desc);
+	const struct aspeed_gpio_bank *bank = to_bank(offset);
+	unsigned long flags;
+
+	if (!gpio->cf_copro_bankmap)
+		gpio->cf_copro_bankmap = kzalloc(gpio->config->nr_gpios >> 3, GFP_KERNEL);
+	if (!gpio->cf_copro_bankmap)
+		return -ENOMEM;
+	if (offset < 0 || offset > gpio->config->nr_gpios)
+		return -EINVAL;
+	bindex = offset >> 3;
+
+	spin_lock_irqsave(&gpio->lock, flags);
+
+	/* Sanity check, this shouldn't happen */
+	if (gpio->cf_copro_bankmap[bindex] == 0xff) {
+		rc = -EIO;
+		goto bail;
+	}
+	gpio->cf_copro_bankmap[bindex]++;
+
+	/* Switch command source */
+	if (gpio->cf_copro_bankmap[bindex] == 1)
+		aspeed_gpio_change_cmd_source(gpio, bank, bindex,
+					      GPIO_CMDSRC_COLDFIRE);
+
+	if (vreg_offset)
+		*vreg_offset = bank->val_regs;
+	if (dreg_offset)
+		*dreg_offset = bank->rdata_reg;
+	if (bit)
+		*bit = GPIO_OFFSET(offset);
+ bail:
+	spin_unlock_irqrestore(&gpio->lock, flags);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(aspeed_gpio_copro_grab_gpio);
+
+/**
+ * aspeed_gpio_copro_release_gpio - Unmark a GPIO used by the coprocessor.
+ * @desc: The GPIO to be marked
+ */
+int aspeed_gpio_copro_release_gpio(struct gpio_desc *desc)
+{
+	struct gpio_chip *chip = gpiod_to_chip(desc);
+	struct aspeed_gpio *gpio = gpiochip_get_data(chip);
+	int rc = 0, bindex, offset = gpio_chip_hwgpio(desc);
+	const struct aspeed_gpio_bank *bank = to_bank(offset);
+	unsigned long flags;
+
+	if (!gpio->cf_copro_bankmap)
+		return -ENXIO;
+
+	if (offset < 0 || offset > gpio->config->nr_gpios)
+		return -EINVAL;
+	bindex = offset >> 3;
+
+	spin_lock_irqsave(&gpio->lock, flags);
+
+	/* Sanity check, this shouldn't happen */
+	if (gpio->cf_copro_bankmap[bindex] == 0) {
+		rc = -EIO;
+		goto bail;
+	}
+	gpio->cf_copro_bankmap[bindex]--;
+
+	/* Switch command source */
+	if (gpio->cf_copro_bankmap[bindex] == 0)
+		aspeed_gpio_change_cmd_source(gpio, bank, bindex,
+					      GPIO_CMDSRC_ARM);
+ bail:
+	spin_unlock_irqrestore(&gpio->lock, flags);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(aspeed_gpio_copro_release_gpio);
+
 /*
  * Any banks not specified in a struct aspeed_bank_props array are assumed to
  * have the properties:
@@ -1002,10 +1205,18 @@ static int __init aspeed_gpio_probe(struct platform_device *pdev)
 	if (!gpio->dcache)
 		return -ENOMEM;
 
-	/* Populate it with initial values read from the HW */
+	/*
+	 * Populate it with initial values read from the HW and switch
+	 * all command sources to the ARM by default
+	 */
 	for (i = 0; i < banks; i++) {
-		void __iomem *addr = bank_reg(gpio, &aspeed_gpio_banks[i], reg_rdata);
+		const struct aspeed_gpio_bank *bank = &aspeed_gpio_banks[i];
+		void __iomem *addr = bank_reg(gpio, bank, reg_rdata);
 		gpio->dcache[i] = ioread32(addr);
+		aspeed_gpio_change_cmd_source(gpio, bank, 0, GPIO_CMDSRC_ARM);
+		aspeed_gpio_change_cmd_source(gpio, bank, 1, GPIO_CMDSRC_ARM);
+		aspeed_gpio_change_cmd_source(gpio, bank, 2, GPIO_CMDSRC_ARM);
+		aspeed_gpio_change_cmd_source(gpio, bank, 3, GPIO_CMDSRC_ARM);
 	}
 
 	rc = devm_gpiochip_add_data(&pdev->dev, &gpio->chip, gpio);
diff --git a/include/linux/gpio/aspeed.h b/include/linux/gpio/aspeed.h
new file mode 100644
index 000000000000..1bfb3cdc86d0
--- /dev/null
+++ b/include/linux/gpio/aspeed.h
@@ -0,0 +1,15 @@
+#ifndef __GPIO_ASPEED_H
+#define __GPIO_ASPEED_H
+
+struct aspeed_gpio_copro_ops {
+	int (*request_access)(void *data);
+	int (*release_access)(void *data);
+};
+
+int aspeed_gpio_copro_grab_gpio(struct gpio_desc *desc,
+				u16 *vreg_offset, u16 *dreg_offset, u8 *bit);
+int aspeed_gpio_copro_release_gpio(struct gpio_desc *desc);
+int aspeed_gpio_copro_set_ops(const struct aspeed_gpio_copro_ops *ops, void *data);
+
+
+#endif /* __GPIO_ASPEED_H */
-- 
cgit v1.2.3


From 60a866685006bae0c3db20e4bae31887eb452ff1 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Tue, 26 Jun 2018 06:49:08 -0300
Subject: gpio.h: fix location of gpio legacy documentation

The location of this doc file was moved. Change its reference
accordingly.

Fixes: 7ee2c13080c9 ("Documentation: gpio: Move legacy documentation to driver-api")
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/gpio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 91ed23468530..39745b8bdd65 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -14,7 +14,7 @@
 
 #include <linux/errno.h>
 
-/* see Documentation/gpio/gpio-legacy.txt */
+/* see Documentation/driver-api/gpio/legacy.rst */
 
 /* make these flag values available regardless of GPIO kconfig options */
 #define GPIOF_DIR_OUT	(0 << 0)
-- 
cgit v1.2.3


From 9cf57731b63e37ed995b46690adc604891a9a28f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 7 Jun 2018 10:52:03 +0200
Subject: watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work

Oleg suggested to replace the "watchdog/%u" threads with
cpu_stop_work. That removes one thread per CPU while at the same time
fixes softlockup vs SCHED_DEADLINE.

But more importantly, it does away with the single
smpboot_update_cpumask_percpu_thread() user, which allows
cleanups/shrinkage of the smpboot interface.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpuhotplug.h |   1 +
 include/linux/nmi.h        |   5 ++
 kernel/cpu.c               |   5 ++
 kernel/watchdog.c          | 137 ++++++++++++++++++++-------------------------
 4 files changed, 71 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 8796ba387152..4cf06a64bc02 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -164,6 +164,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
 	CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
 	CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
+	CPUHP_AP_WATCHDOG_ONLINE,
 	CPUHP_AP_WORKQUEUE_ONLINE,
 	CPUHP_AP_RCUTREE_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index b8d868d23e79..80664bbeca43 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -33,10 +33,15 @@ extern int sysctl_hardlockup_all_cpu_backtrace;
 #define sysctl_hardlockup_all_cpu_backtrace 0
 #endif /* !CONFIG_SMP */
 
+extern int lockup_detector_online_cpu(unsigned int cpu);
+extern int lockup_detector_offline_cpu(unsigned int cpu);
+
 #else /* CONFIG_LOCKUP_DETECTOR */
 static inline void lockup_detector_init(void) { }
 static inline void lockup_detector_soft_poweroff(void) { }
 static inline void lockup_detector_cleanup(void) { }
+#define lockup_detector_online_cpu	NULL
+#define lockup_detector_offline_cpu	NULL
 #endif /* !CONFIG_LOCKUP_DETECTOR */
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0db8938fbb23..191097c45fb1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1344,6 +1344,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.startup.single		= perf_event_init_cpu,
 		.teardown.single	= perf_event_exit_cpu,
 	},
+	[CPUHP_AP_WATCHDOG_ONLINE] = {
+		.name			= "lockup_detector:online",
+		.startup.single		= lockup_detector_online_cpu,
+		.teardown.single	= lockup_detector_offline_cpu,
+	},
 	[CPUHP_AP_WORKQUEUE_ONLINE] = {
 		.name			= "workqueue:online",
 		.startup.single		= workqueue_online_cpu,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 576d18045811..b81f777838d5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -18,18 +18,14 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
-#include <linux/smpboot.h>
-#include <linux/sched/rt.h>
-#include <uapi/linux/sched/types.h>
 #include <linux/tick.h>
-#include <linux/workqueue.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/isolation.h>
+#include <linux/stop_machine.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
-#include <linux/kthread.h>
 
 static DEFINE_MUTEX(watchdog_mutex);
 
@@ -169,11 +165,10 @@ static void lockup_detector_update_enable(void)
 unsigned int __read_mostly softlockup_panic =
 			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
 
-static bool softlockup_threads_initialized __read_mostly;
+static bool softlockup_initialized __read_mostly;
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
-static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
 static DEFINE_PER_CPU(bool, soft_watchdog_warn);
@@ -335,6 +330,25 @@ static void watchdog_interrupt_count(void)
 	__this_cpu_inc(hrtimer_interrupts);
 }
 
+/*
+ * The watchdog thread function - touches the timestamp.
+ *
+ * It only runs once every sample_period seconds (4 seconds by
+ * default) to reset the softlockup timestamp. If this gets delayed
+ * for more than 2*watchdog_thresh seconds then the debug-printout
+ * triggers in watchdog_timer_fn().
+ */
+static int softlockup_fn(void *data)
+{
+	__this_cpu_write(soft_lockup_hrtimer_cnt,
+			 __this_cpu_read(hrtimer_interrupts));
+	__touch_watchdog();
+
+	return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
+
 /* watchdog kicker functions */
 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 {
@@ -350,7 +364,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	watchdog_interrupt_count();
 
 	/* kick the softlockup detector */
-	wake_up_process(__this_cpu_read(softlockup_watchdog));
+	stop_one_cpu_nowait(smp_processor_id(),
+			softlockup_fn, NULL,
+			this_cpu_ptr(&softlockup_stop_work));
 
 	/* .. and repeat */
 	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
@@ -448,17 +464,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	return HRTIMER_RESTART;
 }
 
-static void watchdog_set_prio(unsigned int policy, unsigned int prio)
-{
-	struct sched_param param = { .sched_priority = prio };
-
-	sched_setscheduler(current, policy, &param);
-}
-
 static void watchdog_enable(unsigned int cpu)
 {
 	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 
+	WARN_ON_ONCE(cpu != smp_processor_id());
+
 	/*
 	 * Start the timer first to prevent the NMI watchdog triggering
 	 * before the timer has a chance to fire.
@@ -473,15 +484,14 @@ static void watchdog_enable(unsigned int cpu)
 	/* Enable the perf event */
 	if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
 		watchdog_nmi_enable(cpu);
-
-	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
 }
 
 static void watchdog_disable(unsigned int cpu)
 {
 	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 
-	watchdog_set_prio(SCHED_NORMAL, 0);
+	WARN_ON_ONCE(cpu != smp_processor_id());
+
 	/*
 	 * Disable the perf event first. That prevents that a large delay
 	 * between disabling the timer and disabling the perf event causes
@@ -491,77 +501,63 @@ static void watchdog_disable(unsigned int cpu)
 	hrtimer_cancel(hrtimer);
 }
 
-static void watchdog_cleanup(unsigned int cpu, bool online)
+static int softlockup_stop_fn(void *data)
 {
-	watchdog_disable(cpu);
+	watchdog_disable(smp_processor_id());
+	return 0;
 }
 
-static int watchdog_should_run(unsigned int cpu)
+static void softlockup_stop_all(void)
 {
-	return __this_cpu_read(hrtimer_interrupts) !=
-		__this_cpu_read(soft_lockup_hrtimer_cnt);
+	int cpu;
+
+	if (!softlockup_initialized)
+		return;
+
+	for_each_cpu(cpu, &watchdog_allowed_mask)
+		smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);
+
+	cpumask_clear(&watchdog_allowed_mask);
 }
 
-/*
- * The watchdog thread function - touches the timestamp.
- *
- * It only runs once every sample_period seconds (4 seconds by
- * default) to reset the softlockup timestamp. If this gets delayed
- * for more than 2*watchdog_thresh seconds then the debug-printout
- * triggers in watchdog_timer_fn().
- */
-static void watchdog(unsigned int cpu)
+static int softlockup_start_fn(void *data)
 {
-	__this_cpu_write(soft_lockup_hrtimer_cnt,
-			 __this_cpu_read(hrtimer_interrupts));
-	__touch_watchdog();
+	watchdog_enable(smp_processor_id());
+	return 0;
 }
 
-static struct smp_hotplug_thread watchdog_threads = {
-	.store			= &softlockup_watchdog,
-	.thread_should_run	= watchdog_should_run,
-	.thread_fn		= watchdog,
-	.thread_comm		= "watchdog/%u",
-	.setup			= watchdog_enable,
-	.cleanup		= watchdog_cleanup,
-	.park			= watchdog_disable,
-	.unpark			= watchdog_enable,
-};
-
-static void softlockup_update_smpboot_threads(void)
+static void softlockup_start_all(void)
 {
-	lockdep_assert_held(&watchdog_mutex);
-
-	if (!softlockup_threads_initialized)
-		return;
+	int cpu;
 
-	smpboot_update_cpumask_percpu_thread(&watchdog_threads,
-					     &watchdog_allowed_mask);
+	cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
+	for_each_cpu(cpu, &watchdog_allowed_mask)
+		smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
 }
 
-/* Temporarily park all watchdog threads */
-static void softlockup_park_all_threads(void)
+int lockup_detector_online_cpu(unsigned int cpu)
 {
-	cpumask_clear(&watchdog_allowed_mask);
-	softlockup_update_smpboot_threads();
+	watchdog_enable(cpu);
+	return 0;
 }
 
-/* Unpark enabled threads */
-static void softlockup_unpark_threads(void)
+int lockup_detector_offline_cpu(unsigned int cpu)
 {
-	cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
-	softlockup_update_smpboot_threads();
+	watchdog_disable(cpu);
+	return 0;
 }
 
 static void lockup_detector_reconfigure(void)
 {
 	cpus_read_lock();
 	watchdog_nmi_stop();
-	softlockup_park_all_threads();
+
+	softlockup_stop_all();
 	set_sample_period();
 	lockup_detector_update_enable();
 	if (watchdog_enabled && watchdog_thresh)
-		softlockup_unpark_threads();
+		softlockup_start_all();
+
 	watchdog_nmi_start();
 	cpus_read_unlock();
 	/*
@@ -580,8 +576,6 @@ static void lockup_detector_reconfigure(void)
  */
 static __init void lockup_detector_setup(void)
 {
-	int ret;
-
 	/*
 	 * If sysctl is off and watchdog got disabled on the command line,
 	 * nothing to do here.
@@ -592,24 +586,13 @@ static __init void lockup_detector_setup(void)
 	    !(watchdog_enabled && watchdog_thresh))
 		return;
 
-	ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
-						     &watchdog_allowed_mask);
-	if (ret) {
-		pr_err("Failed to initialize soft lockup detector threads\n");
-		return;
-	}
-
 	mutex_lock(&watchdog_mutex);
-	softlockup_threads_initialized = true;
 	lockup_detector_reconfigure();
+	softlockup_initialized = true;
 	mutex_unlock(&watchdog_mutex);
 }
 
 #else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static inline int watchdog_park_threads(void) { return 0; }
-static inline void watchdog_unpark_threads(void) { }
-static inline int watchdog_enable_all_cpus(void) { return 0; }
-static inline void watchdog_disable_all_cpus(void) { }
 static void lockup_detector_reconfigure(void)
 {
 	cpus_read_lock();
-- 
cgit v1.2.3


From 167a88677b05d6a810f23b871cfb2b5db1808e60 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 7 Jun 2018 10:53:01 +0200
Subject: smpboot: Remove cpumask from the API

Now that the sole use of the whole smpboot_*cpumask() API is gone,
remove it.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/smpboot.h | 15 +-------------
 kernel/smpboot.c        | 54 +++++--------------------------------------------
 2 files changed, 6 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index c174844cf663..d0884b525001 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -25,8 +25,6 @@ struct smpboot_thread_data;
  *			parked (cpu offline)
  * @unpark:		Optional unpark function, called when the thread is
  *			unparked (cpu online)
- * @cpumask:		Internal state.  To update which threads are unparked,
- *			call smpboot_update_cpumask_percpu_thread().
  * @selfparking:	Thread is not parked by the park function.
  * @thread_comm:	The base name of the thread
  */
@@ -40,23 +38,12 @@ struct smp_hotplug_thread {
 	void				(*cleanup)(unsigned int cpu, bool online);
 	void				(*park)(unsigned int cpu);
 	void				(*unpark)(unsigned int cpu);
-	cpumask_var_t			cpumask;
 	bool				selfparking;
 	const char			*thread_comm;
 };
 
-int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
-					   const struct cpumask *cpumask);
-
-static inline int
-smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
-{
-	return smpboot_register_percpu_thread_cpumask(plug_thread,
-						      cpu_possible_mask);
-}
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
 
 void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
-void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
-					  const struct cpumask *);
 
 #endif
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 5043e7433f4b..c230c2dd48e1 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -238,8 +238,7 @@ int smpboot_unpark_threads(unsigned int cpu)
 
 	mutex_lock(&smpboot_threads_lock);
 	list_for_each_entry(cur, &hotplug_threads, list)
-		if (cpumask_test_cpu(cpu, cur->cpumask))
-			smpboot_unpark_thread(cur, cpu);
+		smpboot_unpark_thread(cur, cpu);
 	mutex_unlock(&smpboot_threads_lock);
 	return 0;
 }
@@ -280,34 +279,26 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
 }
 
 /**
- * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
+ * smpboot_register_percpu_thread - Register a per_cpu thread related
  * 					    to hotplug
  * @plug_thread:	Hotplug thread descriptor
- * @cpumask:		The cpumask where threads run
  *
  * Creates and starts the threads on all online cpus.
  */
-int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
-					   const struct cpumask *cpumask)
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
 {
 	unsigned int cpu;
 	int ret = 0;
 
-	if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
-		return -ENOMEM;
-	cpumask_copy(plug_thread->cpumask, cpumask);
-
 	get_online_cpus();
 	mutex_lock(&smpboot_threads_lock);
 	for_each_online_cpu(cpu) {
 		ret = __smpboot_create_thread(plug_thread, cpu);
 		if (ret) {
 			smpboot_destroy_threads(plug_thread);
-			free_cpumask_var(plug_thread->cpumask);
 			goto out;
 		}
-		if (cpumask_test_cpu(cpu, cpumask))
-			smpboot_unpark_thread(plug_thread, cpu);
+		smpboot_unpark_thread(plug_thread, cpu);
 	}
 	list_add(&plug_thread->list, &hotplug_threads);
 out:
@@ -315,7 +306,7 @@ out:
 	put_online_cpus();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
 
 /**
  * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
@@ -331,44 +322,9 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
 	smpboot_destroy_threads(plug_thread);
 	mutex_unlock(&smpboot_threads_lock);
 	put_online_cpus();
-	free_cpumask_var(plug_thread->cpumask);
 }
 EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
 
-/**
- * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
- * @plug_thread:	Hotplug thread descriptor
- * @new:		Revised mask to use
- *
- * The cpumask field in the smp_hotplug_thread must not be updated directly
- * by the client, but only by calling this function.
- * This function can only be called on a registered smp_hotplug_thread.
- */
-void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
-					  const struct cpumask *new)
-{
-	struct cpumask *old = plug_thread->cpumask;
-	static struct cpumask tmp;
-	unsigned int cpu;
-
-	lockdep_assert_cpus_held();
-	mutex_lock(&smpboot_threads_lock);
-
-	/* Park threads that were exclusively enabled on the old mask. */
-	cpumask_andnot(&tmp, old, new);
-	for_each_cpu_and(cpu, &tmp, cpu_online_mask)
-		smpboot_park_thread(plug_thread, cpu);
-
-	/* Unpark threads that are exclusively enabled on the new mask. */
-	cpumask_andnot(&tmp, new, old);
-	for_each_cpu_and(cpu, &tmp, cpu_online_mask)
-		smpboot_unpark_thread(plug_thread, cpu);
-
-	cpumask_copy(old, new);
-
-	mutex_unlock(&smpboot_threads_lock);
-}
-
 static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
 
 /*
-- 
cgit v1.2.3


From 55f036ca7e74b85e34958af3d22121c656796413 Mon Sep 17 00:00:00 2001
From: Peter Ziljstra <peterz@infradead.org>
Date: Fri, 15 Jun 2018 10:07:12 +0200
Subject: locking: WW mutex cleanup

Make the WW mutex code more readable by adding comments, splitting up
functions and pointing out that we're actually using the Wait-Die
algorithm.

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Gustavo Padovan <gustavo@padovan.org>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Sean Paul <seanpaul@chromium.org>
Cc: David Airlie <airlied@linux.ie>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-media@vger.kernel.org
Cc: linaro-mm-sig@lists.linaro.org
Co-authored-by: Thomas Hellstrom <thellstrom@vmware.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/locking/ww-mutex-design.txt |  12 +-
 include/linux/ww_mutex.h                  |  28 ++---
 kernel/locking/mutex.c                    | 202 ++++++++++++++++++------------
 3 files changed, 145 insertions(+), 97 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/locking/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt
index 34c3a1b50b9a..2fd7f2a2af21 100644
--- a/Documentation/locking/ww-mutex-design.txt
+++ b/Documentation/locking/ww-mutex-design.txt
@@ -32,10 +32,10 @@ the oldest task) wins, and the one with the higher reservation id (i.e. the
 younger task) unlocks all of the buffers that it has already locked, and then
 tries again.
 
-In the RDBMS literature this deadlock handling approach is called wait/wound:
+In the RDBMS literature this deadlock handling approach is called wait/die:
 The older tasks waits until it can acquire the contended lock. The younger tasks
 needs to back off and drop all the locks it is currently holding, i.e. the
-younger task is wounded.
+younger task dies.
 
 Concepts
 --------
@@ -56,9 +56,9 @@ Furthermore there are three different class of w/w lock acquire functions:
 
 * Normal lock acquisition with a context, using ww_mutex_lock.
 
-* Slowpath lock acquisition on the contending lock, used by the wounded task
-  after having dropped all already acquired locks. These functions have the
-  _slow postfix.
+* Slowpath lock acquisition on the contending lock, used by the task that just
+  killed its transaction after having dropped all already acquired locks.
+  These functions have the _slow postfix.
 
   From a simple semantics point-of-view the _slow functions are not strictly
   required, since simply calling the normal ww_mutex_lock functions on the
@@ -220,7 +220,7 @@ mutexes are a natural fit for such a case for two reasons:
 
 Note that this approach differs in two important ways from the above methods:
 - Since the list of objects is dynamically constructed (and might very well be
-  different when retrying due to hitting the -EDEADLK wound condition) there's
+  different when retrying due to hitting the -EDEADLK die condition) there's
   no need to keep any object on a persistent list when it's not locked. We can
   therefore move the list_head into the object itself.
 - On the other hand the dynamic object list construction also means that the -EALREADY return
diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 39fda195bf78..f82fce2229c8 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -6,7 +6,7 @@
  *
  *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *
- * Wound/wait implementation:
+ * Wait/Die implementation:
  *  Copyright (C) 2013 Canonical Ltd.
  *
  * This file contains the main data structure and API definitions.
@@ -28,9 +28,9 @@ struct ww_class {
 struct ww_acquire_ctx {
 	struct task_struct *task;
 	unsigned long stamp;
-	unsigned acquired;
+	unsigned int acquired;
 #ifdef CONFIG_DEBUG_MUTEXES
-	unsigned done_acquire;
+	unsigned int done_acquire;
 	struct ww_class *ww_class;
 	struct ww_mutex *contending_lock;
 #endif
@@ -38,8 +38,8 @@ struct ww_acquire_ctx {
 	struct lockdep_map dep_map;
 #endif
 #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
-	unsigned deadlock_inject_interval;
-	unsigned deadlock_inject_countdown;
+	unsigned int deadlock_inject_interval;
+	unsigned int deadlock_inject_countdown;
 #endif
 };
 
@@ -102,7 +102,7 @@ static inline void ww_mutex_init(struct ww_mutex *lock,
  *
  * Context-based w/w mutex acquiring can be done in any order whatsoever within
  * a given lock class. Deadlocks will be detected and handled with the
- * wait/wound logic.
+ * wait/die logic.
  *
  * Mixing of context-based w/w mutex acquiring and single w/w mutex locking can
  * result in undetected deadlocks and is so forbidden. Mixing different contexts
@@ -195,13 +195,13 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
  * Lock the w/w mutex exclusively for this task.
  *
  * Deadlocks within a given w/w class of locks are detected and handled with the
- * wait/wound algorithm. If the lock isn't immediately avaiable this function
+ * wait/die algorithm. If the lock isn't immediately available this function
  * will either sleep until it is (wait case). Or it selects the current context
- * for backing off by returning -EDEADLK (wound case). Trying to acquire the
+ * for backing off by returning -EDEADLK (die case). Trying to acquire the
  * same lock with the same context twice is also detected and signalled by
  * returning -EALREADY. Returns 0 if the mutex was successfully acquired.
  *
- * In the wound case the caller must release all currently held w/w mutexes for
+ * In the die case the caller must release all currently held w/w mutexes for
  * the given context and then wait for this contending lock to be available by
  * calling ww_mutex_lock_slow. Alternatively callers can opt to not acquire this
  * lock and proceed with trying to acquire further w/w mutexes (e.g. when
@@ -226,14 +226,14 @@ extern int /* __must_check */ ww_mutex_lock(struct ww_mutex *lock, struct ww_acq
  * Lock the w/w mutex exclusively for this task.
  *
  * Deadlocks within a given w/w class of locks are detected and handled with the
- * wait/wound algorithm. If the lock isn't immediately avaiable this function
+ * wait/die algorithm. If the lock isn't immediately available this function
  * will either sleep until it is (wait case). Or it selects the current context
- * for backing off by returning -EDEADLK (wound case). Trying to acquire the
+ * for backing off by returning -EDEADLK (die case). Trying to acquire the
  * same lock with the same context twice is also detected and signalled by
  * returning -EALREADY. Returns 0 if the mutex was successfully acquired. If a
  * signal arrives while waiting for the lock then this function returns -EINTR.
  *
- * In the wound case the caller must release all currently held w/w mutexes for
+ * In the die case the caller must release all currently held w/w mutexes for
  * the given context and then wait for this contending lock to be available by
  * calling ww_mutex_lock_slow_interruptible. Alternatively callers can opt to
  * not acquire this lock and proceed with trying to acquire further w/w mutexes
@@ -256,7 +256,7 @@ extern int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock,
  * @lock: the mutex to be acquired
  * @ctx: w/w acquire context
  *
- * Acquires a w/w mutex with the given context after a wound case. This function
+ * Acquires a w/w mutex with the given context after a die case. This function
  * will sleep until the lock becomes available.
  *
  * The caller must have released all w/w mutexes already acquired with the
@@ -290,7 +290,7 @@ ww_mutex_lock_slow(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
  * @lock: the mutex to be acquired
  * @ctx: w/w acquire context
  *
- * Acquires a w/w mutex with the given context after a wound case. This function
+ * Acquires a w/w mutex with the given context after a die case. This function
  * will sleep until the lock becomes available and returns 0 when the lock has
  * been acquired. If a signal arrives while waiting for the lock then this
  * function returns -EINTR.
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index f44f658ae629..cfe48419b7d0 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -244,6 +244,17 @@ void __sched mutex_lock(struct mutex *lock)
 EXPORT_SYMBOL(mutex_lock);
 #endif
 
+/*
+ * Wait-Die:
+ *   The newer transactions are killed when:
+ *     It (the new transaction) makes a request for a lock being held
+ *     by an older transaction.
+ */
+
+/*
+ * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
+ * it.
+ */
 static __always_inline void
 ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
 {
@@ -282,26 +293,53 @@ ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
 	DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
 #endif
 	ww_ctx->acquired++;
+	ww->ctx = ww_ctx;
 }
 
+/*
+ * Determine if context @a is 'after' context @b. IOW, @a is a younger
+ * transaction than @b and depending on algorithm either needs to wait for
+ * @b or die.
+ */
 static inline bool __sched
 __ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
 {
-	return a->stamp - b->stamp <= LONG_MAX &&
-	       (a->stamp != b->stamp || a > b);
+
+	return (signed long)(a->stamp - b->stamp) > 0;
+}
+
+/*
+ * Wait-Die; wake a younger waiter context (when locks held) such that it can
+ * die.
+ *
+ * Among waiters with context, only the first one can have other locks acquired
+ * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
+ * __ww_mutex_check_kill() wake any but the earliest context.
+ */
+static bool __sched
+__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter,
+	       struct ww_acquire_ctx *ww_ctx)
+{
+	if (waiter->ww_ctx->acquired > 0 &&
+			__ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) {
+		debug_mutex_wake_waiter(lock, waiter);
+		wake_up_process(waiter->task);
+	}
+
+	return true;
 }
 
 /*
- * Wake up any waiters that may have to back off when the lock is held by the
- * given context.
+ * We just acquired @lock under @ww_ctx, if there are later contexts waiting
+ * behind us on the wait-list, check if they need to die.
  *
- * Due to the invariants on the wait list, this can only affect the first
- * waiter with a context.
+ * See __ww_mutex_add_waiter() for the list-order construction; basically the
+ * list is ordered by stamp, smallest (oldest) first.
  *
  * The current task must not be on the wait list.
  */
 static void __sched
-__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
+__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
 {
 	struct mutex_waiter *cur;
 
@@ -311,30 +349,23 @@ __ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
 		if (!cur->ww_ctx)
 			continue;
 
-		if (cur->ww_ctx->acquired > 0 &&
-		    __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) {
-			debug_mutex_wake_waiter(lock, cur);
-			wake_up_process(cur->task);
-		}
-
-		break;
+		if (__ww_mutex_die(lock, cur, ww_ctx))
+			break;
 	}
 }
 
 /*
- * After acquiring lock with fastpath or when we lost out in contested
- * slowpath, set ctx and wake up any waiters so they can recheck.
+ * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
+ * and wake up any waiters so they can recheck.
  */
 static __always_inline void
 ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	ww_mutex_lock_acquired(lock, ctx);
 
-	lock->ctx = ctx;
-
 	/*
 	 * The lock->ctx update should be visible on all cores before
-	 * the atomic read is done, otherwise contended waiters might be
+	 * the WAITERS check is done, otherwise contended waiters might be
 	 * missed. The contended waiters will either see ww_ctx == NULL
 	 * and keep spinning, or it will acquire wait_lock, add itself
 	 * to waiter list and sleep.
@@ -348,29 +379,14 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 		return;
 
 	/*
-	 * Uh oh, we raced in fastpath, wake up everyone in this case,
-	 * so they can see the new lock->ctx.
+	 * Uh oh, we raced in fastpath, check if any of the waiters need to
+	 * die.
 	 */
 	spin_lock(&lock->base.wait_lock);
-	__ww_mutex_wakeup_for_backoff(&lock->base, ctx);
+	__ww_mutex_check_waiters(&lock->base, ctx);
 	spin_unlock(&lock->base.wait_lock);
 }
 
-/*
- * After acquiring lock in the slowpath set ctx.
- *
- * Unlike for the fast path, the caller ensures that waiters are woken up where
- * necessary.
- *
- * Callers must hold the mutex wait_lock.
- */
-static __always_inline void
-ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
-{
-	ww_mutex_lock_acquired(lock, ctx);
-	lock->ctx = ctx;
-}
-
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 
 static inline
@@ -646,37 +662,73 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
 }
 EXPORT_SYMBOL(ww_mutex_unlock);
 
+
+static __always_inline int __sched
+__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
+{
+	if (ww_ctx->acquired > 0) {
+#ifdef CONFIG_DEBUG_MUTEXES
+		struct ww_mutex *ww;
+
+		ww = container_of(lock, struct ww_mutex, base);
+		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
+		ww_ctx->contending_lock = ww;
+#endif
+		return -EDEADLK;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Check whether we need to kill the transaction for the current lock acquire.
+ *
+ * Wait-Die: If we're trying to acquire a lock already held by an older
+ *           context, kill ourselves.
+ *
+ * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
+ * look at waiters before us in the wait-list.
+ */
 static inline int __sched
-__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter,
-			    struct ww_acquire_ctx *ctx)
+__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter,
+		      struct ww_acquire_ctx *ctx)
 {
 	struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
 	struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
 	struct mutex_waiter *cur;
 
+	if (ctx->acquired == 0)
+		return 0;
+
 	if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx))
-		goto deadlock;
+		return __ww_mutex_kill(lock, ctx);
 
 	/*
 	 * If there is a waiter in front of us that has a context, then its
-	 * stamp is earlier than ours and we must back off.
+	 * stamp is earlier than ours and we must kill ourself.
 	 */
 	cur = waiter;
 	list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) {
-		if (cur->ww_ctx)
-			goto deadlock;
+		if (!cur->ww_ctx)
+			continue;
+
+		return __ww_mutex_kill(lock, ctx);
 	}
 
 	return 0;
-
-deadlock:
-#ifdef CONFIG_DEBUG_MUTEXES
-	DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
-	ctx->contending_lock = ww;
-#endif
-	return -EDEADLK;
 }
 
+/*
+ * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
+ * first. Such that older contexts are preferred to acquire the lock over
+ * younger contexts.
+ *
+ * Waiters without context are interspersed in FIFO order.
+ *
+ * Furthermore, for Wait-Die kill ourself immediately when possible (there are
+ * older contexts already waiting) to avoid unnecessary waiting.
+ */
 static inline int __sched
 __ww_mutex_add_waiter(struct mutex_waiter *waiter,
 		      struct mutex *lock,
@@ -693,7 +745,7 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter,
 	/*
 	 * Add the waiter before the first waiter with a higher stamp.
 	 * Waiters without a context are skipped to avoid starving
-	 * them.
+	 * them. Wait-Die waiters may die here.
 	 */
 	pos = &lock->wait_list;
 	list_for_each_entry_reverse(cur, &lock->wait_list, list) {
@@ -701,34 +753,27 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter,
 			continue;
 
 		if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) {
-			/* Back off immediately if necessary. */
-			if (ww_ctx->acquired > 0) {
-#ifdef CONFIG_DEBUG_MUTEXES
-				struct ww_mutex *ww;
+			/*
+			 * Wait-Die: if we find an older context waiting, there
+			 * is no point in queueing behind it, as we'd have to
+			 * die the moment it would acquire the lock.
+			 */
+			int ret = __ww_mutex_kill(lock, ww_ctx);
 
-				ww = container_of(lock, struct ww_mutex, base);
-				DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
-				ww_ctx->contending_lock = ww;
-#endif
-				return -EDEADLK;
-			}
+			if (ret)
+				return ret;
 
 			break;
 		}
 
 		pos = &cur->list;
 
-		/*
-		 * Wake up the waiter so that it gets a chance to back
-		 * off.
-		 */
-		if (cur->ww_ctx->acquired > 0) {
-			debug_mutex_wake_waiter(lock, cur);
-			wake_up_process(cur->task);
-		}
+		/* Wait-Die: ensure younger waiters die. */
+		__ww_mutex_die(lock, cur, ww_ctx);
 	}
 
 	list_add_tail(&waiter->list, pos);
+
 	return 0;
 }
 
@@ -772,7 +817,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	 */
 	if (__mutex_trylock(lock)) {
 		if (use_ww_ctx && ww_ctx)
-			__ww_mutex_wakeup_for_backoff(lock, ww_ctx);
+			__ww_mutex_check_waiters(lock, ww_ctx);
 
 		goto skip_wait;
 	}
@@ -790,10 +835,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		waiter.ww_ctx = MUTEX_POISON_WW_CTX;
 #endif
 	} else {
-		/* Add in stamp order, waking up waiters that must back off. */
+		/*
+		 * Add in stamp order, waking up waiters that must kill
+		 * themselves.
+		 */
 		ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
 		if (ret)
-			goto err_early_backoff;
+			goto err_early_kill;
 
 		waiter.ww_ctx = ww_ctx;
 	}
@@ -815,7 +863,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 			goto acquired;
 
 		/*
-		 * Check for signals and wound conditions while holding
+		 * Check for signals and kill conditions while holding
 		 * wait_lock. This ensures the lock cancellation is ordered
 		 * against mutex_unlock() and wake-ups do not go missing.
 		 */
@@ -824,8 +872,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 			goto err;
 		}
 
-		if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) {
-			ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx);
+		if (use_ww_ctx && ww_ctx) {
+			ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
 			if (ret)
 				goto err;
 		}
@@ -870,7 +918,7 @@ skip_wait:
 	lock_acquired(&lock->dep_map, ip);
 
 	if (use_ww_ctx && ww_ctx)
-		ww_mutex_set_context_slowpath(ww, ww_ctx);
+		ww_mutex_lock_acquired(ww, ww_ctx);
 
 	spin_unlock(&lock->wait_lock);
 	preempt_enable();
@@ -879,7 +927,7 @@ skip_wait:
 err:
 	__set_current_state(TASK_RUNNING);
 	mutex_remove_waiter(lock, &waiter, current);
-err_early_backoff:
+err_early_kill:
 	spin_unlock(&lock->wait_lock);
 	debug_mutex_free_waiter(&waiter);
 	mutex_release(&lock->dep_map, 1, ip);
-- 
cgit v1.2.3


From 08295b3b5beec9aac0f7a9db86f0fc3792039da3 Mon Sep 17 00:00:00 2001
From: Thomas Hellstrom <thellstrom@vmware.com>
Date: Fri, 15 Jun 2018 10:17:38 +0200
Subject: locking: Implement an algorithm choice for Wound-Wait mutexes

The current Wound-Wait mutex algorithm is actually not Wound-Wait but
Wait-Die. Implement also Wound-Wait as a per-ww-class choice. Wound-Wait
is, contrary to Wait-Die a preemptive algorithm and is known to generate
fewer backoffs. Testing reveals that this is true if the
number of simultaneous contending transactions is small.
As the number of simultaneous contending threads increases, Wait-Wound
becomes inferior to Wait-Die in terms of elapsed time.
Possibly due to the larger number of held locks of sleeping transactions.

Update documentation and callers.

Timings using git://people.freedesktop.org/~thomash/ww_mutex_test
tag patch-18-06-15

Each thread runs 100000 batches of lock / unlock 800 ww mutexes randomly
chosen out of 100000. Four core Intel x86_64:

Algorithm    #threads       Rollbacks  time
Wound-Wait   4              ~100       ~17s.
Wait-Die     4              ~150000    ~19s.
Wound-Wait   16             ~360000    ~109s.
Wait-Die     16             ~450000    ~82s.

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Gustavo Padovan <gustavo@padovan.org>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Sean Paul <seanpaul@chromium.org>
Cc: David Airlie <airlied@linux.ie>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-media@vger.kernel.org
Cc: linaro-mm-sig@lists.linaro.org
Co-authored-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/locking/ww-mutex-design.txt |  57 +++++++++--
 drivers/dma-buf/reservation.c             |   2 +-
 drivers/gpu/drm/drm_modeset_lock.c        |   2 +-
 include/linux/ww_mutex.h                  |  17 ++-
 kernel/locking/locktorture.c              |   2 +-
 kernel/locking/mutex.c                    | 165 +++++++++++++++++++++++++++---
 kernel/locking/test-ww_mutex.c            |   2 +-
 lib/locking-selftest.c                    |   2 +-
 8 files changed, 213 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/locking/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt
index 2fd7f2a2af21..f0ed7c30e695 100644
--- a/Documentation/locking/ww-mutex-design.txt
+++ b/Documentation/locking/ww-mutex-design.txt
@@ -1,4 +1,4 @@
-Wait/Wound Deadlock-Proof Mutex Design
+Wound/Wait Deadlock-Proof Mutex Design
 ======================================
 
 Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
@@ -32,10 +32,26 @@ the oldest task) wins, and the one with the higher reservation id (i.e. the
 younger task) unlocks all of the buffers that it has already locked, and then
 tries again.
 
-In the RDBMS literature this deadlock handling approach is called wait/die:
-The older tasks waits until it can acquire the contended lock. The younger tasks
-needs to back off and drop all the locks it is currently holding, i.e. the
-younger task dies.
+In the RDBMS literature, a reservation ticket is associated with a transaction.
+and the deadlock handling approach is called Wait-Die. The name is based on
+the actions of a locking thread when it encounters an already locked mutex.
+If the transaction holding the lock is younger, the locking transaction waits.
+If the transaction holding the lock is older, the locking transaction backs off
+and dies. Hence Wait-Die.
+There is also another algorithm called Wound-Wait:
+If the transaction holding the lock is younger, the locking transaction
+wounds the transaction holding the lock, requesting it to die.
+If the transaction holding the lock is older, it waits for the other
+transaction. Hence Wound-Wait.
+The two algorithms are both fair in that a transaction will eventually succeed.
+However, the Wound-Wait algorithm is typically stated to generate fewer backoffs
+compared to Wait-Die, but is, on the other hand, associated with more work than
+Wait-Die when recovering from a backoff. Wound-Wait is also a preemptive
+algorithm in that transactions are wounded by other transactions, and that
+requires a reliable way to pick up up the wounded condition and preempt the
+running transaction. Note that this is not the same as process preemption. A
+Wound-Wait transaction is considered preempted when it dies (returning
+-EDEADLK) following a wound.
 
 Concepts
 --------
@@ -47,10 +63,12 @@ Acquire context: To ensure eventual forward progress it is important the a task
 trying to acquire locks doesn't grab a new reservation id, but keeps the one it
 acquired when starting the lock acquisition. This ticket is stored in the
 acquire context. Furthermore the acquire context keeps track of debugging state
-to catch w/w mutex interface abuse.
+to catch w/w mutex interface abuse. An acquire context is representing a
+transaction.
 
 W/w class: In contrast to normal mutexes the lock class needs to be explicit for
-w/w mutexes, since it is required to initialize the acquire context.
+w/w mutexes, since it is required to initialize the acquire context. The lock
+class also specifies what algorithm to use, Wound-Wait or Wait-Die.
 
 Furthermore there are three different class of w/w lock acquire functions:
 
@@ -90,6 +108,12 @@ provided.
 Usage
 -----
 
+The algorithm (Wait-Die vs Wound-Wait) is chosen by using either
+DEFINE_WW_CLASS() (Wound-Wait) or DEFINE_WD_CLASS() (Wait-Die)
+As a rough rule of thumb, use Wound-Wait iff you
+expect the number of simultaneous competing transactions to be typically small,
+and you want to reduce the number of rollbacks.
+
 Three different ways to acquire locks within the same w/w class. Common
 definitions for methods #1 and #2:
 
@@ -312,12 +336,23 @@ Design:
   We maintain the following invariants for the wait list:
   (1) Waiters with an acquire context are sorted by stamp order; waiters
       without an acquire context are interspersed in FIFO order.
-  (2) Among waiters with contexts, only the first one can have other locks
-      acquired already (ctx->acquired > 0). Note that this waiter may come
-      after other waiters without contexts in the list.
+  (2) For Wait-Die, among waiters with contexts, only the first one can have
+      other locks acquired already (ctx->acquired > 0). Note that this waiter
+      may come after other waiters without contexts in the list.
+
+  The Wound-Wait preemption is implemented with a lazy-preemption scheme:
+  The wounded status of the transaction is checked only when there is
+  contention for a new lock and hence a true chance of deadlock. In that
+  situation, if the transaction is wounded, it backs off, clears the
+  wounded status and retries. A great benefit of implementing preemption in
+  this way is that the wounded transaction can identify a contending lock to
+  wait for before restarting the transaction. Just blindly restarting the
+  transaction would likely make the transaction end up in a situation where
+  it would have to back off again.
 
   In general, not much contention is expected. The locks are typically used to
-  serialize access to resources for devices.
+  serialize access to resources for devices, and optimization focus should
+  therefore be directed towards the uncontended cases.
 
 Lockdep:
   Special care has been taken to warn for as many cases of api abuse
diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c
index 314eb1071cce..20bf90f4ee63 100644
--- a/drivers/dma-buf/reservation.c
+++ b/drivers/dma-buf/reservation.c
@@ -46,7 +46,7 @@
  * write-side updates.
  */
 
-DEFINE_WW_CLASS(reservation_ww_class);
+DEFINE_WD_CLASS(reservation_ww_class);
 EXPORT_SYMBOL(reservation_ww_class);
 
 struct lock_class_key reservation_seqcount_class;
diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c
index 8a5100685875..638be2eb67b4 100644
--- a/drivers/gpu/drm/drm_modeset_lock.c
+++ b/drivers/gpu/drm/drm_modeset_lock.c
@@ -70,7 +70,7 @@
  * lists and lookup data structures.
  */
 
-static DEFINE_WW_CLASS(crtc_ww_class);
+static DEFINE_WD_CLASS(crtc_ww_class);
 
 /**
  * drm_modeset_lock_all - take all modeset locks
diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index f82fce2229c8..3af7c0e03be5 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -8,6 +8,8 @@
  *
  * Wait/Die implementation:
  *  Copyright (C) 2013 Canonical Ltd.
+ * Choice of algorithm:
+ *  Copyright (C) 2018 WMWare Inc.
  *
  * This file contains the main data structure and API definitions.
  */
@@ -23,12 +25,15 @@ struct ww_class {
 	struct lock_class_key mutex_key;
 	const char *acquire_name;
 	const char *mutex_name;
+	unsigned int is_wait_die;
 };
 
 struct ww_acquire_ctx {
 	struct task_struct *task;
 	unsigned long stamp;
 	unsigned int acquired;
+	unsigned short wounded;
+	unsigned short is_wait_die;
 #ifdef CONFIG_DEBUG_MUTEXES
 	unsigned int done_acquire;
 	struct ww_class *ww_class;
@@ -58,17 +63,21 @@ struct ww_mutex {
 # define __WW_CLASS_MUTEX_INITIALIZER(lockname, class)
 #endif
 
-#define __WW_CLASS_INITIALIZER(ww_class) \
+#define __WW_CLASS_INITIALIZER(ww_class, _is_wait_die)	    \
 		{ .stamp = ATOMIC_LONG_INIT(0) \
 		, .acquire_name = #ww_class "_acquire" \
-		, .mutex_name = #ww_class "_mutex" }
+		, .mutex_name = #ww_class "_mutex" \
+		, .is_wait_die = _is_wait_die }
 
 #define __WW_MUTEX_INITIALIZER(lockname, class) \
 		{ .base =  __MUTEX_INITIALIZER(lockname.base) \
 		__WW_CLASS_MUTEX_INITIALIZER(lockname, class) }
 
+#define DEFINE_WD_CLASS(classname) \
+	struct ww_class classname = __WW_CLASS_INITIALIZER(classname, 1)
+
 #define DEFINE_WW_CLASS(classname) \
-	struct ww_class classname = __WW_CLASS_INITIALIZER(classname)
+	struct ww_class classname = __WW_CLASS_INITIALIZER(classname, 0)
 
 #define DEFINE_WW_MUTEX(mutexname, ww_class) \
 	struct ww_mutex mutexname = __WW_MUTEX_INITIALIZER(mutexname, ww_class)
@@ -123,6 +132,8 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
 	ctx->task = current;
 	ctx->stamp = atomic_long_inc_return_relaxed(&ww_class->stamp);
 	ctx->acquired = 0;
+	ctx->wounded = false;
+	ctx->is_wait_die = ww_class->is_wait_die;
 #ifdef CONFIG_DEBUG_MUTEXES
 	ctx->ww_class = ww_class;
 	ctx->done_acquire = 0;
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 8402b3349dca..c28224347d69 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -365,7 +365,7 @@ static struct lock_torture_ops mutex_lock_ops = {
 };
 
 #include <linux/ww_mutex.h>
-static DEFINE_WW_CLASS(torture_ww_class);
+static DEFINE_WD_CLASS(torture_ww_class);
 static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class);
 static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class);
 static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index cfe48419b7d0..1a81a1257b3f 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -173,6 +173,21 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait
 	return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter;
 }
 
+/*
+ * Add @waiter to a given location in the lock wait_list and set the
+ * FLAG_WAITERS flag if it's the first waiter.
+ */
+static void __sched
+__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+		   struct list_head *list)
+{
+	debug_mutex_add_waiter(lock, waiter, current);
+
+	list_add_tail(&waiter->list, list);
+	if (__mutex_waiter_is_first(lock, waiter))
+		__mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
+}
+
 /*
  * Give up ownership to a specific task, when @task = NULL, this is equivalent
  * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves
@@ -249,6 +264,11 @@ EXPORT_SYMBOL(mutex_lock);
  *   The newer transactions are killed when:
  *     It (the new transaction) makes a request for a lock being held
  *     by an older transaction.
+ *
+ * Wound-Wait:
+ *   The newer transactions are wounded when:
+ *     An older transaction makes a request for a lock being held by
+ *     the newer transaction.
  */
 
 /*
@@ -320,6 +340,9 @@ static bool __sched
 __ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter,
 	       struct ww_acquire_ctx *ww_ctx)
 {
+	if (!ww_ctx->is_wait_die)
+		return false;
+
 	if (waiter->ww_ctx->acquired > 0 &&
 			__ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) {
 		debug_mutex_wake_waiter(lock, waiter);
@@ -329,13 +352,65 @@ __ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter,
 	return true;
 }
 
+/*
+ * Wound-Wait; wound a younger @hold_ctx if it holds the lock.
+ *
+ * Wound the lock holder if there are waiters with older transactions than
+ * the lock holders. Even if multiple waiters may wound the lock holder,
+ * it's sufficient that only one does.
+ */
+static bool __ww_mutex_wound(struct mutex *lock,
+			     struct ww_acquire_ctx *ww_ctx,
+			     struct ww_acquire_ctx *hold_ctx)
+{
+	struct task_struct *owner = __mutex_owner(lock);
+
+	lockdep_assert_held(&lock->wait_lock);
+
+	/*
+	 * Possible through __ww_mutex_add_waiter() when we race with
+	 * ww_mutex_set_context_fastpath(). In that case we'll get here again
+	 * through __ww_mutex_check_waiters().
+	 */
+	if (!hold_ctx)
+		return false;
+
+	/*
+	 * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
+	 * it cannot go away because we'll have FLAG_WAITERS set and hold
+	 * wait_lock.
+	 */
+	if (!owner)
+		return false;
+
+	if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) {
+		hold_ctx->wounded = 1;
+
+		/*
+		 * wake_up_process() paired with set_current_state()
+		 * inserts sufficient barriers to make sure @owner either sees
+		 * it's wounded in __ww_mutex_lock_check_stamp() or has a
+		 * wakeup pending to re-read the wounded state.
+		 */
+		if (owner != current)
+			wake_up_process(owner);
+
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * We just acquired @lock under @ww_ctx, if there are later contexts waiting
- * behind us on the wait-list, check if they need to die.
+ * behind us on the wait-list, check if they need to die, or wound us.
  *
  * See __ww_mutex_add_waiter() for the list-order construction; basically the
  * list is ordered by stamp, smallest (oldest) first.
  *
+ * This relies on never mixing wait-die/wound-wait on the same wait-list;
+ * which is currently ensured by that being a ww_class property.
+ *
  * The current task must not be on the wait list.
  */
 static void __sched
@@ -349,7 +424,8 @@ __ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
 		if (!cur->ww_ctx)
 			continue;
 
-		if (__ww_mutex_die(lock, cur, ww_ctx))
+		if (__ww_mutex_die(lock, cur, ww_ctx) ||
+		    __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
 			break;
 	}
 }
@@ -370,17 +446,23 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 	 * and keep spinning, or it will acquire wait_lock, add itself
 	 * to waiter list and sleep.
 	 */
-	smp_mb(); /* ^^^ */
+	smp_mb(); /* See comments above and below. */
 
 	/*
-	 * Check if lock is contended, if not there is nobody to wake up
+	 * [W] ww->ctx = ctx	    [W] MUTEX_FLAG_WAITERS
+	 *     MB		        MB
+	 * [R] MUTEX_FLAG_WAITERS   [R] ww->ctx
+	 *
+	 * The memory barrier above pairs with the memory barrier in
+	 * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
+	 * and/or !empty list.
 	 */
 	if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS)))
 		return;
 
 	/*
 	 * Uh oh, we raced in fastpath, check if any of the waiters need to
-	 * die.
+	 * die or wound us.
 	 */
 	spin_lock(&lock->base.wait_lock);
 	__ww_mutex_check_waiters(&lock->base, ctx);
@@ -682,7 +764,9 @@ __ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
 
 
 /*
- * Check whether we need to kill the transaction for the current lock acquire.
+ * Check the wound condition for the current lock acquire.
+ *
+ * Wound-Wait: If we're wounded, kill ourself.
  *
  * Wait-Die: If we're trying to acquire a lock already held by an older
  *           context, kill ourselves.
@@ -701,6 +785,13 @@ __ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter,
 	if (ctx->acquired == 0)
 		return 0;
 
+	if (!ctx->is_wait_die) {
+		if (ctx->wounded)
+			return __ww_mutex_kill(lock, ctx);
+
+		return 0;
+	}
+
 	if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx))
 		return __ww_mutex_kill(lock, ctx);
 
@@ -727,7 +818,8 @@ __ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter,
  * Waiters without context are interspersed in FIFO order.
  *
  * Furthermore, for Wait-Die kill ourself immediately when possible (there are
- * older contexts already waiting) to avoid unnecessary waiting.
+ * older contexts already waiting) to avoid unnecessary waiting and for
+ * Wound-Wait ensure we wound the owning context when it is younger.
  */
 static inline int __sched
 __ww_mutex_add_waiter(struct mutex_waiter *waiter,
@@ -736,16 +828,21 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter,
 {
 	struct mutex_waiter *cur;
 	struct list_head *pos;
+	bool is_wait_die;
 
 	if (!ww_ctx) {
-		list_add_tail(&waiter->list, &lock->wait_list);
+		__mutex_add_waiter(lock, waiter, &lock->wait_list);
 		return 0;
 	}
 
+	is_wait_die = ww_ctx->is_wait_die;
+
 	/*
 	 * Add the waiter before the first waiter with a higher stamp.
 	 * Waiters without a context are skipped to avoid starving
-	 * them. Wait-Die waiters may die here.
+	 * them. Wait-Die waiters may die here. Wound-Wait waiters
+	 * never die here, but they are sorted in stamp order and
+	 * may wound the lock holder.
 	 */
 	pos = &lock->wait_list;
 	list_for_each_entry_reverse(cur, &lock->wait_list, list) {
@@ -758,10 +855,12 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter,
 			 * is no point in queueing behind it, as we'd have to
 			 * die the moment it would acquire the lock.
 			 */
-			int ret = __ww_mutex_kill(lock, ww_ctx);
+			if (is_wait_die) {
+				int ret = __ww_mutex_kill(lock, ww_ctx);
 
-			if (ret)
-				return ret;
+				if (ret)
+					return ret;
+			}
 
 			break;
 		}
@@ -772,7 +871,23 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter,
 		__ww_mutex_die(lock, cur, ww_ctx);
 	}
 
-	list_add_tail(&waiter->list, pos);
+	__mutex_add_waiter(lock, waiter, pos);
+
+	/*
+	 * Wound-Wait: if we're blocking on a mutex owned by a younger context,
+	 * wound that such that we might proceed.
+	 */
+	if (!is_wait_die) {
+		struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+
+		/*
+		 * See ww_mutex_set_context_fastpath(). Orders setting
+		 * MUTEX_FLAG_WAITERS vs the ww->ctx load,
+		 * such that either we or the fastpath will wound @ww->ctx.
+		 */
+		smp_mb();
+		__ww_mutex_wound(lock, ww_ctx, ww->ctx);
+	}
 
 	return 0;
 }
@@ -796,6 +911,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	if (use_ww_ctx && ww_ctx) {
 		if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
 			return -EALREADY;
+
+		/*
+		 * Reset the wounded flag after a kill. No other process can
+		 * race and wound us here since they can't have a valid owner
+		 * pointer if we don't have any locks held.
+		 */
+		if (ww_ctx->acquired == 0)
+			ww_ctx->wounded = 0;
 	}
 
 	preempt_disable();
@@ -829,7 +952,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 	if (!use_ww_ctx) {
 		/* add waiting tasks to the end of the waitqueue (FIFO): */
-		list_add_tail(&waiter.list, &lock->wait_list);
+		__mutex_add_waiter(lock, &waiter, &lock->wait_list);
+
 
 #ifdef CONFIG_DEBUG_MUTEXES
 		waiter.ww_ctx = MUTEX_POISON_WW_CTX;
@@ -848,9 +972,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 	waiter.task = current;
 
-	if (__mutex_waiter_is_first(lock, &waiter))
-		__mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
-
 	set_current_state(state);
 	for (;;) {
 		/*
@@ -907,6 +1028,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 acquired:
 	__set_current_state(TASK_RUNNING);
 
+	if (use_ww_ctx && ww_ctx) {
+		/*
+		 * Wound-Wait; we stole the lock (!first_waiter), check the
+		 * waiters as anyone might want to wound us.
+		 */
+		if (!ww_ctx->is_wait_die &&
+		    !__mutex_waiter_is_first(lock, &waiter))
+			__ww_mutex_check_waiters(lock, ww_ctx);
+	}
+
 	mutex_remove_waiter(lock, &waiter, current);
 	if (likely(list_empty(&lock->wait_list)))
 		__mutex_clear_flag(lock, MUTEX_FLAGS);
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 0e4cd64ad2c0..5b915b370d5a 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -26,7 +26,7 @@
 #include <linux/slab.h>
 #include <linux/ww_mutex.h>
 
-static DEFINE_WW_CLASS(ww_class);
+static DEFINE_WD_CLASS(ww_class);
 struct workqueue_struct *wq;
 
 struct test_mutex {
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index b5c1293ce147..1e1bbf171eca 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -29,7 +29,7 @@
  */
 static unsigned int debug_locks_verbose;
 
-static DEFINE_WW_CLASS(ww_lockdep);
+static DEFINE_WD_CLASS(ww_lockdep);
 
 static int __init setup_debug_locks_verbose(char *str)
 {
-- 
cgit v1.2.3


From 418cc6ca06071e7b7d75c0d525d80a6f49a763d6 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Thu, 3 May 2018 16:25:52 +0200
Subject: dma-fence: Make ->wait callback optional
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Almost everyone uses dma_fence_default_wait.

v2: Also remove the BUG_ON(!ops->wait) (Chris).

Reviewed-by: Christian König <christian.koenig@amd.com> (v1)
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Gustavo Padovan <gustavo@padovan.org>
Cc: linux-media@vger.kernel.org
Cc: linaro-mm-sig@lists.linaro.org
Link: https://patchwork.freedesktop.org/patch/msgid/20180503142603.28513-5-daniel.vetter@ffwll.ch
---
 drivers/dma-buf/dma-fence-array.c |  1 -
 drivers/dma-buf/dma-fence.c       |  8 +++++---
 drivers/dma-buf/sw_sync.c         |  1 -
 include/linux/dma-fence.h         | 13 ++++++++-----
 4 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c
index dd1edfb27b61..a8c254497251 100644
--- a/drivers/dma-buf/dma-fence-array.c
+++ b/drivers/dma-buf/dma-fence-array.c
@@ -104,7 +104,6 @@ const struct dma_fence_ops dma_fence_array_ops = {
 	.get_timeline_name = dma_fence_array_get_timeline_name,
 	.enable_signaling = dma_fence_array_enable_signaling,
 	.signaled = dma_fence_array_signaled,
-	.wait = dma_fence_default_wait,
 	.release = dma_fence_array_release,
 };
 EXPORT_SYMBOL(dma_fence_array_ops);
diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index ea343f992112..7a92f85a4cec 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -158,7 +158,10 @@ dma_fence_wait_timeout(struct dma_fence *fence, bool intr, signed long timeout)
 		return -EINVAL;
 
 	trace_dma_fence_wait_start(fence);
-	ret = fence->ops->wait(fence, intr, timeout);
+	if (fence->ops->wait)
+		ret = fence->ops->wait(fence, intr, timeout);
+	else
+		ret = dma_fence_default_wait(fence, intr, timeout);
 	trace_dma_fence_wait_end(fence);
 	return ret;
 }
@@ -556,8 +559,7 @@ dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops,
 	       spinlock_t *lock, u64 context, unsigned seqno)
 {
 	BUG_ON(!lock);
-	BUG_ON(!ops || !ops->wait ||
-	       !ops->get_driver_name || !ops->get_timeline_name);
+	BUG_ON(!ops || !ops->get_driver_name || !ops->get_timeline_name);
 
 	kref_init(&fence->refcount);
 	fence->ops = ops;
diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c
index 3d78ca89a605..53c1d6d36a64 100644
--- a/drivers/dma-buf/sw_sync.c
+++ b/drivers/dma-buf/sw_sync.c
@@ -188,7 +188,6 @@ static const struct dma_fence_ops timeline_fence_ops = {
 	.get_timeline_name = timeline_fence_get_timeline_name,
 	.enable_signaling = timeline_fence_enable_signaling,
 	.signaled = timeline_fence_signaled,
-	.wait = dma_fence_default_wait,
 	.release = timeline_fence_release,
 	.fence_value_str = timeline_fence_value_str,
 	.timeline_value_str = timeline_fence_timeline_value_str,
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index c053d19e1e24..02dba8cd033d 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -191,11 +191,14 @@ struct dma_fence_ops {
 	/**
 	 * @wait:
 	 *
-	 * Custom wait implementation, or dma_fence_default_wait.
+	 * Custom wait implementation, defaults to dma_fence_default_wait() if
+	 * not set.
 	 *
-	 * Must not be NULL, set to dma_fence_default_wait for default implementation.
-	 * the dma_fence_default_wait implementation should work for any fence, as long
-	 * as enable_signaling works correctly.
+	 * The dma_fence_default_wait implementation should work for any fence, as long
+	 * as @enable_signaling works correctly. This hook allows drivers to
+	 * have an optimized version for the case where a process context is
+	 * already available, e.g. if @enable_signaling for the general case
+	 * needs to set up a worker thread.
 	 *
 	 * Must return -ERESTARTSYS if the wait is intr = true and the wait was
 	 * interrupted, and remaining jiffies if fence has signaled, or 0 if wait
@@ -203,7 +206,7 @@ struct dma_fence_ops {
 	 * which should be treated as if the fence is signaled. For example a hardware
 	 * lockup could be reported like that.
 	 *
-	 * This callback is mandatory.
+	 * This callback is optional.
 	 */
 	signed long (*wait)(struct dma_fence *fence,
 			    bool intr, signed long timeout);
-- 
cgit v1.2.3


From 8a9c58d28d0f66569737a3295116710ed24573cd Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 2 Jul 2018 18:21:12 +0800
Subject: sctp: add support for dscp and flowlabel per transport

Like some other per transport params, flowlabel and dscp are added
in transport, asoc and sctp_sock. By default, transport sets its
value from asoc's, and asoc does it from sctp_sock. flowlabel
only works for ipv6 transport.

Other than that they need to be passed down in sctp_xmit, flow4/6
also needs to set them before looking up route in get_dst.

Note that it uses '& 0x100000' to check if flowlabel is set and
'& 0x1' (tos 1st bit is unused) to check if dscp is set by users,
so that they could be set to 0 by sockopt in next patch.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h       |  7 +++++++
 include/net/sctp/structs.h |  9 +++++++++
 net/sctp/associola.c       |  7 +++++++
 net/sctp/ipv6.c            | 11 +++++++++--
 net/sctp/protocol.c        | 16 ++++++++++++----
 5 files changed, 44 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index b36c76635f18..83d94341e003 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -801,4 +801,11 @@ struct sctp_strreset_resptsn {
 	__be32 receivers_next_tsn;
 };
 
+enum {
+	SCTP_DSCP_SET_MASK = 0x1,
+	SCTP_DSCP_VAL_MASK = 0xfc,
+	SCTP_FLOWLABEL_SET_MASK = 0x100000,
+	SCTP_FLOWLABEL_VAL_MASK = 0xfffff
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 701a51736fa5..ab869e0d8326 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -193,6 +193,9 @@ struct sctp_sock {
 	/* This is the max_retrans value for new associations. */
 	__u16 pathmaxrxt;
 
+	__u32 flowlabel;
+	__u8  dscp;
+
 	/* The initial Path MTU to use for new associations. */
 	__u32 pathmtu;
 
@@ -895,6 +898,9 @@ struct sctp_transport {
 	 */
 	__u16 pathmaxrxt;
 
+	__u32 flowlabel;
+	__u8  dscp;
+
 	/* This is the partially failed retrans value for the transport
 	 * and will be initialized from the assocs value.  This can be changed
 	 * using the SCTP_PEER_ADDR_THLDS socket option
@@ -1772,6 +1778,9 @@ struct sctp_association {
 	 */
 	__u16 pathmaxrxt;
 
+	__u32 flowlabel;
+	__u8  dscp;
+
 	/* Flag that path mtu update is pending */
 	__u8   pmtu_pending;
 
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5d5a16204d50..16ecfbc95614 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -115,6 +115,9 @@ static struct sctp_association *sctp_association_init(
 	/* Initialize path max retrans value. */
 	asoc->pathmaxrxt = sp->pathmaxrxt;
 
+	asoc->flowlabel = sp->flowlabel;
+	asoc->dscp = sp->dscp;
+
 	/* Initialize default path MTU. */
 	asoc->pathmtu = sp->pathmtu;
 
@@ -647,6 +650,10 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	peer->sackdelay = asoc->sackdelay;
 	peer->sackfreq = asoc->sackfreq;
 
+	if (addr->sa.sa_family == AF_INET6)
+		peer->flowlabel = asoc->flowlabel;
+	peer->dscp = asoc->dscp;
+
 	/* Enable/disable heartbeat, SACK delay, and path MTU discovery
 	 * based on association setting.
 	 */
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 0cd2e764f47f..38102bf7f13e 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -209,12 +209,17 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
 	struct sock *sk = skb->sk;
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct flowi6 *fl6 = &transport->fl.u.ip6;
+	__u8 tclass = np->tclass;
 	int res;
 
 	pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
 		 skb->len, &fl6->saddr, &fl6->daddr);
 
-	IP6_ECN_flow_xmit(sk, fl6->flowlabel);
+	if (transport->dscp & SCTP_DSCP_SET_MASK)
+		tclass = transport->dscp & SCTP_DSCP_VAL_MASK;
+
+	if (INET_ECN_is_capable(tclass))
+		IP6_ECN_flow_xmit(sk, fl6->flowlabel);
 
 	if (!(transport->param_flags & SPP_PMTUD_ENABLE))
 		skb->ignore_df = 1;
@@ -223,7 +228,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
 
 	rcu_read_lock();
 	res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
-		       np->tclass);
+		       tclass);
 	rcu_read_unlock();
 	return res;
 }
@@ -254,6 +259,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 		fl6->flowi6_oif = daddr->v6.sin6_scope_id;
 	else if (asoc)
 		fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if;
+	if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK)
+		fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK);
 
 	pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr);
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 67f73d3a1356..e948db29ab53 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -426,13 +426,16 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 	struct dst_entry *dst = NULL;
 	union sctp_addr *daddr = &t->ipaddr;
 	union sctp_addr dst_saddr;
+	__u8 tos = inet_sk(sk)->tos;
 
+	if (t->dscp & SCTP_DSCP_SET_MASK)
+		tos = t->dscp & SCTP_DSCP_VAL_MASK;
 	memset(fl4, 0x0, sizeof(struct flowi4));
 	fl4->daddr  = daddr->v4.sin_addr.s_addr;
 	fl4->fl4_dport = daddr->v4.sin_port;
 	fl4->flowi4_proto = IPPROTO_SCTP;
 	if (asoc) {
-		fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk);
+		fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos);
 		fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
 		fl4->fl4_sport = htons(asoc->base.bind_addr.port);
 	}
@@ -495,7 +498,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 		fl4->fl4_sport = laddr->a.v4.sin_port;
 		flowi4_update_output(fl4,
 				     asoc->base.sk->sk_bound_dev_if,
-				     RT_CONN_FLAGS(asoc->base.sk),
+				     RT_CONN_FLAGS_TOS(asoc->base.sk, tos),
 				     daddr->v4.sin_addr.s_addr,
 				     laddr->a.v4.sin_addr.s_addr);
 
@@ -971,16 +974,21 @@ static inline int sctp_v4_xmit(struct sk_buff *skb,
 			       struct sctp_transport *transport)
 {
 	struct inet_sock *inet = inet_sk(skb->sk);
+	__u8 dscp = inet->tos;
 
 	pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
-		 skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr);
+		 skb->len, &transport->fl.u.ip4.saddr,
+		 &transport->fl.u.ip4.daddr);
+
+	if (transport->dscp & SCTP_DSCP_SET_MASK)
+		dscp = transport->dscp & SCTP_DSCP_VAL_MASK;
 
 	inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ?
 			 IP_PMTUDISC_DO : IP_PMTUDISC_DONT;
 
 	SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS);
 
-	return ip_queue_xmit(&inet->sk, skb, &transport->fl);
+	return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp);
 }
 
 static struct sctp_af sctp_af_inet;
-- 
cgit v1.2.3


From f6ad8c1bcdf014272d08c55b9469536952a0a771 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 2 Jul 2018 16:12:45 +0100
Subject: net: core: trivial netif_receive_skb_list() entry point

Just calls netif_receive_skb() in a loop.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 64480a0f2c16..f67258f057ca 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3477,6 +3477,7 @@ int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
 int netif_receive_skb_core(struct sk_buff *skb);
+void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
diff --git a/net/core/dev.c b/net/core/dev.c
index 08d58e0debe5..85c456a4b551 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4906,6 +4906,25 @@ int netif_receive_skb(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_receive_skb);
 
+/**
+ *	netif_receive_skb_list - process many receive buffers from network
+ *	@head: list of skbs to process.
+ *
+ *	For now, just calls netif_receive_skb() in a loop, ignoring the
+ *	return value.
+ *
+ *	This function may only be called from softirq context and interrupts
+ *	should be enabled.
+ */
+void netif_receive_skb_list(struct list_head *head)
+{
+	struct sk_buff *skb, *next;
+
+	list_for_each_entry_safe(skb, next, head, list)
+		netif_receive_skb(skb);
+}
+EXPORT_SYMBOL(netif_receive_skb_list);
+
 DEFINE_PER_CPU(struct work_struct, flush_works);
 
 /* Network device is going away, flush any packets still pending */
-- 
cgit v1.2.3


From 4ce0017a373afaaa9ef17614d8fa4f6fde261d18 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 2 Jul 2018 16:13:40 +0100
Subject: net: core: another layer of lists, around PF_MEMALLOC skb handling

First example of a layer splitting the list (rather than merely taking
 individual packets off it).
Involves new list.h function, list_cut_before(), like list_cut_position()
 but cuts on the other side of the given entry.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/list.h | 30 ++++++++++++++++++++++++++++++
 net/core/dev.c       | 44 ++++++++++++++++++++++++++++++++++++--------
 2 files changed, 66 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 4b129df4d46b..de04cc5ed536 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -285,6 +285,36 @@ static inline void list_cut_position(struct list_head *list,
 		__list_cut_position(list, head, entry);
 }
 
+/**
+ * list_cut_before - cut a list into two, before given entry
+ * @list: a new list to add all removed entries
+ * @head: a list with entries
+ * @entry: an entry within head, could be the head itself
+ *
+ * This helper moves the initial part of @head, up to but
+ * excluding @entry, from @head to @list.  You should pass
+ * in @entry an element you know is on @head.  @list should
+ * be an empty list or a list you do not care about losing
+ * its data.
+ * If @entry == @head, all entries on @head are moved to
+ * @list.
+ */
+static inline void list_cut_before(struct list_head *list,
+				   struct list_head *head,
+				   struct list_head *entry)
+{
+	if (head->next == entry) {
+		INIT_LIST_HEAD(list);
+		return;
+	}
+	list->next = head->next;
+	list->next->prev = list;
+	list->prev = entry->prev;
+	list->prev->next = list;
+	head->next = entry;
+	entry->prev = head;
+}
+
 static inline void __list_splice(const struct list_head *list,
 				 struct list_head *prev,
 				 struct list_head *next)
diff --git a/net/core/dev.c b/net/core/dev.c
index 1e87361df2ab..9aadef976e8c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4784,6 +4784,14 @@ int netif_receive_skb_core(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_receive_skb_core);
 
+static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
+{
+	struct sk_buff *skb, *next;
+
+	list_for_each_entry_safe(skb, next, head, list)
+		__netif_receive_skb_core(skb, pfmemalloc);
+}
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	int ret;
@@ -4809,6 +4817,34 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	return ret;
 }
 
+static void __netif_receive_skb_list(struct list_head *head)
+{
+	unsigned long noreclaim_flag = 0;
+	struct sk_buff *skb, *next;
+	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
+
+	list_for_each_entry_safe(skb, next, head, list) {
+		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
+			struct list_head sublist;
+
+			/* Handle the previous sublist */
+			list_cut_before(&sublist, head, &skb->list);
+			__netif_receive_skb_list_core(&sublist, pfmemalloc);
+			pfmemalloc = !pfmemalloc;
+			/* See comments in __netif_receive_skb */
+			if (pfmemalloc)
+				noreclaim_flag = memalloc_noreclaim_save();
+			else
+				memalloc_noreclaim_restore(noreclaim_flag);
+		}
+	}
+	/* Handle the remaining sublist */
+	__netif_receive_skb_list_core(head, pfmemalloc);
+	/* Restore pflags */
+	if (pfmemalloc)
+		memalloc_noreclaim_restore(noreclaim_flag);
+}
+
 static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 {
 	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
@@ -4843,14 +4879,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 	return ret;
 }
 
-static void __netif_receive_skb_list(struct list_head *head)
-{
-	struct sk_buff *skb, *next;
-
-	list_for_each_entry_safe(skb, next, head, list)
-		__netif_receive_skb(skb);
-}
-
 static int netif_receive_skb_internal(struct sk_buff *skb)
 {
 	int ret;
-- 
cgit v1.2.3


From 17266ee939849cb095ed7dd9edbec4162172226b Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 2 Jul 2018 16:14:12 +0100
Subject: net: ipv4: listified version of ip_rcv

Also involved adding a way to run a netfilter hook over a list of packets.
 Rather than attempting to make netfilter know about lists (which would be
 a major project in itself) we just let it call the regular okfn (in this
 case ip_rcv_finish()) for any packets it steals, and have it give us back
 a list of packets it's synchronously accepted (which normally NF_HOOK
 would automatically call okfn() on, but we want to be able to potentially
 pass the list to a listified version of okfn().)
The netfilter hooks themselves are indirect calls that still happen per-
 packet (see nf_hook_entry_hookfn()), but again, changing that can be left
 for future work.

There is potential for out-of-order receives if the netfilter hook ends up
 synchronously stealing packets, as they will be processed before any
 accepts earlier in the list.  However, it was already possible for an
 asynchronous accept to cause out-of-order receives, so presumably this is
 considered OK.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  3 +++
 include/linux/netfilter.h | 22 +++++++++++++++
 include/net/ip.h          |  2 ++
 net/core/dev.c            |  8 +++---
 net/ipv4/af_inet.c        |  1 +
 net/ipv4/ip_input.c       | 68 ++++++++++++++++++++++++++++++++++++++++++-----
 6 files changed, 94 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f67258f057ca..c1ef749b6f9f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2297,6 +2297,9 @@ struct packet_type {
 					 struct net_device *,
 					 struct packet_type *,
 					 struct net_device *);
+	void			(*list_func) (struct list_head *,
+					      struct packet_type *,
+					      struct net_device *);
 	bool			(*id_match)(struct packet_type *ptype,
 					    struct sock *sk);
 	void			*af_packet_priv;
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index dd2052f0efb7..5a5e0a2ab2a3 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -288,6 +288,20 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct
 	return ret;
 }
 
+static inline void
+NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+	     struct list_head *head, struct net_device *in, struct net_device *out,
+	     int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+{
+	struct sk_buff *skb, *next;
+
+	list_for_each_entry_safe(skb, next, head, list) {
+		int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
+		if (ret != 1)
+			list_del(&skb->list);
+	}
+}
+
 /* Call setsockopt() */
 int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt,
 		  unsigned int len);
@@ -369,6 +383,14 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 	return okfn(net, sk, skb);
 }
 
+static inline void
+NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+	     struct list_head *head, struct net_device *in, struct net_device *out,
+	     int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+{
+	/* nothing to do */
+}
+
 static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 			  struct sock *sk, struct sk_buff *skb,
 			  struct net_device *indev, struct net_device *outdev,
diff --git a/include/net/ip.h b/include/net/ip.h
index 09da79d8ceea..99d1b835d2aa 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -138,6 +138,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
 			  struct ip_options_rcu *opt);
 int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	   struct net_device *orig_dev);
+void ip_list_rcv(struct list_head *head, struct packet_type *pt,
+		 struct net_device *orig_dev);
 int ip_local_deliver(struct sk_buff *skb);
 int ip_mr_input(struct sk_buff *skb);
 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 1bc485bb0678..5e22719ce71d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4806,9 +4806,11 @@ static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 		return;
 	if (list_empty(head))
 		return;
-
-	list_for_each_entry_safe(skb, next, head, list)
-		pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+	if (pt_prev->list_func != NULL)
+		pt_prev->list_func(head, pt_prev, orig_dev);
+	else
+		list_for_each_entry_safe(skb, next, head, list)
+			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
 static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9263a2c114e0..c716be13d58c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1882,6 +1882,7 @@ fs_initcall(ipv4_offload_init);
 static struct packet_type ip_packet_type __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IP),
 	.func = ip_rcv,
+	.list_func = ip_list_rcv,
 };
 
 static int __init inet_init(void)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7582713dd18f..914240830bdf 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -408,10 +408,9 @@ drop_error:
 /*
  * 	Main IP Receive routine.
  */
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
 {
 	const struct iphdr *iph;
-	struct net *net;
 	u32 len;
 
 	/* When the interface is in promisc. mode, drop all the crap
@@ -421,7 +420,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 		goto drop;
 
 
-	net = dev_net(dev);
 	__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
 
 	skb = skb_share_check(skb, GFP_ATOMIC);
@@ -489,9 +487,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	/* Must drop socket now because of tproxy. */
 	skb_orphan(skb);
 
-	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
-		       net, NULL, skb, dev, NULL,
-		       ip_rcv_finish);
+	return skb;
 
 csum_error:
 	__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
@@ -500,5 +496,63 @@ inhdr_error:
 drop:
 	kfree_skb(skb);
 out:
-	return NET_RX_DROP;
+	return NULL;
+}
+
+/*
+ * IP receive entry point
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
+	   struct net_device *orig_dev)
+{
+	struct net *net = dev_net(dev);
+
+	skb = ip_rcv_core(skb, net);
+	if (skb == NULL)
+		return NET_RX_DROP;
+	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+		       net, NULL, skb, dev, NULL,
+		       ip_rcv_finish);
+}
+
+static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
+			   struct net *net)
+{
+	struct sk_buff *skb, *next;
+
+	NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
+		     head, dev, NULL, ip_rcv_finish);
+	list_for_each_entry_safe(skb, next, head, list)
+		ip_rcv_finish(net, NULL, skb);
+}
+
+/* Receive a list of IP packets */
+void ip_list_rcv(struct list_head *head, struct packet_type *pt,
+		 struct net_device *orig_dev)
+{
+	struct net_device *curr_dev = NULL;
+	struct net *curr_net = NULL;
+	struct sk_buff *skb, *next;
+	struct list_head sublist;
+
+	list_for_each_entry_safe(skb, next, head, list) {
+		struct net_device *dev = skb->dev;
+		struct net *net = dev_net(dev);
+
+		skb = ip_rcv_core(skb, net);
+		if (skb == NULL)
+			continue;
+
+		if (curr_dev != dev || curr_net != net) {
+			/* dispatch old sublist */
+			list_cut_before(&sublist, head, &skb->list);
+			if (!list_empty(&sublist))
+				ip_sublist_rcv(&sublist, dev, net);
+			/* start new sublist */
+			curr_dev = dev;
+			curr_net = net;
+		}
+	}
+	/* dispatch final sublist */
+	ip_sublist_rcv(head, curr_dev, curr_net);
 }
-- 
cgit v1.2.3


From 25db26a91364db00f5a30da2fea8e9afe14a163c Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 3 Jul 2018 15:42:53 -0700
Subject: net/sched: Introduce the ETF Qdisc

The ETF (Earliest TxTime First) qdisc uses the information added
earlier in this series (the socket option SO_TXTIME and the new
role of sk_buff->tstamp) to schedule packets transmission based
on absolute time.

For some workloads, just bandwidth enforcement is not enough, and
precise control of the transmission of packets is necessary.

Example:

$ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \
           map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0

$ tc qdisc add dev enp2s0 parent 100:1 etf delta 100000 \
           clockid CLOCK_TAI

In this example, the Qdisc will provide SW best-effort for the control
of the transmission time to the network adapter, the time stamp in the
socket will be in reference to the clockid CLOCK_TAI and packets
will leave the qdisc "delta" (100000) nanoseconds before its transmission
time.

The ETF qdisc will buffer packets sorted by their txtime. It will drop
packets on enqueue() if their skbuff clockid does not match the clock
reference of the Qdisc. Moreover, on dequeue(), a packet will be dropped
if it expires while being enqueued.

The qdisc also supports the SO_TXTIME deadline mode. For this mode, it
will dequeue a packet as soon as possible and change the skb timestamp
to 'now' during etf_dequeue().

Note that both the qdisc's and the SO_TXTIME ABIs allow for a clockid
to be configured, but it's been decided that usage of CLOCK_TAI should
be enforced until we decide to allow for other clockids to be used.
The rationale here is that PTP times are usually in the TAI scale, thus
no other clocks should be necessary. For now, the qdisc will return
EINVAL if any clocks other than CLOCK_TAI are used.

Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h      |   1 +
 include/uapi/linux/pkt_sched.h |  17 ++
 net/sched/Kconfig              |  11 ++
 net/sched/Makefile             |   1 +
 net/sched/sch_etf.c            | 384 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 414 insertions(+)
 create mode 100644 net/sched/sch_etf.c

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c1ef749b6f9f..f06ee8f91e74 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -798,6 +798,7 @@ enum tc_setup_type {
 	TC_SETUP_QDISC_RED,
 	TC_SETUP_QDISC_PRIO,
 	TC_SETUP_QDISC_MQ,
+	TC_SETUP_QDISC_ETF,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index bad3c03bcf43..d5e933ce1447 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -937,4 +937,21 @@ enum {
 
 #define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
 
+
+/* ETF */
+struct tc_etf_qopt {
+	__s32 delta;
+	__s32 clockid;
+	__u32 flags;
+#define TC_ETF_DEADLINE_MODE_ON	BIT(0)
+};
+
+enum {
+	TCA_ETF_UNSPEC,
+	TCA_ETF_PARMS,
+	__TCA_ETF_MAX,
+};
+
+#define TCA_ETF_MAX (__TCA_ETF_MAX - 1)
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169fb5325..fcc89706745b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -183,6 +183,17 @@ config NET_SCH_CBS
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_cbs.
 
+config NET_SCH_ETF
+	tristate "Earliest TxTime First (ETF)"
+	help
+	  Say Y here if you want to use the Earliest TxTime First (ETF) packet
+	  scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_etf.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_etf.
+
 config NET_SCH_GRED
 	tristate "Generic Random Early Detection (GRED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d3804878..9a5a7077d217 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
 obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
+obj-$(CONFIG_NET_SCH_ETF)	+= sch_etf.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
new file mode 100644
index 000000000000..4b7f4903ac17
--- /dev/null
+++ b/net/sched/sch_etf.c
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* net/sched/sch_etf.c  Earliest TxTime First queueing discipline.
+ *
+ * Authors:	Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
+ *		Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/rbtree.h>
+#include <linux/skbuff.h>
+#include <linux/posix-timers.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
+
+struct etf_sched_data {
+	bool deadline_mode;
+	int clockid;
+	int queue;
+	s32 delta; /* in ns */
+	ktime_t last; /* The txtime of the last skb sent to the netdevice. */
+	struct rb_root head;
+	struct qdisc_watchdog watchdog;
+	ktime_t (*get_time)(void);
+};
+
+static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = {
+	[TCA_ETF_PARMS]	= { .len = sizeof(struct tc_etf_qopt) },
+};
+
+static inline int validate_input_params(struct tc_etf_qopt *qopt,
+					struct netlink_ext_ack *extack)
+{
+	/* Check if params comply to the following rules:
+	 *	* Clockid and delta must be valid.
+	 *
+	 *	* Dynamic clockids are not supported.
+	 *
+	 *	* Delta must be a positive integer.
+	 */
+	if (qopt->clockid < 0) {
+		NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported");
+		return -ENOTSUPP;
+	}
+
+	if (qopt->clockid != CLOCK_TAI) {
+		NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used");
+		return -EINVAL;
+	}
+
+	if (qopt->delta < 0) {
+		NL_SET_ERR_MSG(extack, "Delta must be positive");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	ktime_t txtime = nskb->tstamp;
+	struct sock *sk = nskb->sk;
+	ktime_t now;
+
+	if (!sk)
+		return false;
+
+	if (!sock_flag(sk, SOCK_TXTIME))
+		return false;
+
+	/* We don't perform crosstimestamping.
+	 * Drop if packet's clockid differs from qdisc's.
+	 */
+	if (sk->sk_clockid != q->clockid)
+		return false;
+
+	if (sk->sk_txtime_deadline_mode != q->deadline_mode)
+		return false;
+
+	now = q->get_time();
+	if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
+		return false;
+
+	return true;
+}
+
+static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct rb_node *p;
+
+	p = rb_first(&q->head);
+	if (!p)
+		return NULL;
+
+	return rb_to_skb(p);
+}
+
+static void reset_watchdog(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = etf_peek_timesortedlist(sch);
+	ktime_t next;
+
+	if (!skb)
+		return;
+
+	next = ktime_sub_ns(skb->tstamp, q->delta);
+	qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
+}
+
+static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
+				      struct sk_buff **to_free)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct rb_node **p = &q->head.rb_node, *parent = NULL;
+	ktime_t txtime = nskb->tstamp;
+
+	if (!is_packet_valid(sch, nskb))
+		return qdisc_drop(nskb, sch, to_free);
+
+	while (*p) {
+		struct sk_buff *skb;
+
+		parent = *p;
+		skb = rb_to_skb(parent);
+		if (ktime_after(txtime, skb->tstamp))
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&nskb->rbnode, parent, p);
+	rb_insert_color(&nskb->rbnode, &q->head);
+
+	qdisc_qstats_backlog_inc(sch, nskb);
+	sch->q.qlen++;
+
+	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
+	reset_watchdog(sch);
+
+	return NET_XMIT_SUCCESS;
+}
+
+static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
+				 bool drop)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+
+	rb_erase(&skb->rbnode, &q->head);
+
+	/* The rbnode field in the skb re-uses these fields, now that
+	 * we are done with the rbnode, reset them.
+	 */
+	skb->next = NULL;
+	skb->prev = NULL;
+	skb->dev = qdisc_dev(sch);
+
+	qdisc_qstats_backlog_dec(sch, skb);
+
+	if (drop) {
+		struct sk_buff *to_free = NULL;
+
+		qdisc_drop(skb, sch, &to_free);
+		kfree_skb_list(to_free);
+		qdisc_qstats_overlimit(sch);
+	} else {
+		qdisc_bstats_update(sch, skb);
+
+		q->last = skb->tstamp;
+	}
+
+	sch->q.qlen--;
+}
+
+static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+	ktime_t now, next;
+
+	skb = etf_peek_timesortedlist(sch);
+	if (!skb)
+		return NULL;
+
+	now = q->get_time();
+
+	/* Drop if packet has expired while in queue. */
+	/* FIXME: Must return error on the socket's error queue */
+	if (ktime_before(skb->tstamp, now)) {
+		timesortedlist_erase(sch, skb, true);
+		skb = NULL;
+		goto out;
+	}
+
+	/* When in deadline mode, dequeue as soon as possible and change the
+	 * txtime from deadline to (now + delta).
+	 */
+	if (q->deadline_mode) {
+		timesortedlist_erase(sch, skb, false);
+		skb->tstamp = now;
+		goto out;
+	}
+
+	next = ktime_sub_ns(skb->tstamp, q->delta);
+
+	/* Dequeue only if now is within the [txtime - delta, txtime] range. */
+	if (ktime_after(now, next))
+		timesortedlist_erase(sch, skb, false);
+	else
+		skb = NULL;
+
+out:
+	/* Now we may need to re-arm the qdisc watchdog for the next packet. */
+	reset_watchdog(sch);
+
+	return skb;
+}
+
+static int etf_init(struct Qdisc *sch, struct nlattr *opt,
+		    struct netlink_ext_ack *extack)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct nlattr *tb[TCA_ETF_MAX + 1];
+	struct tc_etf_qopt *qopt;
+	int err;
+
+	if (!opt) {
+		NL_SET_ERR_MSG(extack,
+			       "Missing ETF qdisc options which are mandatory");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_ETF_PARMS]) {
+		NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters");
+		return -EINVAL;
+	}
+
+	qopt = nla_data(tb[TCA_ETF_PARMS]);
+
+	pr_debug("delta %d clockid %d deadline %s\n",
+		 qopt->delta, qopt->clockid,
+		 DEADLINE_MODE_IS_ON(qopt) ? "on" : "off");
+
+	err = validate_input_params(qopt, extack);
+	if (err < 0)
+		return err;
+
+	q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
+
+	/* Everything went OK, save the parameters used. */
+	q->delta = qopt->delta;
+	q->clockid = qopt->clockid;
+	q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
+
+	switch (q->clockid) {
+	case CLOCK_REALTIME:
+		q->get_time = ktime_get_real;
+		break;
+	case CLOCK_MONOTONIC:
+		q->get_time = ktime_get;
+		break;
+	case CLOCK_BOOTTIME:
+		q->get_time = ktime_get_boottime;
+		break;
+	case CLOCK_TAI:
+		q->get_time = ktime_get_clocktai;
+		break;
+	default:
+		NL_SET_ERR_MSG(extack, "Clockid is not supported");
+		return -ENOTSUPP;
+	}
+
+	qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid);
+
+	return 0;
+}
+
+static void timesortedlist_clear(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct rb_node *p = rb_first(&q->head);
+
+	while (p) {
+		struct sk_buff *skb = rb_to_skb(p);
+
+		p = rb_next(p);
+
+		rb_erase(&skb->rbnode, &q->head);
+		rtnl_kfree_skbs(skb, skb);
+		sch->q.qlen--;
+	}
+}
+
+static void etf_reset(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+
+	/* Only cancel watchdog if it's been initialized. */
+	if (q->watchdog.qdisc == sch)
+		qdisc_watchdog_cancel(&q->watchdog);
+
+	/* No matter which mode we are on, it's safe to clear both lists. */
+	timesortedlist_clear(sch);
+	__qdisc_reset_queue(&sch->q);
+
+	sch->qstats.backlog = 0;
+	sch->q.qlen = 0;
+
+	q->last = 0;
+}
+
+static void etf_destroy(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+
+	/* Only cancel watchdog if it's been initialized. */
+	if (q->watchdog.qdisc == sch)
+		qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct tc_etf_qopt opt = { };
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+
+	opt.delta = q->delta;
+	opt.clockid = q->clockid;
+	if (q->deadline_mode)
+		opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+
+	if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct Qdisc_ops etf_qdisc_ops __read_mostly = {
+	.id		=	"etf",
+	.priv_size	=	sizeof(struct etf_sched_data),
+	.enqueue	=	etf_enqueue_timesortedlist,
+	.dequeue	=	etf_dequeue_timesortedlist,
+	.peek		=	etf_peek_timesortedlist,
+	.init		=	etf_init,
+	.reset		=	etf_reset,
+	.destroy	=	etf_destroy,
+	.dump		=	etf_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init etf_module_init(void)
+{
+	return register_qdisc(&etf_qdisc_ops);
+}
+
+static void __exit etf_module_exit(void)
+{
+	unregister_qdisc(&etf_qdisc_ops);
+}
+module_init(etf_module_init)
+module_exit(etf_module_exit)
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From b5cb15d9372abc9adc4e844c0c1bf594ca6a7695 Mon Sep 17 00:00:00 2001
From: Chris von Recklinghausen <crecklin@redhat.com>
Date: Tue, 3 Jul 2018 15:43:08 -0400
Subject: usercopy: Allow boot cmdline disabling of hardening

Enabling HARDENED_USERCOPY may cause measurable regressions in networking
performance: up to 8% under UDP flood.

I ran a small packet UDP flood using pktgen vs. a host b2b connected. On
the receiver side the UDP packets are processed by a simple user space
process that just reads and drops them:

https://github.com/netoptimizer/network-testing/blob/master/src/udp_sink.c

Not very useful from a functional PoV, but it helps to pin-point
bottlenecks in the networking stack.

When running a kernel with CONFIG_HARDENED_USERCOPY=y, I see a 5-8%
regression in the receive tput, compared to the same kernel without this
option enabled.

With CONFIG_HARDENED_USERCOPY=y, perf shows ~6% of CPU time spent
cumulatively in __check_object_size (~4%) and __virt_addr_valid (~2%).

The call-chain is:

__GI___libc_recvfrom
entry_SYSCALL_64_after_hwframe
do_syscall_64
__x64_sys_recvfrom
__sys_recvfrom
inet_recvmsg
udp_recvmsg
__check_object_size

udp_recvmsg() actually calls copy_to_iter() (inlined) and the latters
calls check_copy_size() (again, inlined).

A generic distro may want to enable HARDENED_USERCOPY in their default
kernel config, but at the same time, such distro may want to be able to
avoid the performance penalties in with the default configuration and
disable the stricter check on a per-boot basis.

This change adds a boot parameter that conditionally disables
HARDENED_USERCOPY via "hardened_usercopy=off".

Signed-off-by: Chris von Recklinghausen <crecklin@redhat.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 11 +++++++++++
 include/linux/jump_label.h                      |  6 ++++++
 mm/usercopy.c                                   | 25 +++++++++++++++++++++++++
 3 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index efc7aa7a0670..560d4dc66f02 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -816,6 +816,17 @@
 	disable=	[IPV6]
 			See Documentation/networking/ipv6.txt.
 
+	hardened_usercopy=
+                        [KNL] Under CONFIG_HARDENED_USERCOPY, whether
+                        hardening is enabled for this boot. Hardened
+                        usercopy checking is used to protect the kernel
+                        from reading or writing beyond known memory
+                        allocation boundaries as a proactive defense
+                        against bounds-checking flaws in the kernel's
+                        copy_to_user()/copy_from_user() interface.
+                on      Perform hardened usercopy checks (default).
+                off     Disable hardened usercopy checks.
+
 	disable_radix	[PPC]
 			Disable RADIX MMU mode on POWER9
 
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index b46b541c67c4..1a0b6f17a5d6 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -299,12 +299,18 @@ struct static_key_false {
 #define DEFINE_STATIC_KEY_TRUE(name)	\
 	struct static_key_true name = STATIC_KEY_TRUE_INIT
 
+#define DEFINE_STATIC_KEY_TRUE_RO(name)	\
+	struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT
+
 #define DECLARE_STATIC_KEY_TRUE(name)	\
 	extern struct static_key_true name
 
 #define DEFINE_STATIC_KEY_FALSE(name)	\
 	struct static_key_false name = STATIC_KEY_FALSE_INIT
 
+#define DEFINE_STATIC_KEY_FALSE_RO(name)	\
+	struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT
+
 #define DECLARE_STATIC_KEY_FALSE(name)	\
 	extern struct static_key_false name
 
diff --git a/mm/usercopy.c b/mm/usercopy.c
index e9e9325f7638..852eb4e53f06 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -20,6 +20,8 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/thread_info.h>
+#include <linux/atomic.h>
+#include <linux/jump_label.h>
 #include <asm/sections.h>
 
 /*
@@ -240,6 +242,8 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
 	}
 }
 
+static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
+
 /*
  * Validates that the given object is:
  * - not bogus address
@@ -248,6 +252,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
  */
 void __check_object_size(const void *ptr, unsigned long n, bool to_user)
 {
+	if (static_branch_unlikely(&bypass_usercopy_checks))
+		return;
+
 	/* Skip all tests if size is zero. */
 	if (!n)
 		return;
@@ -279,3 +286,21 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
 	check_kernel_text_object((const unsigned long)ptr, n, to_user);
 }
 EXPORT_SYMBOL(__check_object_size);
+
+static bool enable_checks __initdata = true;
+
+static int __init parse_hardened_usercopy(char *str)
+{
+	return strtobool(str, &enable_checks);
+}
+
+__setup("hardened_usercopy=", parse_hardened_usercopy);
+
+static int __init set_hardened_usercopy(void)
+{
+	if (enable_checks == false)
+		static_branch_enable(&bypass_usercopy_checks);
+	return 1;
+}
+
+late_initcall(set_hardened_usercopy);
-- 
cgit v1.2.3


From 6312fe77751f57d4fa2b28abeef84c6a95c28136 Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Thu, 5 Jul 2018 14:34:32 +0800
Subject: net: limit each hash list length to MAX_GRO_SKBS

After commit 07d78363dcff ("net: Convert NAPI gro list into a small hash
table.")' there is 8 hash buckets, which allows more flows to be held for
merging.  but MAX_GRO_SKBS, the total held skb for merging, is 8 skb still,
limit the hash table performance.

keep MAX_GRO_SKBS as 8 skb, but limit each hash list length to 8 skb, not
the total 8 skb

Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  7 +++++-
 net/core/dev.c            | 56 +++++++++++++++++++----------------------------
 2 files changed, 29 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f06ee8f91e74..b683971e500d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -302,6 +302,11 @@ struct netdev_boot_setup {
 
 int __init netdev_boot_setup(char *str);
 
+struct gro_list {
+	struct list_head	list;
+	int			count;
+};
+
 /*
  * Structure for NAPI scheduling similar to tasklet but with weighting
  */
@@ -323,7 +328,7 @@ struct napi_struct {
 	int			poll_owner;
 #endif
 	struct net_device	*dev;
-	struct list_head	gro_hash[GRO_HASH_BUCKETS];
+	struct gro_list		gro_hash[GRO_HASH_BUCKETS];
 	struct sk_buff		*skb;
 	struct hrtimer		timer;
 	struct list_head	dev_list;
diff --git a/net/core/dev.c b/net/core/dev.c
index 7e6a2f66db5c..89825c1eccdc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -149,7 +149,6 @@
 
 #include "net-sysfs.h"
 
-/* Instead of increasing this, you should create a hash table. */
 #define MAX_GRO_SKBS 8
 
 /* This should be increased if a protocol with a bigger head is added. */
@@ -5151,9 +5150,10 @@ out:
 	return netif_receive_skb_internal(skb);
 }
 
-static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *head,
+static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 				   bool flush_old)
 {
+	struct list_head *head = &napi->gro_hash[index].list;
 	struct sk_buff *skb, *p;
 
 	list_for_each_entry_safe_reverse(skb, p, head, list) {
@@ -5162,22 +5162,20 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *h
 		list_del_init(&skb->list);
 		napi_gro_complete(skb);
 		napi->gro_count--;
+		napi->gro_hash[index].count--;
 	}
 }
 
-/* napi->gro_hash contains packets ordered by age.
+/* napi->gro_hash[].list contains packets ordered by age.
  * youngest packets at the head of it.
  * Complete skbs in reverse order to reduce latencies.
  */
 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 {
-	int i;
-
-	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
-		struct list_head *head = &napi->gro_hash[i];
+	u32 i;
 
-		__napi_gro_flush_chain(napi, head, flush_old);
-	}
+	for (i = 0; i < GRO_HASH_BUCKETS; i++)
+		__napi_gro_flush_chain(napi, i, flush_old);
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
@@ -5189,7 +5187,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi,
 	struct list_head *head;
 	struct sk_buff *p;
 
-	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)];
+	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
 	list_for_each_entry(p, head, list) {
 		unsigned long diffs;
 
@@ -5257,27 +5255,13 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 	}
 }
 
-static void gro_flush_oldest(struct napi_struct *napi)
+static void gro_flush_oldest(struct list_head *head)
 {
-	struct sk_buff *oldest = NULL;
-	unsigned long age = jiffies;
-	int i;
-
-	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
-		struct list_head *head = &napi->gro_hash[i];
-		struct sk_buff *skb;
-
-		if (list_empty(head))
-			continue;
+	struct sk_buff *oldest;
 
-		skb = list_last_entry(head, struct sk_buff, list);
-		if (!oldest || time_before(NAPI_GRO_CB(skb)->age, age)) {
-			oldest = skb;
-			age = NAPI_GRO_CB(skb)->age;
-		}
-	}
+	oldest = list_last_entry(head, struct sk_buff, list);
 
-	/* We are called with napi->gro_count >= MAX_GRO_SKBS, so this is
+	/* We are called with head length >= MAX_GRO_SKBS, so this is
 	 * impossible.
 	 */
 	if (WARN_ON_ONCE(!oldest))
@@ -5292,6 +5276,7 @@ static void gro_flush_oldest(struct napi_struct *napi)
 
 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
+	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 	struct list_head *head = &offload_base;
 	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
@@ -5358,6 +5343,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		list_del_init(&pp->list);
 		napi_gro_complete(pp);
 		napi->gro_count--;
+		napi->gro_hash[hash].count--;
 	}
 
 	if (same_flow)
@@ -5366,10 +5352,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (NAPI_GRO_CB(skb)->flush)
 		goto normal;
 
-	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
-		gro_flush_oldest(napi);
+	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
+		gro_flush_oldest(gro_head);
 	} else {
 		napi->gro_count++;
+		napi->gro_hash[hash].count++;
 	}
 	NAPI_GRO_CB(skb)->count = 1;
 	NAPI_GRO_CB(skb)->age = jiffies;
@@ -6006,8 +5993,10 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
 	napi->gro_count = 0;
-	for (i = 0; i < GRO_HASH_BUCKETS; i++)
-		INIT_LIST_HEAD(&napi->gro_hash[i]);
+	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+		INIT_LIST_HEAD(&napi->gro_hash[i].list);
+		napi->gro_hash[i].count = 0;
+	}
 	napi->skb = NULL;
 	napi->poll = poll;
 	if (weight > NAPI_POLL_WEIGHT)
@@ -6047,8 +6036,9 @@ static void flush_gro_hash(struct napi_struct *napi)
 	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 		struct sk_buff *skb, *n;
 
-		list_for_each_entry_safe(skb, n, &napi->gro_hash[i], list)
+		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 			kfree_skb(skb);
+		napi->gro_hash[i].count = 0;
 	}
 }
 
-- 
cgit v1.2.3


From 0380cf7dbaca75c524e34b30979f0806124fa8e6 Mon Sep 17 00:00:00 2001
From: pascal paillet <p.paillet@st.com>
Date: Thu, 5 Jul 2018 14:25:57 +0000
Subject: regulator: core: Change suspend_late to suspend

Change suspend_late ops to suspend normal ops. The goal is to avoid
requesting all the regulator drivers to be operational in suspend late
phase.

Signed-off-by: pascal paillet <p.paillet@st.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c         | 26 +++++++++++++-------------
 include/linux/regulator/driver.h |  2 +-
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 6ed568b96c0e..da9b0fed8330 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -4441,7 +4441,7 @@ void regulator_unregister(struct regulator_dev *rdev)
 EXPORT_SYMBOL_GPL(regulator_unregister);
 
 #ifdef CONFIG_SUSPEND
-static int _regulator_suspend_late(struct device *dev, void *data)
+static int _regulator_suspend(struct device *dev, void *data)
 {
 	struct regulator_dev *rdev = dev_to_rdev(dev);
 	suspend_state_t *state = data;
@@ -4455,20 +4455,20 @@ static int _regulator_suspend_late(struct device *dev, void *data)
 }
 
 /**
- * regulator_suspend_late - prepare regulators for system wide suspend
+ * regulator_suspend - prepare regulators for system wide suspend
  * @state: system suspend state
  *
  * Configure each regulator with it's suspend operating parameters for state.
  */
-static int regulator_suspend_late(struct device *dev)
+static int regulator_suspend(struct device *dev)
 {
 	suspend_state_t state = pm_suspend_target_state;
 
 	return class_for_each_device(&regulator_class, NULL, &state,
-				     _regulator_suspend_late);
+				     _regulator_suspend);
 }
 
-static int _regulator_resume_early(struct device *dev, void *data)
+static int _regulator_resume(struct device *dev, void *data)
 {
 	int ret = 0;
 	struct regulator_dev *rdev = dev_to_rdev(dev);
@@ -4481,35 +4481,35 @@ static int _regulator_resume_early(struct device *dev, void *data)
 
 	regulator_lock(rdev);
 
-	if (rdev->desc->ops->resume_early &&
+	if (rdev->desc->ops->resume &&
 	    (rstate->enabled == ENABLE_IN_SUSPEND ||
 	     rstate->enabled == DISABLE_IN_SUSPEND))
-		ret = rdev->desc->ops->resume_early(rdev);
+		ret = rdev->desc->ops->resume(rdev);
 
 	regulator_unlock(rdev);
 
 	return ret;
 }
 
-static int regulator_resume_early(struct device *dev)
+static int regulator_resume(struct device *dev)
 {
 	suspend_state_t state = pm_suspend_target_state;
 
 	return class_for_each_device(&regulator_class, NULL, &state,
-				     _regulator_resume_early);
+				     _regulator_resume);
 }
 
 #else /* !CONFIG_SUSPEND */
 
-#define regulator_suspend_late	NULL
-#define regulator_resume_early	NULL
+#define regulator_suspend	NULL
+#define regulator_resume	NULL
 
 #endif /* !CONFIG_SUSPEND */
 
 #ifdef CONFIG_PM
 static const struct dev_pm_ops __maybe_unused regulator_pm_ops = {
-	.suspend_late	= regulator_suspend_late,
-	.resume_early	= regulator_resume_early,
+	.suspend	= regulator_suspend,
+	.resume		= regulator_resume,
 };
 #endif
 
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index dea96ee39fdc..0fd8fbb74763 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -220,7 +220,7 @@ struct regulator_ops {
 	/* set regulator suspend operating mode (defined in consumer.h) */
 	int (*set_suspend_mode) (struct regulator_dev *, unsigned int mode);
 
-	int (*resume_early)(struct regulator_dev *rdev);
+	int (*resume)(struct regulator_dev *rdev);
 
 	int (*set_pull_down) (struct regulator_dev *);
 };
-- 
cgit v1.2.3


From d8842211b6f63d3f069df973d137de0a85964965 Mon Sep 17 00:00:00 2001
From: pascal paillet <p.paillet@st.com>
Date: Thu, 5 Jul 2018 14:25:56 +0000
Subject: driver core: Add device_link_remove function

Device_link_remove uses the same arguments than device_link_add. The Goal
is to avoid storing the link pointer.

Signed-off-by: pascal paillet <p.paillet@st.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/core.c    | 30 ++++++++++++++++++++++++++++++
 include/linux/device.h |  1 +
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 36622b52e419..3b380b1f2768 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -365,6 +365,36 @@ void device_link_del(struct device_link *link)
 }
 EXPORT_SYMBOL_GPL(device_link_del);
 
+/**
+ * device_link_remove - remove a link between two devices.
+ * @consumer: Consumer end of the link.
+ * @supplier: Supplier end of the link.
+ *
+ * The caller must ensure proper synchronization of this function with runtime
+ * PM.
+ */
+void device_link_remove(void *consumer, struct device *supplier)
+{
+	struct device_link *link;
+
+	if (WARN_ON(consumer == supplier))
+		return;
+
+	device_links_write_lock();
+	device_pm_lock();
+
+	list_for_each_entry(link, &supplier->links.consumers, s_node) {
+		if (link->consumer == consumer) {
+			kref_put(&link->kref, __device_link_del);
+			break;
+		}
+	}
+
+	device_pm_unlock();
+	device_links_write_unlock();
+}
+EXPORT_SYMBOL_GPL(device_link_remove);
+
 static void device_links_missing_supplier(struct device *dev)
 {
 	struct device_link *link;
diff --git a/include/linux/device.h b/include/linux/device.h
index 055a69dbcd18..9c1c3b1d5e11 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1316,6 +1316,7 @@ extern const char *dev_driver_string(const struct device *dev);
 struct device_link *device_link_add(struct device *consumer,
 				    struct device *supplier, u32 flags);
 void device_link_del(struct device_link *link);
+void device_link_remove(void *consumer, struct device *supplier);
 
 #ifdef CONFIG_PRINTK
 
-- 
cgit v1.2.3


From 2282e125a406e09331c5a785e3df29035c99a607 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 2 Jul 2018 22:05:21 +0200
Subject: leds: triggers: let struct led_trigger::activate() return an error
 code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Given that activating a trigger can fail, let the callback return an
indication. This prevents to have a trigger active according to the
"trigger" sysfs attribute but not functional.

All users are changed accordingly to return 0 for now. There is no intended
change in behaviour.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 drivers/leds/led-triggers.c               | 24 +++++++++++++++++++++---
 drivers/leds/trigger/ledtrig-activity.c   |  8 +++++---
 drivers/leds/trigger/ledtrig-backlight.c  |  8 +++++---
 drivers/leds/trigger/ledtrig-default-on.c |  3 ++-
 drivers/leds/trigger/ledtrig-gpio.c       |  8 +++++---
 drivers/leds/trigger/ledtrig-heartbeat.c  |  8 +++++---
 drivers/leds/trigger/ledtrig-netdev.c     |  8 +++++---
 drivers/leds/trigger/ledtrig-oneshot.c    |  8 +++++---
 drivers/leds/trigger/ledtrig-timer.c      |  8 +++++---
 drivers/leds/trigger/ledtrig-transient.c  |  8 +++++---
 drivers/tty/vt/keyboard.c                 |  4 +++-
 drivers/usb/core/ledtrig-usbport.c        |  7 ++++---
 include/linux/leds.h                      | 14 +++++++++-----
 net/bluetooth/leds.c                      |  6 ++++--
 net/mac80211/led.c                        | 20 +++++++++++++++-----
 net/rfkill/core.c                         |  4 +++-
 16 files changed, 101 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c
index 431123b048a2..a8786f4b3453 100644
--- a/drivers/leds/led-triggers.c
+++ b/drivers/leds/led-triggers.c
@@ -103,15 +103,16 @@ ssize_t led_trigger_show(struct device *dev, struct device_attribute *attr,
 EXPORT_SYMBOL_GPL(led_trigger_show);
 
 /* Caller must ensure led_cdev->trigger_lock held */
-void led_trigger_set(struct led_classdev *led_cdev, struct led_trigger *trig)
+int led_trigger_set(struct led_classdev *led_cdev, struct led_trigger *trig)
 {
 	unsigned long flags;
 	char *event = NULL;
 	char *envp[2];
 	const char *name;
+	int ret;
 
 	if (!led_cdev->trigger && !trig)
-		return;
+		return 0;
 
 	name = trig ? trig->name : "none";
 	event = kasprintf(GFP_KERNEL, "TRIGGER=%s", name);
@@ -134,8 +135,14 @@ void led_trigger_set(struct led_classdev *led_cdev, struct led_trigger *trig)
 		list_add_tail(&led_cdev->trig_list, &trig->led_cdevs);
 		write_unlock_irqrestore(&trig->leddev_list_lock, flags);
 		led_cdev->trigger = trig;
+
 		if (trig->activate)
-			trig->activate(led_cdev);
+			ret = trig->activate(led_cdev);
+		else
+			ret = 0;
+
+		if (ret)
+			goto err_activate;
 	}
 
 	if (event) {
@@ -146,6 +153,17 @@ void led_trigger_set(struct led_classdev *led_cdev, struct led_trigger *trig)
 				"%s: Error sending uevent\n", __func__);
 		kfree(event);
 	}
+
+	return 0;
+
+err_activate:
+	led_cdev->trigger = NULL;
+	write_lock_irqsave(&led_cdev->trigger->leddev_list_lock, flags);
+	list_del(&led_cdev->trig_list);
+	write_unlock_irqrestore(&led_cdev->trigger->leddev_list_lock, flags);
+	led_set_brightness(led_cdev, LED_OFF);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(led_trigger_set);
 
diff --git a/drivers/leds/trigger/ledtrig-activity.c b/drivers/leds/trigger/ledtrig-activity.c
index b39e1bb9a9ec..2fc065fb1754 100644
--- a/drivers/leds/trigger/ledtrig-activity.c
+++ b/drivers/leds/trigger/ledtrig-activity.c
@@ -178,20 +178,20 @@ static ssize_t led_invert_store(struct device *dev,
 
 static DEVICE_ATTR(invert, 0644, led_invert_show, led_invert_store);
 
-static void activity_activate(struct led_classdev *led_cdev)
+static int activity_activate(struct led_classdev *led_cdev)
 {
 	struct activity_data *activity_data;
 	int rc;
 
 	activity_data = kzalloc(sizeof(*activity_data), GFP_KERNEL);
 	if (!activity_data)
-		return;
+		return 0;
 
 	led_cdev->trigger_data = activity_data;
 	rc = device_create_file(led_cdev->dev, &dev_attr_invert);
 	if (rc) {
 		kfree(led_cdev->trigger_data);
-		return;
+		return 0;
 	}
 
 	activity_data->led_cdev = led_cdev;
@@ -201,6 +201,8 @@ static void activity_activate(struct led_classdev *led_cdev)
 	led_activity_function(&activity_data->timer);
 	set_bit(LED_BLINK_SW, &led_cdev->work_flags);
 	led_cdev->activated = true;
+
+	return 0;
 }
 
 static void activity_deactivate(struct led_classdev *led_cdev)
diff --git a/drivers/leds/trigger/ledtrig-backlight.c b/drivers/leds/trigger/ledtrig-backlight.c
index 1ca1f1608f76..84512960d630 100644
--- a/drivers/leds/trigger/ledtrig-backlight.c
+++ b/drivers/leds/trigger/ledtrig-backlight.c
@@ -97,7 +97,7 @@ static ssize_t bl_trig_invert_store(struct device *dev,
 }
 static DEVICE_ATTR(inverted, 0644, bl_trig_invert_show, bl_trig_invert_store);
 
-static void bl_trig_activate(struct led_classdev *led)
+static int bl_trig_activate(struct led_classdev *led)
 {
 	int ret;
 
@@ -107,7 +107,7 @@ static void bl_trig_activate(struct led_classdev *led)
 	led->trigger_data = n;
 	if (!n) {
 		dev_err(led->dev, "unable to allocate backlight trigger\n");
-		return;
+		return 0;
 	}
 
 	ret = device_create_file(led->dev, &dev_attr_inverted);
@@ -124,11 +124,13 @@ static void bl_trig_activate(struct led_classdev *led)
 		dev_err(led->dev, "unable to register backlight trigger\n");
 	led->activated = true;
 
-	return;
+	return 0;
 
 err_invert:
 	led->trigger_data = NULL;
 	kfree(n);
+
+	return 0;
 }
 
 static void bl_trig_deactivate(struct led_classdev *led)
diff --git a/drivers/leds/trigger/ledtrig-default-on.c b/drivers/leds/trigger/ledtrig-default-on.c
index 4ccea04b7a6b..2996fe672198 100644
--- a/drivers/leds/trigger/ledtrig-default-on.c
+++ b/drivers/leds/trigger/ledtrig-default-on.c
@@ -16,9 +16,10 @@
 #include <linux/leds.h>
 #include "../leds.h"
 
-static void defon_trig_activate(struct led_classdev *led_cdev)
+static int defon_trig_activate(struct led_classdev *led_cdev)
 {
 	led_set_brightness_nosleep(led_cdev, led_cdev->max_brightness);
+	return 0;
 }
 
 static struct led_trigger defon_led_trigger = {
diff --git a/drivers/leds/trigger/ledtrig-gpio.c b/drivers/leds/trigger/ledtrig-gpio.c
index 93906a17a4b6..f5358c40d03f 100644
--- a/drivers/leds/trigger/ledtrig-gpio.c
+++ b/drivers/leds/trigger/ledtrig-gpio.c
@@ -162,14 +162,14 @@ static ssize_t gpio_trig_gpio_store(struct device *dev,
 }
 static DEVICE_ATTR(gpio, 0644, gpio_trig_gpio_show, gpio_trig_gpio_store);
 
-static void gpio_trig_activate(struct led_classdev *led)
+static int gpio_trig_activate(struct led_classdev *led)
 {
 	struct gpio_trig_data *gpio_data;
 	int ret;
 
 	gpio_data = kzalloc(sizeof(*gpio_data), GFP_KERNEL);
 	if (!gpio_data)
-		return;
+		return 0;
 
 	ret = device_create_file(led->dev, &dev_attr_gpio);
 	if (ret)
@@ -187,7 +187,7 @@ static void gpio_trig_activate(struct led_classdev *led)
 	led->trigger_data = gpio_data;
 	led->activated = true;
 
-	return;
+	return 0;
 
 err_brightness:
 	device_remove_file(led->dev, &dev_attr_inverted);
@@ -197,6 +197,8 @@ err_inverted:
 
 err_gpio:
 	kfree(gpio_data);
+
+	return 0;
 }
 
 static void gpio_trig_deactivate(struct led_classdev *led)
diff --git a/drivers/leds/trigger/ledtrig-heartbeat.c b/drivers/leds/trigger/ledtrig-heartbeat.c
index 304b929edb8e..03a8dfce8fb9 100644
--- a/drivers/leds/trigger/ledtrig-heartbeat.c
+++ b/drivers/leds/trigger/ledtrig-heartbeat.c
@@ -121,21 +121,21 @@ static ssize_t led_invert_store(struct device *dev,
 
 static DEVICE_ATTR(invert, 0644, led_invert_show, led_invert_store);
 
-static void heartbeat_trig_activate(struct led_classdev *led_cdev)
+static int heartbeat_trig_activate(struct led_classdev *led_cdev)
 {
 	struct heartbeat_trig_data *heartbeat_data;
 	int rc;
 
 	heartbeat_data = kzalloc(sizeof(*heartbeat_data), GFP_KERNEL);
 	if (!heartbeat_data)
-		return;
+		return 0;
 
 	led_cdev->trigger_data = heartbeat_data;
 	heartbeat_data->led_cdev = led_cdev;
 	rc = device_create_file(led_cdev->dev, &dev_attr_invert);
 	if (rc) {
 		kfree(led_cdev->trigger_data);
-		return;
+		return 0;
 	}
 
 	timer_setup(&heartbeat_data->timer, led_heartbeat_function, 0);
@@ -145,6 +145,8 @@ static void heartbeat_trig_activate(struct led_classdev *led_cdev)
 	led_heartbeat_function(&heartbeat_data->timer);
 	set_bit(LED_BLINK_SW, &led_cdev->work_flags);
 	led_cdev->activated = true;
+
+	return 0;
 }
 
 static void heartbeat_trig_deactivate(struct led_classdev *led_cdev)
diff --git a/drivers/leds/trigger/ledtrig-netdev.c b/drivers/leds/trigger/ledtrig-netdev.c
index 6df4781a6308..bb05937255de 100644
--- a/drivers/leds/trigger/ledtrig-netdev.c
+++ b/drivers/leds/trigger/ledtrig-netdev.c
@@ -388,14 +388,14 @@ static void netdev_trig_work(struct work_struct *work)
 			(atomic_read(&trigger_data->interval)*2));
 }
 
-static void netdev_trig_activate(struct led_classdev *led_cdev)
+static int netdev_trig_activate(struct led_classdev *led_cdev)
 {
 	struct led_netdev_data *trigger_data;
 	int rc;
 
 	trigger_data = kzalloc(sizeof(struct led_netdev_data), GFP_KERNEL);
 	if (!trigger_data)
-		return;
+		return 0;
 
 	spin_lock_init(&trigger_data->lock);
 
@@ -432,7 +432,7 @@ static void netdev_trig_activate(struct led_classdev *led_cdev)
 	rc = register_netdevice_notifier(&trigger_data->notifier);
 	if (rc)
 		goto err_out_interval;
-	return;
+	return 0;
 
 err_out_interval:
 	device_remove_file(led_cdev->dev, &dev_attr_interval);
@@ -447,6 +447,8 @@ err_out_device_name:
 err_out:
 	led_cdev->trigger_data = NULL;
 	kfree(trigger_data);
+
+	return 0;
 }
 
 static void netdev_trig_deactivate(struct led_classdev *led_cdev)
diff --git a/drivers/leds/trigger/ledtrig-oneshot.c b/drivers/leds/trigger/ledtrig-oneshot.c
index b8ea9f0f1e19..7bbf92bd7f80 100644
--- a/drivers/leds/trigger/ledtrig-oneshot.c
+++ b/drivers/leds/trigger/ledtrig-oneshot.c
@@ -122,14 +122,14 @@ static DEVICE_ATTR(delay_off, 0644, led_delay_off_show, led_delay_off_store);
 static DEVICE_ATTR(invert, 0644, led_invert_show, led_invert_store);
 static DEVICE_ATTR(shot, 0200, NULL, led_shot);
 
-static void oneshot_trig_activate(struct led_classdev *led_cdev)
+static int oneshot_trig_activate(struct led_classdev *led_cdev)
 {
 	struct oneshot_trig_data *oneshot_data;
 	int rc;
 
 	oneshot_data = kzalloc(sizeof(*oneshot_data), GFP_KERNEL);
 	if (!oneshot_data)
-		return;
+		return 0;
 
 	led_cdev->trigger_data = oneshot_data;
 
@@ -151,7 +151,7 @@ static void oneshot_trig_activate(struct led_classdev *led_cdev)
 
 	led_cdev->activated = true;
 
-	return;
+	return 0;
 
 err_out_invert:
 	device_remove_file(led_cdev->dev, &dev_attr_invert);
@@ -161,6 +161,8 @@ err_out_delayon:
 	device_remove_file(led_cdev->dev, &dev_attr_delay_on);
 err_out_trig_data:
 	kfree(led_cdev->trigger_data);
+
+	return 0;
 }
 
 static void oneshot_trig_deactivate(struct led_classdev *led_cdev)
diff --git a/drivers/leds/trigger/ledtrig-timer.c b/drivers/leds/trigger/ledtrig-timer.c
index 10fc0966b0e3..527055d815ad 100644
--- a/drivers/leds/trigger/ledtrig-timer.c
+++ b/drivers/leds/trigger/ledtrig-timer.c
@@ -70,7 +70,7 @@ static ssize_t led_delay_off_store(struct device *dev,
 static DEVICE_ATTR(delay_on, 0644, led_delay_on_show, led_delay_on_store);
 static DEVICE_ATTR(delay_off, 0644, led_delay_off_show, led_delay_off_store);
 
-static void timer_trig_activate(struct led_classdev *led_cdev)
+static int timer_trig_activate(struct led_classdev *led_cdev)
 {
 	int rc;
 
@@ -78,7 +78,7 @@ static void timer_trig_activate(struct led_classdev *led_cdev)
 
 	rc = device_create_file(led_cdev->dev, &dev_attr_delay_on);
 	if (rc)
-		return;
+		return 0;
 	rc = device_create_file(led_cdev->dev, &dev_attr_delay_off);
 	if (rc)
 		goto err_out_delayon;
@@ -87,10 +87,12 @@ static void timer_trig_activate(struct led_classdev *led_cdev)
 		      &led_cdev->blink_delay_off);
 	led_cdev->activated = true;
 
-	return;
+	return 0;
 
 err_out_delayon:
 	device_remove_file(led_cdev->dev, &dev_attr_delay_on);
+
+	return 0;
 }
 
 static void timer_trig_deactivate(struct led_classdev *led_cdev)
diff --git a/drivers/leds/trigger/ledtrig-transient.c b/drivers/leds/trigger/ledtrig-transient.c
index 9d1769073562..a55fc58179a9 100644
--- a/drivers/leds/trigger/ledtrig-transient.c
+++ b/drivers/leds/trigger/ledtrig-transient.c
@@ -152,7 +152,7 @@ static DEVICE_ATTR(duration, 0644, transient_duration_show,
 		   transient_duration_store);
 static DEVICE_ATTR(state, 0644, transient_state_show, transient_state_store);
 
-static void transient_trig_activate(struct led_classdev *led_cdev)
+static int transient_trig_activate(struct led_classdev *led_cdev)
 {
 	int rc;
 	struct transient_trig_data *tdata;
@@ -161,7 +161,7 @@ static void transient_trig_activate(struct led_classdev *led_cdev)
 	if (!tdata) {
 		dev_err(led_cdev->dev,
 			"unable to allocate transient trigger\n");
-		return;
+		return 0;
 	}
 	led_cdev->trigger_data = tdata;
 	tdata->led_cdev = led_cdev;
@@ -181,7 +181,7 @@ static void transient_trig_activate(struct led_classdev *led_cdev)
 	timer_setup(&tdata->timer, transient_timer_function, 0);
 	led_cdev->activated = true;
 
-	return;
+	return 0;
 
 err_out_state:
 	device_remove_file(led_cdev->dev, &dev_attr_duration);
@@ -191,6 +191,8 @@ err_out:
 	dev_err(led_cdev->dev, "unable to register transient trigger\n");
 	led_cdev->trigger_data = NULL;
 	kfree(tdata);
+
+	return 0;
 }
 
 static void transient_trig_deactivate(struct led_classdev *led_cdev)
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index d5b4a2b44ab8..de310621b8e7 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -959,7 +959,7 @@ struct kbd_led_trigger {
 	unsigned int mask;
 };
 
-static void kbd_led_trigger_activate(struct led_classdev *cdev)
+static int kbd_led_trigger_activate(struct led_classdev *cdev)
 {
 	struct kbd_led_trigger *trigger =
 		container_of(cdev->trigger, struct kbd_led_trigger, trigger);
@@ -970,6 +970,8 @@ static void kbd_led_trigger_activate(struct led_classdev *cdev)
 				  ledstate & trigger->mask ?
 					LED_FULL : LED_OFF);
 	tasklet_enable(&keyboard_tasklet);
+
+	return 0;
 }
 
 #define KBD_LED_TRIGGER(_led_bit, _name) {			\
diff --git a/drivers/usb/core/ledtrig-usbport.c b/drivers/usb/core/ledtrig-usbport.c
index d775ffea20c3..aa1d51458da7 100644
--- a/drivers/usb/core/ledtrig-usbport.c
+++ b/drivers/usb/core/ledtrig-usbport.c
@@ -298,14 +298,14 @@ static int usbport_trig_notify(struct notifier_block *nb, unsigned long action,
 	return NOTIFY_DONE;
 }
 
-static void usbport_trig_activate(struct led_classdev *led_cdev)
+static int usbport_trig_activate(struct led_classdev *led_cdev)
 {
 	struct usbport_trig_data *usbport_data;
 	int err;
 
 	usbport_data = kzalloc(sizeof(*usbport_data), GFP_KERNEL);
 	if (!usbport_data)
-		return;
+		return 0;
 	usbport_data->led_cdev = led_cdev;
 
 	/* List of ports */
@@ -322,10 +322,11 @@ static void usbport_trig_activate(struct led_classdev *led_cdev)
 	usb_register_notify(&usbport_data->nb);
 
 	led_cdev->activated = true;
-	return;
+	return 0;
 
 err_free:
 	kfree(usbport_data);
+	return 0;
 }
 
 static void usbport_trig_deactivate(struct led_classdev *led_cdev)
diff --git a/include/linux/leds.h b/include/linux/leds.h
index b7e82550e655..ba5baaaa43bf 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -253,7 +253,7 @@ static inline bool led_sysfs_is_disabled(struct led_classdev *led_cdev)
 struct led_trigger {
 	/* Trigger Properties */
 	const char	 *name;
-	void		(*activate)(struct led_classdev *led_cdev);
+	int		(*activate)(struct led_classdev *led_cdev);
 	void		(*deactivate)(struct led_classdev *led_cdev);
 
 	/* LEDs under control by this trigger (for simple triggers) */
@@ -288,8 +288,8 @@ extern void led_trigger_blink_oneshot(struct led_trigger *trigger,
 				      unsigned long *delay_off,
 				      int invert);
 extern void led_trigger_set_default(struct led_classdev *led_cdev);
-extern void led_trigger_set(struct led_classdev *led_cdev,
-			struct led_trigger *trigger);
+extern int led_trigger_set(struct led_classdev *led_cdev,
+			   struct led_trigger *trigger);
 extern void led_trigger_remove(struct led_classdev *led_cdev);
 
 static inline void *led_get_trigger_data(struct led_classdev *led_cdev)
@@ -334,8 +334,12 @@ static inline void led_trigger_blink_oneshot(struct led_trigger *trigger,
 				      unsigned long *delay_off,
 				      int invert) {}
 static inline void led_trigger_set_default(struct led_classdev *led_cdev) {}
-static inline void led_trigger_set(struct led_classdev *led_cdev,
-				struct led_trigger *trigger) {}
+static inline int led_trigger_set(struct led_classdev *led_cdev,
+				  struct led_trigger *trigger)
+{
+	return 0;
+}
+
 static inline void led_trigger_remove(struct led_classdev *led_cdev) {}
 static inline void *led_get_trigger_data(struct led_classdev *led_cdev)
 {
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
index cb670b5594eb..6d59a5023231 100644
--- a/net/bluetooth/leds.c
+++ b/net/bluetooth/leds.c
@@ -43,7 +43,7 @@ void hci_leds_update_powered(struct hci_dev *hdev, bool enabled)
 	led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF);
 }
 
-static void power_activate(struct led_classdev *led_cdev)
+static int power_activate(struct led_classdev *led_cdev)
 {
 	struct hci_basic_led_trigger *htrig;
 	bool powered;
@@ -52,10 +52,12 @@ static void power_activate(struct led_classdev *led_cdev)
 	powered = test_bit(HCI_UP, &htrig->hdev->flags);
 
 	led_trigger_event(led_cdev->trigger, powered ? LED_FULL : LED_OFF);
+
+	return 0;
 }
 
 static struct led_trigger *led_allocate_basic(struct hci_dev *hdev,
-			void (*activate)(struct led_classdev *led_cdev),
+			int (*activate)(struct led_classdev *led_cdev),
 			const char *name)
 {
 	struct hci_basic_led_trigger *htrig;
diff --git a/net/mac80211/led.c b/net/mac80211/led.c
index ba0b507ea691..d6c66fc19716 100644
--- a/net/mac80211/led.c
+++ b/net/mac80211/led.c
@@ -52,13 +52,15 @@ void ieee80211_free_led_names(struct ieee80211_local *local)
 	kfree(local->radio_led.name);
 }
 
-static void ieee80211_tx_led_activate(struct led_classdev *led_cdev)
+static int ieee80211_tx_led_activate(struct led_classdev *led_cdev)
 {
 	struct ieee80211_local *local = container_of(led_cdev->trigger,
 						     struct ieee80211_local,
 						     tx_led);
 
 	atomic_inc(&local->tx_led_active);
+
+	return 0;
 }
 
 static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev)
@@ -70,13 +72,15 @@ static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev)
 	atomic_dec(&local->tx_led_active);
 }
 
-static void ieee80211_rx_led_activate(struct led_classdev *led_cdev)
+static int ieee80211_rx_led_activate(struct led_classdev *led_cdev)
 {
 	struct ieee80211_local *local = container_of(led_cdev->trigger,
 						     struct ieee80211_local,
 						     rx_led);
 
 	atomic_inc(&local->rx_led_active);
+
+	return 0;
 }
 
 static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev)
@@ -88,13 +92,15 @@ static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev)
 	atomic_dec(&local->rx_led_active);
 }
 
-static void ieee80211_assoc_led_activate(struct led_classdev *led_cdev)
+static int ieee80211_assoc_led_activate(struct led_classdev *led_cdev)
 {
 	struct ieee80211_local *local = container_of(led_cdev->trigger,
 						     struct ieee80211_local,
 						     assoc_led);
 
 	atomic_inc(&local->assoc_led_active);
+
+	return 0;
 }
 
 static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev)
@@ -106,13 +112,15 @@ static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev)
 	atomic_dec(&local->assoc_led_active);
 }
 
-static void ieee80211_radio_led_activate(struct led_classdev *led_cdev)
+static int ieee80211_radio_led_activate(struct led_classdev *led_cdev)
 {
 	struct ieee80211_local *local = container_of(led_cdev->trigger,
 						     struct ieee80211_local,
 						     radio_led);
 
 	atomic_inc(&local->radio_led_active);
+
+	return 0;
 }
 
 static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev)
@@ -124,13 +132,15 @@ static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev)
 	atomic_dec(&local->radio_led_active);
 }
 
-static void ieee80211_tpt_led_activate(struct led_classdev *led_cdev)
+static int ieee80211_tpt_led_activate(struct led_classdev *led_cdev)
 {
 	struct ieee80211_local *local = container_of(led_cdev->trigger,
 						     struct ieee80211_local,
 						     tpt_led);
 
 	atomic_inc(&local->tpt_led_active);
+
+	return 0;
 }
 
 static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev)
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index a7a4e6ff9be2..1355f5ca8d22 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -141,13 +141,15 @@ static void rfkill_led_trigger_event(struct rfkill *rfkill)
 		led_trigger_event(trigger, LED_FULL);
 }
 
-static void rfkill_led_trigger_activate(struct led_classdev *led)
+static int rfkill_led_trigger_activate(struct led_classdev *led)
 {
 	struct rfkill *rfkill;
 
 	rfkill = container_of(led->trigger, struct rfkill, led_trigger);
 
 	rfkill_led_trigger_event(rfkill);
+
+	return 0;
 }
 
 const char *rfkill_get_led_trigger_name(struct rfkill *rfkill)
-- 
cgit v1.2.3


From a7e7a3156300a7e1982b03cc9cb8fb0c86434c49 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 2 Jul 2018 22:05:22 +0200
Subject: leds: triggers: add device attribute support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As many triggers use device attributes, add support for these in
led_trigger_set which allows simplifying the drivers accordingly.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 drivers/leds/led-triggers.c | 12 ++++++++++++
 include/linux/leds.h        | 11 +++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c
index a8786f4b3453..3f3e8728d82c 100644
--- a/drivers/leds/led-triggers.c
+++ b/drivers/leds/led-triggers.c
@@ -127,6 +127,7 @@ int led_trigger_set(struct led_classdev *led_cdev, struct led_trigger *trig)
 		led_stop_software_blink(led_cdev);
 		if (led_cdev->trigger->deactivate)
 			led_cdev->trigger->deactivate(led_cdev);
+		device_remove_groups(led_cdev->dev, led_cdev->trigger->groups);
 		led_cdev->trigger = NULL;
 		led_set_brightness(led_cdev, LED_OFF);
 	}
@@ -143,6 +144,12 @@ int led_trigger_set(struct led_classdev *led_cdev, struct led_trigger *trig)
 
 		if (ret)
 			goto err_activate;
+
+		ret = device_add_groups(led_cdev->dev, trig->groups);
+		if (ret) {
+			dev_err(led_cdev->dev, "Failed to add trigger attributes\n");
+			goto err_add_groups;
+		}
 	}
 
 	if (event) {
@@ -156,7 +163,12 @@ int led_trigger_set(struct led_classdev *led_cdev, struct led_trigger *trig)
 
 	return 0;
 
+err_add_groups:
+
+	if (trig->deactivate)
+		trig->deactivate(led_cdev);
 err_activate:
+
 	led_cdev->trigger = NULL;
 	write_lock_irqsave(&led_cdev->trigger->leddev_list_lock, flags);
 	list_del(&led_cdev->trig_list);
diff --git a/include/linux/leds.h b/include/linux/leds.h
index ba5baaaa43bf..33484a5c7478 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -262,8 +262,19 @@ struct led_trigger {
 
 	/* Link to next registered trigger */
 	struct list_head  next_trig;
+
+	const struct attribute_group **groups;
 };
 
+/*
+ * Currently the attributes in struct led_trigger::groups are added directly to
+ * the LED device. As this might change in the future, the following
+ * macros abstract getting the LED device and its trigger_data from the dev
+ * parameter passed to the attribute accessor functions.
+ */
+#define led_trigger_get_led(dev)	((struct led_classdev *)dev_get_drvdata((dev)))
+#define led_trigger_get_drvdata(dev)	(led_get_trigger_data(led_trigger_get_led(dev)))
+
 ssize_t led_trigger_store(struct device *dev, struct device_attribute *attr,
 			const char *buf, size_t count);
 ssize_t led_trigger_show(struct device *dev, struct device_attribute *attr,
-- 
cgit v1.2.3


From a0b750768371e410d77b60bcf49c18bd45078d55 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 2 Jul 2018 22:05:24 +0200
Subject: leds: triggers: define module_led_trigger helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This helps to simplify modules that provide a simple led_trigger. It's
inspired by module_platform_driver, module_i2c_driver et al.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 include/linux/leds.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index 33484a5c7478..a3ee10846a4b 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -326,6 +326,10 @@ static inline void *led_get_trigger_data(struct led_classdev *led_cdev)
 extern void led_trigger_rename_static(const char *name,
 				      struct led_trigger *trig);
 
+#define module_led_trigger(__led_trigger) \
+	module_driver(__led_trigger, led_trigger_register, \
+		      led_trigger_unregister)
+
 #else
 
 /* Trigger has no members */
-- 
cgit v1.2.3


From 9acc560de2aac73ef99c54f0fdfb86b4684296b5 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 2 Jul 2018 22:05:25 +0200
Subject: leds: triggers: new function led_set_trigger_data()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is the natural counter part to the already existing
led_get_trigger_data().

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 include/linux/leds.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index a3ee10846a4b..834683d603f9 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -303,6 +303,12 @@ extern int led_trigger_set(struct led_classdev *led_cdev,
 			   struct led_trigger *trigger);
 extern void led_trigger_remove(struct led_classdev *led_cdev);
 
+static inline void led_set_trigger_data(struct led_classdev *led_cdev,
+					void *trigger_data)
+{
+	led_cdev->trigger_data = trigger_data;
+}
+
 static inline void *led_get_trigger_data(struct led_classdev *led_cdev)
 {
 	return led_cdev->trigger_data;
@@ -356,6 +362,7 @@ static inline int led_trigger_set(struct led_classdev *led_cdev,
 }
 
 static inline void led_trigger_remove(struct led_classdev *led_cdev) {}
+static inline void led_set_trigger_data(struct led_classdev *led_cdev) {}
 static inline void *led_get_trigger_data(struct led_classdev *led_cdev)
 {
 	return NULL;
-- 
cgit v1.2.3


From 576c7218a1546e0153480b208b125509cec71470 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Mon, 25 Jun 2018 13:17:41 -0500
Subject: PCI: Export pcie_get_speed_cap and pcie_get_width_cap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So drivers can use them.  This can be used to replace
duplicate code in the drm subsystem.

Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/pci/pci.c   | 2 ++
 include/linux/pci.h | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 97acba712e4e..22adaf35b136 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5222,6 +5222,7 @@ enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
 
 	return PCI_SPEED_UNKNOWN;
 }
+EXPORT_SYMBOL(pcie_get_speed_cap);
 
 /**
  * pcie_get_width_cap - query for the PCI device's link width capability
@@ -5240,6 +5241,7 @@ enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev)
 
 	return PCIE_LNK_WIDTH_UNKNOWN;
 }
+EXPORT_SYMBOL(pcie_get_width_cap);
 
 /**
  * pcie_bandwidth_capable - calculate a PCI device's link bandwidth capability
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 340029b2fb38..6e0c0803b241 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -261,6 +261,9 @@ enum pci_bus_speed {
 	PCI_SPEED_UNKNOWN		= 0xff,
 };
 
+enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev);
+enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev);
+
 struct pci_cap_saved_data {
 	u16		cap_nr;
 	bool		cap_extended;
-- 
cgit v1.2.3


From 1b0707a781eeaeeaacb464e18bb3f049b99b496b Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 7 Jun 2018 19:50:38 +0000
Subject: Bluetooth: remove unused bt-nokia-h4p.h header

Nothing in tree use this header which seems a remains of a staging
driver.
This patch remove it.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Reviewed-by: Sebastian Reichel <sebastian.reichel@collabora.co.uk>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/platform_data/bt-nokia-h4p.h | 38 ------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 include/linux/platform_data/bt-nokia-h4p.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/bt-nokia-h4p.h b/include/linux/platform_data/bt-nokia-h4p.h
deleted file mode 100644
index 30d169dfadf3..000000000000
--- a/include/linux/platform_data/bt-nokia-h4p.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This file is part of Nokia H4P bluetooth driver
- *
- * Copyright (C) 2010 Nokia Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
- * 02110-1301 USA
- *
- */
-
-
-/**
- * struct hci_h4p_platform data - hci_h4p Platform data structure
- */
-struct hci_h4p_platform_data {
-	int chip_type;
-	int bt_sysclk;
-	unsigned int bt_wakeup_gpio;
-	unsigned int host_wakeup_gpio;
-	unsigned int reset_gpio;
-	int reset_gpio_shared;
-	unsigned int uart_irq;
-	phys_addr_t uart_base;
-	const char *uart_iclk;
-	const char *uart_fclk;
-	void (*set_pm_limits)(struct device *dev, bool set);
-};
-- 
cgit v1.2.3


From 06ae48269d1e0324d806fca30fe77112f4a4a14a Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 6 Jul 2018 15:13:18 -0700
Subject: lib: reciprocal_div: implement the improved algorithm on the paper
 mentioned

The new added "reciprocal_value_adv" implements the advanced version of the
algorithm described in Figure 4.2 of the paper except when
"divisor > (1U << 31)" whose ceil(log2(d)) result will be 32 which then
requires u128 divide on host. The exception case could be easily handled
before calling "reciprocal_value_adv".

The advanced version requires more complex calculation to get the
reciprocal multiplier and other control variables, but then could reduce
the required emulation operations.

It makes no sense to use this advanced version for host divide emulation,
those extra complexities for calculating multiplier etc could completely
waive our saving on emulation operations.

However, it makes sense to use it for JIT divide code generation (for
example eBPF JIT backends) for which we are willing to trade performance of
JITed code with that of host. As shown by the following pseudo code, the
required emulation operations could go down from 6 (the basic version) to 3
or 4.

To use the result of "reciprocal_value_adv", suppose we want to calculate
n/d, the C-style pseudo code will be the following, it could be easily
changed to real code generation for other JIT targets.

  struct reciprocal_value_adv rvalue;
  u8 pre_shift, exp;

  // handle exception case.
  if (d >= (1U << 31)) {
    result = n >= d;
    return;
  }
  rvalue = reciprocal_value_adv(d, 32)
  exp = rvalue.exp;
  if (rvalue.is_wide_m && !(d & 1)) {
    // floor(log2(d & (2^32 -d)))
    pre_shift = fls(d & -d) - 1;
    rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
  } else {
    pre_shift = 0;
  }

  // code generation starts.
  if (imm == 1U << exp) {
    result = n >> exp;
  } else if (rvalue.is_wide_m) {
    // pre_shift must be zero when reached here.
    t = (n * rvalue.m) >> 32;
    result = n - t;
    result >>= 1;
    result += t;
    result >>= rvalue.sh - 1;
  } else {
    if (pre_shift)
      result = n >> pre_shift;
    result = ((u64)result * rvalue.m) >> 32;
    result >>= rvalue.sh;
  }

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/reciprocal_div.h | 68 ++++++++++++++++++++++++++++++++++++++++++
 lib/reciprocal_div.c           | 41 +++++++++++++++++++++++++
 2 files changed, 109 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h
index e031e9f2f9d8..585ce89c0f33 100644
--- a/include/linux/reciprocal_div.h
+++ b/include/linux/reciprocal_div.h
@@ -25,6 +25,9 @@ struct reciprocal_value {
 	u8 sh1, sh2;
 };
 
+/* "reciprocal_value" and "reciprocal_divide" together implement the basic
+ * version of the algorithm described in Figure 4.1 of the paper.
+ */
 struct reciprocal_value reciprocal_value(u32 d);
 
 static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
@@ -33,4 +36,69 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
 	return (t + ((a - t) >> R.sh1)) >> R.sh2;
 }
 
+struct reciprocal_value_adv {
+	u32 m;
+	u8 sh, exp;
+	bool is_wide_m;
+};
+
+/* "reciprocal_value_adv" implements the advanced version of the algorithm
+ * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
+ * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
+ * exception case could be easily handled before calling "reciprocal_value_adv".
+ *
+ * The advanced version requires more complex calculation to get the reciprocal
+ * multiplier and other control variables, but then could reduce the required
+ * emulation operations.
+ *
+ * It makes no sense to use this advanced version for host divide emulation,
+ * those extra complexities for calculating multiplier etc could completely
+ * waive our saving on emulation operations.
+ *
+ * However, it makes sense to use it for JIT divide code generation for which
+ * we are willing to trade performance of JITed code with that of host. As shown
+ * by the following pseudo code, the required emulation operations could go down
+ * from 6 (the basic version) to 3 or 4.
+ *
+ * To use the result of "reciprocal_value_adv", suppose we want to calculate
+ * n/d, the pseudo C code will be:
+ *
+ *   struct reciprocal_value_adv rvalue;
+ *   u8 pre_shift, exp;
+ *
+ *   // handle exception case.
+ *   if (d >= (1U << 31)) {
+ *     result = n >= d;
+ *     return;
+ *   }
+ *
+ *   rvalue = reciprocal_value_adv(d, 32)
+ *   exp = rvalue.exp;
+ *   if (rvalue.is_wide_m && !(d & 1)) {
+ *     // floor(log2(d & (2^32 -d)))
+ *     pre_shift = fls(d & -d) - 1;
+ *     rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
+ *   } else {
+ *     pre_shift = 0;
+ *   }
+ *
+ *   // code generation starts.
+ *   if (imm == 1U << exp) {
+ *     result = n >> exp;
+ *   } else if (rvalue.is_wide_m) {
+ *     // pre_shift must be zero when reached here.
+ *     t = (n * rvalue.m) >> 32;
+ *     result = n - t;
+ *     result >>= 1;
+ *     result += t;
+ *     result >>= rvalue.sh - 1;
+ *   } else {
+ *     if (pre_shift)
+ *       result = n >> pre_shift;
+ *     result = ((u64)result * rvalue.m) >> 32;
+ *     result >>= rvalue.sh;
+ *   }
+ */
+struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
+
 #endif /* _LINUX_RECIPROCAL_DIV_H */
diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
index fcb4ce682c6f..bf043258fa00 100644
--- a/lib/reciprocal_div.c
+++ b/lib/reciprocal_div.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/bug.h>
 #include <linux/kernel.h>
 #include <asm/div64.h>
 #include <linux/reciprocal_div.h>
@@ -26,3 +27,43 @@ struct reciprocal_value reciprocal_value(u32 d)
 	return R;
 }
 EXPORT_SYMBOL(reciprocal_value);
+
+struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
+{
+	struct reciprocal_value_adv R;
+	u32 l, post_shift;
+	u64 mhigh, mlow;
+
+	/* ceil(log2(d)) */
+	l = fls(d - 1);
+	/* NOTE: mlow/mhigh could overflow u64 when l == 32. This case needs to
+	 * be handled before calling "reciprocal_value_adv", please see the
+	 * comment at include/linux/reciprocal_div.h.
+	 */
+	WARN(l == 32,
+	     "ceil(log2(0x%08x)) == 32, %s doesn't support such divisor",
+	     d, __func__);
+	post_shift = l;
+	mlow = 1ULL << (32 + l);
+	do_div(mlow, d);
+	mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec));
+	do_div(mhigh, d);
+
+	for (; post_shift > 0; post_shift--) {
+		u64 lo = mlow >> 1, hi = mhigh >> 1;
+
+		if (lo >= hi)
+			break;
+
+		mlow = lo;
+		mhigh = hi;
+	}
+
+	R.m = (u32)mhigh;
+	R.sh = post_shift;
+	R.exp = l;
+	R.is_wide_m = mhigh > U32_MAX;
+
+	return R;
+}
+EXPORT_SYMBOL(reciprocal_value_adv);
-- 
cgit v1.2.3


From b233504033dbd65740e59681820ccfd0a2a8ec53 Mon Sep 17 00:00:00 2001
From: Yifeng Sun <pkusunyifeng@gmail.com>
Date: Mon, 2 Jul 2018 08:18:03 -0700
Subject: openvswitch: kernel datapath clone action

Add 'clone' action to kernel datapath by using existing functions.
When actions within clone don't modify the current flow, the flow
key is not cloned before executing clone actions.

This is a follow up patch for this incomplete work:
https://patchwork.ozlabs.org/patch/722096/

v1 -> v2:
Refactor as advised by reviewer.

Signed-off-by: Yifeng Sun <pkusunyifeng@gmail.com>
Signed-off-by: Andy Zhou <azhou@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/openvswitch.h      |  5 +++
 include/uapi/linux/openvswitch.h |  3 ++
 net/openvswitch/actions.c        | 33 ++++++++++++++++++
 net/openvswitch/flow_netlink.c   | 73 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 114 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
index e6b240b6196c..379affc63e24 100644
--- a/include/linux/openvswitch.h
+++ b/include/linux/openvswitch.h
@@ -21,4 +21,9 @@
 
 #include <uapi/linux/openvswitch.h>
 
+#define OVS_CLONE_ATTR_EXEC      0   /* Specify an u32 value. When nonzero,
+				      * actions in clone will not change flow
+				      * keys. False otherwise.
+				      */
+
 #endif /* _LINUX_OPENVSWITCH_H */
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 863aabaa5cc9..dbe0cbe4f1b7 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -840,6 +840,8 @@ struct ovs_action_push_eth {
  * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet.
  * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the
  * packet, or modify the packet (e.g., change the DSCP field).
+ * @OVS_ACTION_ATTR_CLONE: make a copy of the packet and execute a list of
+ * actions without affecting the original packet and key.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -873,6 +875,7 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_PUSH_NSH,     /* Nested OVS_NSH_KEY_ATTR_*. */
 	OVS_ACTION_ATTR_POP_NSH,      /* No argument. */
 	OVS_ACTION_ATTR_METER,        /* u32 meter ID. */
+	OVS_ACTION_ATTR_CLONE,        /* Nested OVS_CLONE_ATTR_*.  */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 30a5df27116e..85ae53d8fd09 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1057,6 +1057,28 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
 			     clone_flow_key);
 }
 
+/* When 'last' is true, clone() should always consume the 'skb'.
+ * Otherwise, clone() should keep 'skb' intact regardless what
+ * actions are executed within clone().
+ */
+static int clone(struct datapath *dp, struct sk_buff *skb,
+		 struct sw_flow_key *key, const struct nlattr *attr,
+		 bool last)
+{
+	struct nlattr *actions;
+	struct nlattr *clone_arg;
+	int rem = nla_len(attr);
+	bool dont_clone_flow_key;
+
+	/* The first action is always 'OVS_CLONE_ATTR_ARG'. */
+	clone_arg = nla_data(attr);
+	dont_clone_flow_key = nla_get_u32(clone_arg);
+	actions = nla_next(clone_arg, &rem);
+
+	return clone_execute(dp, skb, key, 0, actions, rem, last,
+			     !dont_clone_flow_key);
+}
+
 static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
 			 const struct nlattr *attr)
 {
@@ -1336,6 +1358,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 				consume_skb(skb);
 				return 0;
 			}
+			break;
+
+		case OVS_ACTION_ATTR_CLONE: {
+			bool last = nla_is_last(a, rem);
+
+			err = clone(dp, skb, key, a, last);
+			if (last)
+				return err;
+
+			break;
+		}
 		}
 
 		if (unlikely(err)) {
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 391c4073a6dc..a70097ecf33c 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2460,6 +2460,40 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
 	return 0;
 }
 
+static int validate_and_copy_clone(struct net *net,
+				   const struct nlattr *attr,
+				   const struct sw_flow_key *key,
+				   struct sw_flow_actions **sfa,
+				   __be16 eth_type, __be16 vlan_tci,
+				   bool log, bool last)
+{
+	int start, err;
+	u32 exec;
+
+	if (nla_len(attr) && nla_len(attr) < NLA_HDRLEN)
+		return -EINVAL;
+
+	start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CLONE, log);
+	if (start < 0)
+		return start;
+
+	exec = last || !actions_may_change_flow(attr);
+
+	err = ovs_nla_add_action(sfa, OVS_CLONE_ATTR_EXEC, &exec,
+				 sizeof(exec), log);
+	if (err)
+		return err;
+
+	err = __ovs_nla_copy_actions(net, attr, key, sfa,
+				     eth_type, vlan_tci, log);
+	if (err)
+		return err;
+
+	add_nested_action_end(*sfa, start);
+
+	return 0;
+}
+
 void ovs_match_init(struct sw_flow_match *match,
 		    struct sw_flow_key *key,
 		    bool reset_key,
@@ -2849,6 +2883,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			[OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
 			[OVS_ACTION_ATTR_POP_NSH] = 0,
 			[OVS_ACTION_ATTR_METER] = sizeof(u32),
+			[OVS_ACTION_ATTR_CLONE] = (u32)-1,
 		};
 		const struct ovs_action_push_vlan *vlan;
 		int type = nla_type(a);
@@ -3038,6 +3073,18 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			/* Non-existent meters are simply ignored.  */
 			break;
 
+		case OVS_ACTION_ATTR_CLONE: {
+			bool last = nla_is_last(a, rem);
+
+			err = validate_and_copy_clone(net, a, key, sfa,
+						      eth_type, vlan_tci,
+						      log, last);
+			if (err)
+				return err;
+			skip_copy = true;
+			break;
+		}
+
 		default:
 			OVS_NLERR(log, "Unknown Action type %d", type);
 			return -EINVAL;
@@ -3116,6 +3163,26 @@ out:
 	return err;
 }
 
+static int clone_action_to_attr(const struct nlattr *attr,
+				struct sk_buff *skb)
+{
+	struct nlattr *start;
+	int err = 0, rem = nla_len(attr);
+
+	start = nla_nest_start(skb, OVS_ACTION_ATTR_CLONE);
+	if (!start)
+		return -EMSGSIZE;
+
+	err = ovs_nla_put_actions(nla_data(attr), rem, skb);
+
+	if (err)
+		nla_nest_cancel(skb, start);
+	else
+		nla_nest_end(skb, start);
+
+	return err;
+}
+
 static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
 {
 	const struct nlattr *ovs_key = nla_data(a);
@@ -3204,6 +3271,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
 				return err;
 			break;
 
+		case OVS_ACTION_ATTR_CLONE:
+			err = clone_action_to_attr(a, skb);
+			if (err)
+				return err;
+			break;
+
 		default:
 			if (nla_put(skb, type, nla_len(a), nla_data(a)))
 				return -EMSGSIZE;
-- 
cgit v1.2.3


From 69448867abcb231afaa7891ff9d9fd04b2b94a0d Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 5 Jul 2018 09:25:43 +0300
Subject: fs: shave 8 bytes off of struct inode

Here is a link to Linus' reply to Jan's concern about making
i_blkbibts byte addressable:
https://marc.info/?l=linux-fsdevel&m=152882624707975&w=2

Here is a link to an lkp.org report about potential performance
improvement in some workload, which could(?) be related to packing
i_blkbits closer to i_bytes/i_lock:
https://marc.info/?l=linux-fsdevel&m=153077048108198&w=2

Changes since v1:
- Add links to relevant discussions

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index f140c11d35dd..9b00676f5d9d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -275,6 +275,7 @@ struct writeback_control;
 
 /*
  * Write life time hint values.
+ * Stored in struct inode as u8.
  */
 enum rw_hint {
 	WRITE_LIFE_NOT_SET	= 0,
@@ -609,8 +610,8 @@ struct inode {
 	struct timespec64	i_ctime;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	unsigned short          i_bytes;
-	unsigned int		i_blkbits;
-	enum rw_hint		i_write_hint;
+	u8			i_blkbits;
+	u8			i_write_hint;
 	blkcnt_t		i_blocks;
 
 #ifdef __NEED_I_SIZE_ORDERED
-- 
cgit v1.2.3


From 5d6be70add65e3f236642ab0029e356261617cd0 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 29 Jun 2018 13:04:31 +0200
Subject: PM / Domains: Introduce option to attach a device by name to genpd

For the multiple PM domain case, let's introduce a new function called
genpd_dev_pm_attach_by_name(). This allows a device to be associated with
its PM domain through genpd, by using a name based lookup.

Note that, genpd_dev_pm_attach_by_name() shall only be called by the driver
core / PM core, similar to how the existing dev_pm_domain_attach_by_id()
makes use of genpd_dev_pm_attach_by_id(). However, this is implemented by
following changes on top.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Rajendra Nayak <rnayak@codeaurora.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 24 ++++++++++++++++++++++++
 include/linux/pm_domain.h   |  8 ++++++++
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 9e8484189034..79bdca70a81a 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -2374,6 +2374,30 @@ struct device *genpd_dev_pm_attach_by_id(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(genpd_dev_pm_attach_by_id);
 
+/**
+ * genpd_dev_pm_attach_by_name - Associate a device with one of its PM domains.
+ * @dev: The device used to lookup the PM domain.
+ * @name: The name of the PM domain.
+ *
+ * Parse device's OF node to find a PM domain specifier using the
+ * power-domain-names DT property. For further description see
+ * genpd_dev_pm_attach_by_id().
+ */
+struct device *genpd_dev_pm_attach_by_name(struct device *dev, char *name)
+{
+	int index;
+
+	if (!dev->of_node)
+		return NULL;
+
+	index = of_property_match_string(dev->of_node, "power-domain-names",
+					 name);
+	if (index < 0)
+		return NULL;
+
+	return genpd_dev_pm_attach_by_id(dev, index);
+}
+
 static const struct of_device_id idle_state_match[] = {
 	{ .compatible = "domain-idle-state", },
 	{ }
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index cb8d84090cfb..03e14a38462d 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -239,6 +239,8 @@ unsigned int of_genpd_opp_to_performance_state(struct device *dev,
 int genpd_dev_pm_attach(struct device *dev);
 struct device *genpd_dev_pm_attach_by_id(struct device *dev,
 					 unsigned int index);
+struct device *genpd_dev_pm_attach_by_name(struct device *dev,
+					   char *name);
 #else /* !CONFIG_PM_GENERIC_DOMAINS_OF */
 static inline int of_genpd_add_provider_simple(struct device_node *np,
 					struct generic_pm_domain *genpd)
@@ -290,6 +292,12 @@ static inline struct device *genpd_dev_pm_attach_by_id(struct device *dev,
 	return NULL;
 }
 
+static inline struct device *genpd_dev_pm_attach_by_name(struct device *dev,
+							 char *name)
+{
+	return NULL;
+}
+
 static inline
 struct generic_pm_domain *of_genpd_remove_last(struct device_node *np)
 {
-- 
cgit v1.2.3


From 27dceb81f445c58b1d10d27d26eaf4ac2e9e0b00 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 29 Jun 2018 13:04:32 +0200
Subject: PM / Domains: Introduce dev_pm_domain_attach_by_name()

For the multiple PM domain case, let's introduce a new API called
dev_pm_domain_attach_by_name(). This allows a consumer driver to associate
its device with one of its PM domains, by using a name based lookup.

Do note that, currently it's only genpd that supports multiple PM domains
per device, but dev_pm_domain_attach_by_name() can easily by extended to
cover other PM domain types, if/when needed.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Rajendra Nayak <rnayak@codeaurora.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/common.c | 17 +++++++++++++++++
 include/linux/pm_domain.h   |  7 +++++++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c
index df41b4780b3b..b413951c6abc 100644
--- a/drivers/base/power/common.c
+++ b/drivers/base/power/common.c
@@ -152,6 +152,23 @@ struct device *dev_pm_domain_attach_by_id(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_id);
 
+/**
+ * dev_pm_domain_attach_by_name - Associate a device with one of its PM domains.
+ * @dev: The device used to lookup the PM domain.
+ * @name: The name of the PM domain.
+ *
+ * For a detailed function description, see dev_pm_domain_attach_by_id().
+ */
+struct device *dev_pm_domain_attach_by_name(struct device *dev,
+					    char *name)
+{
+	if (dev->pm_domain)
+		return ERR_PTR(-EEXIST);
+
+	return genpd_dev_pm_attach_by_name(dev, name);
+}
+EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_name);
+
 /**
  * dev_pm_domain_detach - Detach a device from its PM domain.
  * @dev: Device to detach.
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 03e14a38462d..776c546d581a 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -309,6 +309,8 @@ struct generic_pm_domain *of_genpd_remove_last(struct device_node *np)
 int dev_pm_domain_attach(struct device *dev, bool power_on);
 struct device *dev_pm_domain_attach_by_id(struct device *dev,
 					  unsigned int index);
+struct device *dev_pm_domain_attach_by_name(struct device *dev,
+					    char *name);
 void dev_pm_domain_detach(struct device *dev, bool power_off);
 void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd);
 #else
@@ -321,6 +323,11 @@ static inline struct device *dev_pm_domain_attach_by_id(struct device *dev,
 {
 	return NULL;
 }
+static inline struct device *dev_pm_domain_attach_by_name(struct device *dev,
+							  char *name)
+{
+	return NULL;
+}
 static inline void dev_pm_domain_detach(struct device *dev, bool power_off) {}
 static inline void dev_pm_domain_set(struct device *dev,
 				     struct dev_pm_domain *pd) {}
-- 
cgit v1.2.3


From e88728f46cfbb59cc7e7acf1d230c05ec093764e Mon Sep 17 00:00:00 2001
From: Vivek Gautam <vivek.gautam@codeaurora.org>
Date: Wed, 27 Jun 2018 18:20:55 +0530
Subject: driver core: Rename flag AUTOREMOVE to AUTOREMOVE_CONSUMER

Now that we want to add another flag to autoremove the device link
on supplier unbind, it's fair to rename the existing flag from
DL_FLAG_AUTOREMOVE to DL_FLAG_AUTOREMOVE_CONSUMER so that we can
add similar flag for supplier later.
And, while we are touching device.h, fix a doc build warning.

Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/driver-api/device_link.rst |  8 ++++----
 drivers/base/core.c                      | 15 ++++++++-------
 drivers/gpu/drm/tegra/dc.c               |  2 +-
 drivers/gpu/ipu-v3/ipu-pre.c             |  3 ++-
 drivers/gpu/ipu-v3/ipu-prg.c             |  3 ++-
 drivers/soc/imx/gpc.c                    |  2 +-
 include/linux/device.h                   | 12 ++++++------
 7 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/device_link.rst b/Documentation/driver-api/device_link.rst
index 70e328e16aad..a005b904a264 100644
--- a/Documentation/driver-api/device_link.rst
+++ b/Documentation/driver-api/device_link.rst
@@ -81,10 +81,10 @@ integration is desired.
 Two other flags are specifically targeted at use cases where the device
 link is added from the consumer's ``->probe`` callback:  ``DL_FLAG_RPM_ACTIVE``
 can be specified to runtime resume the supplier upon addition of the
-device link.  ``DL_FLAG_AUTOREMOVE`` causes the device link to be automatically
-purged when the consumer fails to probe or later unbinds.  This obviates
-the need to explicitly delete the link in the ``->remove`` callback or in
-the error path of the ``->probe`` callback.
+device link.  ``DL_FLAG_AUTOREMOVE_CONSUMER`` causes the device link to be
+automatically purged when the consumer fails to probe or later unbinds.
+This obviates the need to explicitly delete the link in the ``->remove``
+callback or in the error path of the ``->probe`` callback.
 
 Limitations
 ===========
diff --git a/drivers/base/core.c b/drivers/base/core.c
index df3e1a44707a..14c1e3151e08 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -178,10 +178,10 @@ void device_pm_move_to_tail(struct device *dev)
  * of the link.  If DL_FLAG_PM_RUNTIME is not set, DL_FLAG_RPM_ACTIVE will be
  * ignored.
  *
- * If the DL_FLAG_AUTOREMOVE is set, the link will be removed automatically
- * when the consumer device driver unbinds from it.  The combination of both
- * DL_FLAG_AUTOREMOVE and DL_FLAG_STATELESS set is invalid and will cause NULL
- * to be returned.
+ * If the DL_FLAG_AUTOREMOVE_CONSUMER is set, the link will be removed
+ * automatically when the consumer device driver unbinds from it.
+ * The combination of both DL_FLAG_AUTOREMOVE_CONSUMER and DL_FLAG_STATELESS
+ * set is invalid and will cause NULL to be returned.
  *
  * A side effect of the link creation is re-ordering of dpm_list and the
  * devices_kset list by moving the consumer device and all devices depending
@@ -198,7 +198,8 @@ struct device_link *device_link_add(struct device *consumer,
 	struct device_link *link;
 
 	if (!consumer || !supplier ||
-	    ((flags & DL_FLAG_STATELESS) && (flags & DL_FLAG_AUTOREMOVE)))
+	    ((flags & DL_FLAG_STATELESS) &&
+	     (flags & DL_FLAG_AUTOREMOVE_CONSUMER)))
 		return NULL;
 
 	device_links_write_lock();
@@ -479,7 +480,7 @@ static void __device_links_no_driver(struct device *dev)
 		if (link->flags & DL_FLAG_STATELESS)
 			continue;
 
-		if (link->flags & DL_FLAG_AUTOREMOVE)
+		if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER)
 			kref_put(&link->kref, __device_link_del);
 		else if (link->status != DL_STATE_SUPPLIER_UNBIND)
 			WRITE_ONCE(link->status, DL_STATE_AVAILABLE);
@@ -515,7 +516,7 @@ void device_links_driver_cleanup(struct device *dev)
 		if (link->flags & DL_FLAG_STATELESS)
 			continue;
 
-		WARN_ON(link->flags & DL_FLAG_AUTOREMOVE);
+		WARN_ON(link->flags & DL_FLAG_AUTOREMOVE_CONSUMER);
 		WARN_ON(link->status != DL_STATE_SUPPLIER_UNBIND);
 		WRITE_ONCE(link->status, DL_STATE_DORMANT);
 	}
diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c
index c3afe7b2237e..965088afcfad 100644
--- a/drivers/gpu/drm/tegra/dc.c
+++ b/drivers/gpu/drm/tegra/dc.c
@@ -2312,7 +2312,7 @@ static int tegra_dc_couple(struct tegra_dc *dc)
 	 * POWER_CONTROL registers during CRTC enabling.
 	 */
 	if (dc->soc->coupled_pm && dc->pipe == 1) {
-		u32 flags = DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE;
+		u32 flags = DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_CONSUMER;
 		struct device_link *link;
 		struct device *partner;
 
diff --git a/drivers/gpu/ipu-v3/ipu-pre.c b/drivers/gpu/ipu-v3/ipu-pre.c
index 0f70e8847540..2f8db9d62551 100644
--- a/drivers/gpu/ipu-v3/ipu-pre.c
+++ b/drivers/gpu/ipu-v3/ipu-pre.c
@@ -128,7 +128,8 @@ ipu_pre_lookup_by_phandle(struct device *dev, const char *name, int index)
 	list_for_each_entry(pre, &ipu_pre_list, list) {
 		if (pre_node == pre->dev->of_node) {
 			mutex_unlock(&ipu_pre_list_mutex);
-			device_link_add(dev, pre->dev, DL_FLAG_AUTOREMOVE);
+			device_link_add(dev, pre->dev,
+					DL_FLAG_AUTOREMOVE_CONSUMER);
 			of_node_put(pre_node);
 			return pre;
 		}
diff --git a/drivers/gpu/ipu-v3/ipu-prg.c b/drivers/gpu/ipu-v3/ipu-prg.c
index 83f9dd934a5d..38a3a9764e49 100644
--- a/drivers/gpu/ipu-v3/ipu-prg.c
+++ b/drivers/gpu/ipu-v3/ipu-prg.c
@@ -100,7 +100,8 @@ ipu_prg_lookup_by_phandle(struct device *dev, const char *name, int ipu_id)
 	list_for_each_entry(prg, &ipu_prg_list, list) {
 		if (prg_node == prg->dev->of_node) {
 			mutex_unlock(&ipu_prg_list_mutex);
-			device_link_add(dev, prg->dev, DL_FLAG_AUTOREMOVE);
+			device_link_add(dev, prg->dev,
+					DL_FLAG_AUTOREMOVE_CONSUMER);
 			prg->id = ipu_id;
 			of_node_put(prg_node);
 			return prg;
diff --git a/drivers/soc/imx/gpc.c b/drivers/soc/imx/gpc.c
index 32f0748fd067..aa9e65bc965e 100644
--- a/drivers/soc/imx/gpc.c
+++ b/drivers/soc/imx/gpc.c
@@ -202,7 +202,7 @@ static int imx_pgc_power_domain_probe(struct platform_device *pdev)
 			goto genpd_err;
 	}
 
-	device_link_add(dev, dev->parent, DL_FLAG_AUTOREMOVE);
+	device_link_add(dev, dev->parent, DL_FLAG_AUTOREMOVE_CONSUMER);
 
 	return 0;
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 055a69dbcd18..3929805cdd59 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -90,7 +90,7 @@ extern void bus_remove_file(struct bus_type *, struct bus_attribute *);
  * @num_vf:	Called to find out how many virtual functions a device on this
  *		bus supports.
  * @dma_configure:	Called to setup DMA configuration on a device on
-			this bus.
+ *			this bus.
  * @pm:		Power management operations of this bus, callback the specific
  *		device driver's pm-ops.
  * @iommu_ops:  IOMMU specific operations for this bus, used to attach IOMMU
@@ -784,14 +784,14 @@ enum device_link_state {
  * Device link flags.
  *
  * STATELESS: The core won't track the presence of supplier/consumer drivers.
- * AUTOREMOVE: Remove this link automatically on consumer driver unbind.
+ * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind.
  * PM_RUNTIME: If set, the runtime PM framework will use this link.
  * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation.
  */
-#define DL_FLAG_STATELESS	BIT(0)
-#define DL_FLAG_AUTOREMOVE	BIT(1)
-#define DL_FLAG_PM_RUNTIME	BIT(2)
-#define DL_FLAG_RPM_ACTIVE	BIT(3)
+#define DL_FLAG_STATELESS		BIT(0)
+#define DL_FLAG_AUTOREMOVE_CONSUMER	BIT(1)
+#define DL_FLAG_PM_RUNTIME		BIT(2)
+#define DL_FLAG_RPM_ACTIVE		BIT(3)
 
 /**
  * struct device_link - Device link representation.
-- 
cgit v1.2.3


From 1689cac5b32a6db6f812e8063ea418a7cf023d03 Mon Sep 17 00:00:00 2001
From: Vivek Gautam <vivek.gautam@codeaurora.org>
Date: Wed, 27 Jun 2018 18:20:56 +0530
Subject: driver core: Add flag to autoremove device link on supplier unbind

Add a flag to autoremove the device links on supplier driver
unbind. This obviates the need to explicitly delete the link
in the remove path.
We remove these links only when the supplier's link to its
consumers has gone to DL_STATE_SUPPLIER_UNBIND state.

Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
Suggested-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/driver-api/device_link.rst |  4 ++++
 drivers/base/core.c                      | 10 ++++++++++
 include/linux/device.h                   |  2 ++
 3 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/device_link.rst b/Documentation/driver-api/device_link.rst
index a005b904a264..d6763272e747 100644
--- a/Documentation/driver-api/device_link.rst
+++ b/Documentation/driver-api/device_link.rst
@@ -86,6 +86,10 @@ automatically purged when the consumer fails to probe or later unbinds.
 This obviates the need to explicitly delete the link in the ``->remove``
 callback or in the error path of the ``->probe`` callback.
 
+Similarly, when the device link is added from supplier's ``->probe`` callback,
+``DL_FLAG_AUTOREMOVE_SUPPLIER`` causes the device link to be automatically
+purged when the supplier fails to probe or later unbinds.
+
 Limitations
 ===========
 
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 14c1e3151e08..e721218bf352 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -518,6 +518,16 @@ void device_links_driver_cleanup(struct device *dev)
 
 		WARN_ON(link->flags & DL_FLAG_AUTOREMOVE_CONSUMER);
 		WARN_ON(link->status != DL_STATE_SUPPLIER_UNBIND);
+
+		/*
+		 * autoremove the links between this @dev and its consumer
+		 * devices that are not active, i.e. where the link state
+		 * has moved to DL_STATE_SUPPLIER_UNBIND.
+		 */
+		if (link->status == DL_STATE_SUPPLIER_UNBIND &&
+		    link->flags & DL_FLAG_AUTOREMOVE_SUPPLIER)
+			kref_put(&link->kref, __device_link_del);
+
 		WRITE_ONCE(link->status, DL_STATE_DORMANT);
 	}
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 3929805cdd59..e80920452b49 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -787,11 +787,13 @@ enum device_link_state {
  * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind.
  * PM_RUNTIME: If set, the runtime PM framework will use this link.
  * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation.
+ * AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind.
  */
 #define DL_FLAG_STATELESS		BIT(0)
 #define DL_FLAG_AUTOREMOVE_CONSUMER	BIT(1)
 #define DL_FLAG_PM_RUNTIME		BIT(2)
 #define DL_FLAG_RPM_ACTIVE		BIT(3)
+#define DL_FLAG_AUTOREMOVE_SUPPLIER	BIT(4)
 
 /**
  * struct device_link - Device link representation.
-- 
cgit v1.2.3


From 63f3fb8d7ccb813e0a1f2ceedb9a3ebe2685b620 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Mon, 2 Jul 2018 15:59:37 -0700
Subject: pinctrl: Document pin_config_group_get() return codes like
 pin_config_get()

The pinconf_generic_dump_one() function makes the assumption that
pin_config_group_get() should return -EINVAL and -ENOTSUPP just like
pin_config_get() does.  Document that so it's more obvious.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinconf.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h
index 09eb80f2574a..8dd85d302b90 100644
--- a/include/linux/pinctrl/pinconf.h
+++ b/include/linux/pinctrl/pinconf.h
@@ -28,7 +28,8 @@ struct seq_file;
  *	is not available on this controller this should return -ENOTSUPP
  *	and if it is available but disabled it should return -EINVAL
  * @pin_config_set: configure an individual pin
- * @pin_config_group_get: get configurations for an entire pin group
+ * @pin_config_group_get: get configurations for an entire pin group; should
+ *	return -ENOTSUPP and -EINVAL using the same rules as pin_config_get.
  * @pin_config_group_set: configure all pins in a group
  * @pin_config_dbg_parse_modify: optional debugfs to modify a pin configuration
  * @pin_config_dbg_show: optional debugfs display hook that will provide
-- 
cgit v1.2.3


From 03fc7f9c99c1e7ae2925d459e8487f1a6f199f79 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Wed, 27 Jun 2018 16:20:28 +0200
Subject: printk/nmi: Prevent deadlock when accessing the main log buffer in
 NMI

The commit 719f6a7040f1bdaf96 ("printk: Use the main logbuf in NMI
when logbuf_lock is available") brought back the possible deadlocks
in printk() and NMI.

The check of logbuf_lock is done only in printk_nmi_enter() to prevent
mixed output. But another CPU might take the lock later, enter NMI, and:

      + Both NMIs might be serialized by yet another lock, for example,
	the one in nmi_cpu_backtrace().

      + The other CPU might get stopped in NMI, see smp_send_stop()
	in panic().

The only safe solution is to use trylock when storing the message
into the main log-buffer. It might cause reordering when some lines
go to the main lock buffer directly and others are delayed via
the per-CPU buffer. It means that it is not useful in general.

This patch replaces the problematic NMI deferred context with NMI
direct context. It can be used to mark a code that might produce
many messages in NMI and the risk of losing them is more critical
than problems with eventual reordering.

The context is then used when dumping trace buffers on oops. It was
the primary motivation for the original fix. Also the reordering is
even smaller issue there because some traces have their own time stamps.

Finally, nmi_cpu_backtrace() need not longer be serialized because
it will always us the per-CPU buffers again.

Fixes: 719f6a7040f1bdaf96 ("printk: Use the main logbuf in NMI when logbuf_lock is available")
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/20180627142028.11259-1-pmladek@suse.com
To: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org
Acked-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/printk.h      |  4 ++++
 kernel/printk/internal.h    |  9 ++++++-
 kernel/printk/printk_safe.c | 58 +++++++++++++++++++++++++++++----------------
 kernel/trace/trace.c        |  4 +++-
 lib/nmi_backtrace.c         |  3 ---
 5 files changed, 52 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 6d7e800affd8..3ede9f46a494 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -148,9 +148,13 @@ void early_printk(const char *s, ...) { }
 #ifdef CONFIG_PRINTK_NMI
 extern void printk_nmi_enter(void);
 extern void printk_nmi_exit(void);
+extern void printk_nmi_direct_enter(void);
+extern void printk_nmi_direct_exit(void);
 #else
 static inline void printk_nmi_enter(void) { }
 static inline void printk_nmi_exit(void) { }
+static inline void printk_nmi_direct_enter(void) { }
+static inline void printk_nmi_direct_exit(void) { }
 #endif /* PRINTK_NMI */
 
 #ifdef CONFIG_PRINTK
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 2a7d04049af4..0f1898820cba 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -19,11 +19,16 @@
 #ifdef CONFIG_PRINTK
 
 #define PRINTK_SAFE_CONTEXT_MASK	 0x3fffffff
-#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000
+#define PRINTK_NMI_DIRECT_CONTEXT_MASK	 0x40000000
 #define PRINTK_NMI_CONTEXT_MASK		 0x80000000
 
 extern raw_spinlock_t logbuf_lock;
 
+__printf(5, 0)
+int vprintk_store(int facility, int level,
+		  const char *dict, size_t dictlen,
+		  const char *fmt, va_list args);
+
 __printf(1, 0) int vprintk_default(const char *fmt, va_list args);
 __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
 __printf(1, 0) int vprintk_func(const char *fmt, va_list args);
@@ -54,6 +59,8 @@ void __printk_safe_exit(void);
 		local_irq_enable();		\
 	} while (0)
 
+void defer_console_output(void);
+
 #else
 
 __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index d7d091309054..a0a74c533e4b 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -308,24 +308,33 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
 
 void printk_nmi_enter(void)
 {
-	/*
-	 * The size of the extra per-CPU buffer is limited. Use it only when
-	 * the main one is locked. If this CPU is not in the safe context,
-	 * the lock must be taken on another CPU and we could wait for it.
-	 */
-	if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) &&
-	    raw_spin_is_locked(&logbuf_lock)) {
-		this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
-	} else {
-		this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK);
-	}
+	this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
 }
 
 void printk_nmi_exit(void)
 {
-	this_cpu_and(printk_context,
-		     ~(PRINTK_NMI_CONTEXT_MASK |
-		       PRINTK_NMI_DEFERRED_CONTEXT_MASK));
+	this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
+}
+
+/*
+ * Marks a code that might produce many messages in NMI context
+ * and the risk of losing them is more critical than eventual
+ * reordering.
+ *
+ * It has effect only when called in NMI context. Then printk()
+ * will try to store the messages into the main logbuf directly
+ * and use the per-CPU buffers only as a fallback when the lock
+ * is not available.
+ */
+void printk_nmi_direct_enter(void)
+{
+	if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
+		this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK);
+}
+
+void printk_nmi_direct_exit(void)
+{
+	this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK);
 }
 
 #else
@@ -363,6 +372,20 @@ void __printk_safe_exit(void)
 
 __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 {
+	/*
+	 * Try to use the main logbuf even in NMI. But avoid calling console
+	 * drivers that might have their own locks.
+	 */
+	if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) &&
+	    raw_spin_trylock(&logbuf_lock)) {
+		int len;
+
+		len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+		raw_spin_unlock(&logbuf_lock);
+		defer_console_output();
+		return len;
+	}
+
 	/* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
 	if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
 		return vprintk_nmi(fmt, args);
@@ -371,13 +394,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 	if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
 		return vprintk_safe(fmt, args);
 
-	/*
-	 * Use the main logbuf when logbuf_lock is available in NMI.
-	 * But avoid calling console drivers that might have their own locks.
-	 */
-	if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK)
-		return vprintk_deferred(fmt, args);
-
 	/* No obstacles. */
 	return vprintk_default(fmt, args);
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bcd93031d042..f106ad12f72f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8265,6 +8265,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 	tracing_off();
 
 	local_irq_save(flags);
+	printk_nmi_direct_enter();
 
 	/* Simulate the iterator */
 	trace_init_global_iter(&iter);
@@ -8344,7 +8345,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 	for_each_tracing_cpu(cpu) {
 		atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
 	}
- 	atomic_dec(&dump_running);
+	atomic_dec(&dump_running);
+	printk_nmi_direct_exit();
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(ftrace_dump);
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 61a6b5aab07e..15ca78e1c7d4 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -87,11 +87,9 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
 
 bool nmi_cpu_backtrace(struct pt_regs *regs)
 {
-	static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
 	int cpu = smp_processor_id();
 
 	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-		arch_spin_lock(&lock);
 		if (regs && cpu_in_idle(instruction_pointer(regs))) {
 			pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n",
 				cpu, (void *)instruction_pointer(regs));
@@ -102,7 +100,6 @@ bool nmi_cpu_backtrace(struct pt_regs *regs)
 			else
 				dump_stack();
 		}
-		arch_spin_unlock(&lock);
 		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
 		return true;
 	}
-- 
cgit v1.2.3


From 6b1d83d274486615cc341e410467a98fd9c27c0a Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Fri, 15 Jun 2018 14:55:19 -0700
Subject: block: Remove bdev_nr_zones()

Remove this function since it has no callers. This function was
introduced in commit 6cc77e9cb080 ("block: introduce zoned block
devices zone write locking").

Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Matias Bjorling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 79226ca8f80f..49a400afb146 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1639,15 +1639,6 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
 	return 0;
 }
 
-static inline unsigned int bdev_nr_zones(struct block_device *bdev)
-{
-	struct request_queue *q = bdev_get_queue(bdev);
-
-	if (q)
-		return blk_queue_nr_zones(q);
-	return 0;
-}
-
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;
-- 
cgit v1.2.3


From 7c8542b7982264226cf94102950343185869b584 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Fri, 15 Jun 2018 14:55:20 -0700
Subject: block: Inline blk_queue_nr_zones()

Since the implementation of blk_queue_nr_zones() is trivial and since
it only has a single caller, inline this function.

Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Cc: Matias Bjorling <mb@lightnvm.io>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c | 2 +-
 include/linux/blkdev.h | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 1c4532e92938..26e1f8e425a8 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -214,7 +214,7 @@ static int queue_zone_wlock_show(void *data, struct seq_file *m)
 	if (!q->seq_zones_wlock)
 		return 0;
 
-	for (i = 0; i < blk_queue_nr_zones(q); i++)
+	for (i = 0; i < q->nr_zones; i++)
 		if (test_bit(i, q->seq_zones_wlock))
 			seq_printf(m, "%u\n", i);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 49a400afb146..905daa7c647e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -800,11 +800,6 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
 	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
 
-static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
-{
-	return q->nr_zones;
-}
-
 static inline unsigned int blk_queue_zone_no(struct request_queue *q,
 					     sector_t sector)
 {
-- 
cgit v1.2.3


From 6a5ac9846508ad7d1d23881d9d5add35f2e6ae71 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Fri, 15 Jun 2018 14:55:21 -0700
Subject: block: Make struct request_queue smaller for CONFIG_BLK_DEV_ZONED=n

Exclude zoned block device members from struct request_queue for
CONFIG_BLK_DEV_ZONED == n. Avoid breaking the build by only building
the code that uses these struct request_queue members if
CONFIG_BLK_DEV_ZONED != n.

Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Cc: Matias Bjorling <mb@lightnvm.io>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/Kconfig                |  4 ++++
 block/Makefile               |  1 +
 block/blk-mq-debugfs-zoned.c | 24 ++++++++++++++++++++++++
 block/blk-mq-debugfs.c       | 15 ---------------
 block/blk-mq-debugfs.h       |  9 +++++++++
 include/linux/blkdev.h       |  6 ++++++
 6 files changed, 44 insertions(+), 15 deletions(-)
 create mode 100644 block/blk-mq-debugfs-zoned.c

(limited to 'include/linux')

diff --git a/block/Kconfig b/block/Kconfig
index eb50fd4977c2..dfe7bc770fc9 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -177,6 +177,10 @@ config BLK_DEBUG_FS
 	Unless you are building a kernel for a tiny system, you should
 	say Y here.
 
+config BLK_DEBUG_FS_ZONED
+       bool
+       default BLK_DEBUG_FS && BLK_DEV_ZONED
+
 config BLK_SED_OPAL
 	bool "Logic for interfacing with Opal enabled SEDs"
 	---help---
diff --git a/block/Makefile b/block/Makefile
index 6a56303b9925..a8f94cdb75c3 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -34,4 +34,5 @@ obj-$(CONFIG_BLK_MQ_RDMA)	+= blk-mq-rdma.o
 obj-$(CONFIG_BLK_DEV_ZONED)	+= blk-zoned.o
 obj-$(CONFIG_BLK_WBT)		+= blk-wbt.o
 obj-$(CONFIG_BLK_DEBUG_FS)	+= blk-mq-debugfs.o
+obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
 obj-$(CONFIG_BLK_SED_OPAL)	+= sed-opal.o
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
new file mode 100644
index 000000000000..fb2c82c351e4
--- /dev/null
+++ b/block/blk-mq-debugfs-zoned.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/blkdev.h>
+#include "blk-mq-debugfs.h"
+
+int queue_zone_wlock_show(void *data, struct seq_file *m)
+{
+	struct request_queue *q = data;
+	unsigned int i;
+
+	if (!q->seq_zones_wlock)
+		return 0;
+
+	for (i = 0; i < q->nr_zones; i++)
+		if (test_bit(i, q->seq_zones_wlock))
+			seq_printf(m, "%u\n", i);
+
+	return 0;
+}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 26e1f8e425a8..7efe268e4447 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -206,21 +206,6 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf,
 	return count;
 }
 
-static int queue_zone_wlock_show(void *data, struct seq_file *m)
-{
-	struct request_queue *q = data;
-	unsigned int i;
-
-	if (!q->seq_zones_wlock)
-		return 0;
-
-	for (i = 0; i < q->nr_zones; i++)
-		if (test_bit(i, q->seq_zones_wlock))
-			seq_printf(m, "%u\n", i);
-
-	return 0;
-}
-
 static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
 	{ "poll_stat", 0400, queue_poll_stat_show },
 	{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index b9d366e57097..a9160be12be0 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -80,4 +80,13 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc
 }
 #endif
 
+#ifdef CONFIG_BLK_DEBUG_FS_ZONED
+int queue_zone_wlock_show(void *data, struct seq_file *m);
+#else
+static inline int queue_zone_wlock_show(void *data, struct seq_file *m)
+{
+	return 0;
+}
+#endif
+
 #endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 905daa7c647e..ca5a8b046894 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -592,6 +592,7 @@ struct request_queue {
 
 	struct queue_limits	limits;
 
+#ifdef CONFIG_BLK_DEV_ZONED
 	/*
 	 * Zoned block device information for request dispatch control.
 	 * nr_zones is the total number of zones of the device. This is always
@@ -612,6 +613,7 @@ struct request_queue {
 	unsigned int		nr_zones;
 	unsigned long		*seq_zones_bitmap;
 	unsigned long		*seq_zones_wlock;
+#endif /* CONFIG_BLK_DEV_ZONED */
 
 	/*
 	 * sg stuff
@@ -800,6 +802,7 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
 	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
 
+#ifdef CONFIG_BLK_DEV_ZONED
 static inline unsigned int blk_queue_zone_no(struct request_queue *q,
 					     sector_t sector)
 {
@@ -815,6 +818,7 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q,
 		return false;
 	return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
 }
+#endif /* CONFIG_BLK_DEV_ZONED */
 
 static inline bool rq_is_sync(struct request *rq)
 {
@@ -1065,6 +1069,7 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
 }
 
+#ifdef CONFIG_BLK_DEV_ZONED
 static inline unsigned int blk_rq_zone_no(struct request *rq)
 {
 	return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
@@ -1074,6 +1079,7 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
 {
 	return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
 }
+#endif /* CONFIG_BLK_DEV_ZONED */
 
 /*
  * Some commands like WRITE SAME have a payload or data transfer size which
-- 
cgit v1.2.3


From 5815839b3ca16bb1d45939270871169f6803a121 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 25 Jun 2018 19:31:47 +0800
Subject: blk-mq: introduce new lock for protecting hctx->dispatch_wait

Now hctx->lock is only acquired when adding hctx->dispatch_wait to
one wait queue, but not held when removing it from the wait queue.

IO hang can be observed easily if SCHED RESTART is disabled, that means
now RESTART exits just for fixing the issue in blk_mq_mark_tag_wait().

This patch fixes the issue by introducing hctx->dispatch_wait_lock and
holding it for removing hctx->dispatch_wait in blk_mq_dispatch_wake(),
since we need to avoid acquiring hctx->lock in irq context.

Fixes: eb619fdb2d4cb8b3d3419 ("blk-mq: fix issue with shared tag queue re-running")
Cc: Christoph Hellwig <hch@lst.de>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Bart Van Assche <bart.vanassche@wdc.com>
Tested-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 26 +++++++++++++++++---------
 include/linux/blk-mq.h |  1 +
 2 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9eee896a3592..df84281f6af6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -998,7 +998,10 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 
 	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
 
+	spin_lock(&hctx->dispatch_wait_lock);
 	list_del_init(&wait->entry);
+	spin_unlock(&hctx->dispatch_wait_lock);
+
 	blk_mq_run_hw_queue(hctx, true);
 	return 1;
 }
@@ -1012,7 +1015,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
 				 struct request *rq)
 {
-	struct sbq_wait_state *ws;
+	struct wait_queue_head *wq;
 	wait_queue_entry_t *wait;
 	bool ret;
 
@@ -1035,14 +1038,18 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
 	if (!list_empty_careful(&wait->entry))
 		return false;
 
-	spin_lock(&hctx->lock);
+	wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
+
+	spin_lock_irq(&wq->lock);
+	spin_lock(&hctx->dispatch_wait_lock);
 	if (!list_empty(&wait->entry)) {
-		spin_unlock(&hctx->lock);
+		spin_unlock(&hctx->dispatch_wait_lock);
+		spin_unlock_irq(&wq->lock);
 		return false;
 	}
 
-	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
-	add_wait_queue(&ws->wait, wait);
+	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+	__add_wait_queue(wq, wait);
 
 	/*
 	 * It's possible that a tag was freed in the window between the
@@ -1051,7 +1058,8 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
 	 */
 	ret = blk_mq_get_driver_tag(rq);
 	if (!ret) {
-		spin_unlock(&hctx->lock);
+		spin_unlock(&hctx->dispatch_wait_lock);
+		spin_unlock_irq(&wq->lock);
 		return false;
 	}
 
@@ -1059,10 +1067,9 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
 	 * We got a tag, remove ourselves from the wait queue to ensure
 	 * someone else gets the wakeup.
 	 */
-	spin_lock_irq(&ws->wait.lock);
 	list_del_init(&wait->entry);
-	spin_unlock_irq(&ws->wait.lock);
-	spin_unlock(&hctx->lock);
+	spin_unlock(&hctx->dispatch_wait_lock);
+	spin_unlock_irq(&wq->lock);
 
 	return true;
 }
@@ -2142,6 +2149,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 
 	hctx->nr_ctx = 0;
 
+	spin_lock_init(&hctx->dispatch_wait_lock);
 	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
 	INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e3147eb74222..ea690254dab7 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -39,6 +39,7 @@ struct blk_mq_hw_ctx {
 	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
 
+	spinlock_t		dispatch_wait_lock;
 	wait_queue_entry_t	dispatch_wait;
 	atomic_t		wait_index;
 
-- 
cgit v1.2.3


From 97889f9ac24f8d2fc8e703ea7f80c162bab10d4d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 25 Jun 2018 19:31:48 +0800
Subject: blk-mq: remove synchronize_rcu() from blk_mq_del_queue_tag_set()

We have to remove synchronize_rcu() from blk_queue_cleanup(),
otherwise long delay can be caused during lun probe. For removing
it, we have to avoid to iterate the set->tag_list in IO path, eg,
blk_mq_sched_restart().

This patch reverts 5b79413946d (Revert "blk-mq: don't handle
TAG_SHARED in restart"). Given we have fixed enough IO hang issue,
and there isn't any reason to restart all queues in one tags any more,
see the following reasons:

1) blk-mq core can deal with shared-tags case well via blk_mq_get_driver_tag(),
which can wake up queues waiting for driver tag.

2) SCSI is a bit special because it may return BLK_STS_RESOURCE if queue,
target or host is ready, but SCSI built-in restart can cover all these well,
see scsi_end_request(), queue will be rerun after any request initiated from
this host/target is completed.

In my test on scsi_debug(8 luns), this patch may improve IOPS by 20% ~ 30%
when running I/O on these 8 luns concurrently.

Fixes: 705cda97ee3a ("blk-mq: Make it safe to use RCU to iterate over blk_mq_tag_set.tag_list")
Cc: Omar Sandoval <osandov@fb.com>
Cc: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: linux-scsi@vger.kernel.org
Reported-by: Andrew Jones <drjones@redhat.com>
Tested-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sched.c   | 85 +++-----------------------------------------------
 block/blk-mq.c         | 10 ++----
 include/linux/blkdev.h |  2 --
 3 files changed, 7 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 56c493c6cd90..4e027f6108ae 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -59,29 +59,16 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
 	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
 		return;
 
-	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
-		struct request_queue *q = hctx->queue;
-
-		if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-			atomic_inc(&q->shared_hctx_restart);
-	} else
-		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 }
 
-static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
+void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
 	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-		return false;
-
-	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
-		struct request_queue *q = hctx->queue;
-
-		if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-			atomic_dec(&q->shared_hctx_restart);
-	} else
-		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+		return;
+	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 
-	return blk_mq_run_hw_queue(hctx, true);
+	blk_mq_run_hw_queue(hctx, true);
 }
 
 /*
@@ -380,68 +367,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
 	return false;
 }
 
-/**
- * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
- * @pos:    loop cursor.
- * @skip:   the list element that will not be examined. Iteration starts at
- *          @skip->next.
- * @head:   head of the list to examine. This list must have at least one
- *          element, namely @skip.
- * @member: name of the list_head structure within typeof(*pos).
- */
-#define list_for_each_entry_rcu_rr(pos, skip, head, member)		\
-	for ((pos) = (skip);						\
-	     (pos = (pos)->member.next != (head) ? list_entry_rcu(	\
-			(pos)->member.next, typeof(*pos), member) :	\
-	      list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
-	     (pos) != (skip); )
-
-/*
- * Called after a driver tag has been freed to check whether a hctx needs to
- * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
- * queues in a round-robin fashion if the tag set of @hctx is shared with other
- * hardware queues.
- */
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
-{
-	struct blk_mq_tags *const tags = hctx->tags;
-	struct blk_mq_tag_set *const set = hctx->queue->tag_set;
-	struct request_queue *const queue = hctx->queue, *q;
-	struct blk_mq_hw_ctx *hctx2;
-	unsigned int i, j;
-
-	if (set->flags & BLK_MQ_F_TAG_SHARED) {
-		/*
-		 * If this is 0, then we know that no hardware queues
-		 * have RESTART marked. We're done.
-		 */
-		if (!atomic_read(&queue->shared_hctx_restart))
-			return;
-
-		rcu_read_lock();
-		list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
-					   tag_set_list) {
-			queue_for_each_hw_ctx(q, hctx2, i)
-				if (hctx2->tags == tags &&
-				    blk_mq_sched_restart_hctx(hctx2))
-					goto done;
-		}
-		j = hctx->queue_num + 1;
-		for (i = 0; i < queue->nr_hw_queues; i++, j++) {
-			if (j == queue->nr_hw_queues)
-				j = 0;
-			hctx2 = queue->queue_hw_ctx[j];
-			if (hctx2->tags == tags &&
-			    blk_mq_sched_restart_hctx(hctx2))
-				break;
-		}
-done:
-		rcu_read_unlock();
-	} else {
-		blk_mq_sched_restart_hctx(hctx);
-	}
-}
-
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
 				 bool run_queue, bool async)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index df84281f6af6..7c6ff13171ef 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2335,15 +2335,10 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		if (shared) {
-			if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-				atomic_inc(&q->shared_hctx_restart);
+		if (shared)
 			hctx->flags |= BLK_MQ_F_TAG_SHARED;
-		} else {
-			if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-				atomic_dec(&q->shared_hctx_restart);
+		else
 			hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
-		}
 	}
 }
 
@@ -2374,7 +2369,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
 		blk_mq_update_tag_set_depth(set, false);
 	}
 	mutex_unlock(&set->tag_list_lock);
-	synchronize_rcu();
 	INIT_LIST_HEAD(&q->tag_set_list);
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ca5a8b046894..9d05646d5059 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -442,8 +442,6 @@ struct request_queue {
 	int			nr_rqs[2];	/* # allocated [a]sync rqs */
 	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */
 
-	atomic_t		shared_hctx_restart;
-
 	struct blk_queue_stats	*stats;
 	struct rq_wb		*rq_wb;
 
-- 
cgit v1.2.3


From 6e768717304bdbe8d2897ca8298f6b58863fdc41 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 3 Jul 2018 09:03:16 -0600
Subject: blk-mq: dequeue request one by one from sw queue if hctx is busy

It won't be efficient to dequeue request one by one from sw queue,
but we have to do that when queue is busy for better merge performance.

This patch takes the Exponential Weighted Moving Average(EWMA) to figure
out if queue is busy, then only dequeue request one by one from sw queue
when queue is busy.

Fixes: b347689ffbca ("blk-mq-sched: improve dispatching from sw queue")
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Laurence Oberman <loberman@redhat.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Hannes Reinecke <hare@suse.de>
Reported-by: Kashyap Desai <kashyap.desai@broadcom.com>
Tested-by: Kashyap Desai <kashyap.desai@broadcom.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c |  9 +++++++++
 block/blk-mq-sched.c   | 11 ++---------
 block/blk-mq.c         | 33 ++++++++++++++++++++++++++++++++-
 include/linux/blk-mq.h |  3 ++-
 4 files changed, 45 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7efe268e4447..cb1e6cf7ac48 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -622,6 +622,14 @@ static int hctx_active_show(void *data, struct seq_file *m)
 	return 0;
 }
 
+static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+
+	seq_printf(m, "%u\n", hctx->dispatch_busy);
+	return 0;
+}
+
 static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos)
 	__acquires(&ctx->lock)
 {
@@ -783,6 +791,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"queued", 0600, hctx_queued_show, hctx_queued_write},
 	{"run", 0600, hctx_run_show, hctx_run_write},
 	{"active", 0400, hctx_active_show},
+	{"dispatch_busy", 0400, hctx_dispatch_busy_show},
 	{},
 };
 
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index f3b4b5ceb4d1..fdc129e64cc4 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -206,15 +206,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 		}
 	} else if (has_sched_dispatch) {
 		blk_mq_do_dispatch_sched(hctx);
-	} else if (q->mq_ops->get_budget) {
-		/*
-		 * If we need to get budget before queuing request, we
-		 * dequeue request one by one from sw queue for avoiding
-		 * to mess up I/O merge when dispatch runs out of resource.
-		 *
-		 * TODO: get more budgets, and dequeue more requests in
-		 * one time.
-		 */
+	} else if (hctx->dispatch_busy) {
+		/* dequeue request one by one from sw queue if queue is busy */
 		blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 795ba859b16b..850fdd02c385 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1074,6 +1074,35 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
 	return true;
 }
 
+#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
+#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
+/*
+ * Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
+ * - EWMA is one simple way to compute running average value
+ * - weight(7/8 and 1/8) is applied so that it can decrease exponentially
+ * - take 4 as factor for avoiding to get too small(0) result, and this
+ *   factor doesn't matter because EWMA decreases exponentially
+ */
+static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
+{
+	unsigned int ewma;
+
+	if (hctx->queue->elevator)
+		return;
+
+	ewma = hctx->dispatch_busy;
+
+	if (!ewma && !busy)
+		return;
+
+	ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
+	if (busy)
+		ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
+	ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
+
+	hctx->dispatch_busy = ewma;
+}
+
 #define BLK_MQ_RESOURCE_DELAY	3		/* ms units */
 
 /*
@@ -1210,8 +1239,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		else if (needs_restart && (ret == BLK_STS_RESOURCE))
 			blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
 
+		blk_mq_update_dispatch_busy(hctx, true);
 		return false;
-	}
+	} else
+		blk_mq_update_dispatch_busy(hctx, false);
 
 	/*
 	 * If the host/device is unable to accept more work, inform the
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ea690254dab7..d710e92874cc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -35,9 +35,10 @@ struct blk_mq_hw_ctx {
 	struct sbitmap		ctx_map;
 
 	struct blk_mq_ctx	*dispatch_from;
+	unsigned int		dispatch_busy;
 
-	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
+	struct blk_mq_ctx	**ctxs;
 
 	spinlock_t		dispatch_wait_lock;
 	wait_queue_entry_t	dispatch_wait;
-- 
cgit v1.2.3


From 08e18eab0c579ad84399c1899c11899734854eb2 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 3 Jul 2018 11:14:50 -0400
Subject: block: add bi_blkg to the bio for cgroups

Currently io.low uses a bi_cg_private to stash its private data for the
blkg, however other blkcg policies may want to use this as well.  Since
we can get the private data out of the blkg, move this to bi_blkg in the
bio and make it generic, then we can use bio_associate_blkg() to attach
the blkg to the bio.

Theoretically we could simply replace the bi_css with this since we can
get to all the same information from the blkg, however you have to
lookup the blkg, so for example wbc_init_bio() would have to lookup and
possibly allocate the blkg for the css it was trying to attach to the
bio.  This could be problematic and result in us either not attaching
the css at all to the bio, or falling back to the root blkcg if we are
unable to allocate the corresponding blkg.

So for now do this, and in the future if possible we could just replace
the bi_css with bi_blkg and update the helpers to do the correct
translation.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               | 23 +++++++++++++++++++++++
 block/blk-throttle.c      | 21 +++++++--------------
 include/linux/bio.h       |  1 +
 include/linux/blk_types.h |  2 +-
 4 files changed, 32 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 67eff5eddc49..044571538574 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -28,6 +28,7 @@
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
+#include <linux/blk-cgroup.h>
 
 #include <trace/events/block.h>
 #include "blk.h"
@@ -2036,6 +2037,24 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
 }
 EXPORT_SYMBOL_GPL(bio_associate_blkcg);
 
+/**
+ * bio_associate_blkg - associate a bio with the specified blkg
+ * @bio: target bio
+ * @blkg: the blkg to associate
+ *
+ * Associate @bio with the blkg specified by @blkg.  This is the queue specific
+ * blkcg information associated with the @bio, a reference will be taken on the
+ * @blkg and will be freed when the bio is freed.
+ */
+int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+{
+	if (unlikely(bio->bi_blkg))
+		return -EBUSY;
+	blkg_get(blkg);
+	bio->bi_blkg = blkg;
+	return 0;
+}
+
 /**
  * bio_disassociate_task - undo bio_associate_current()
  * @bio: target bio
@@ -2050,6 +2069,10 @@ void bio_disassociate_task(struct bio *bio)
 		css_put(bio->bi_css);
 		bio->bi_css = NULL;
 	}
+	if (bio->bi_blkg) {
+		blkg_put(bio->bi_blkg);
+		bio->bi_blkg = NULL;
+	}
 }
 
 /**
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 63bb261811dd..caaabbe8a7a5 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2134,12 +2134,8 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
 static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
 {
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-	if (bio->bi_css) {
-		if (bio->bi_cg_private)
-			blkg_put(tg_to_blkg(bio->bi_cg_private));
-		bio->bi_cg_private = tg;
-		blkg_get(tg_to_blkg(tg));
-	}
+	if (bio->bi_css)
+		bio_associate_blkg(bio, tg_to_blkg(tg));
 	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
 #endif
 }
@@ -2287,6 +2283,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
 
 void blk_throtl_bio_endio(struct bio *bio)
 {
+	struct blkcg_gq *blkg;
 	struct throtl_grp *tg;
 	u64 finish_time_ns;
 	unsigned long finish_time;
@@ -2294,20 +2291,18 @@ void blk_throtl_bio_endio(struct bio *bio)
 	unsigned long lat;
 	int rw = bio_data_dir(bio);
 
-	tg = bio->bi_cg_private;
-	if (!tg)
+	blkg = bio->bi_blkg;
+	if (!blkg)
 		return;
-	bio->bi_cg_private = NULL;
+	tg = blkg_to_tg(blkg);
 
 	finish_time_ns = ktime_get_ns();
 	tg->last_finish_time = finish_time_ns >> 10;
 
 	start_time = bio_issue_time(&bio->bi_issue) >> 10;
 	finish_time = __bio_issue_time(finish_time_ns) >> 10;
-	if (!start_time || finish_time <= start_time) {
-		blkg_put(tg_to_blkg(tg));
+	if (!start_time || finish_time <= start_time)
 		return;
-	}
 
 	lat = finish_time - start_time;
 	/* this is only for bio based driver */
@@ -2336,8 +2331,6 @@ void blk_throtl_bio_endio(struct bio *bio)
 		tg->bio_cnt /= 2;
 		tg->bad_bio_cnt /= 2;
 	}
-
-	blkg_put(tg_to_blkg(tg));
 }
 #endif
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index f08f5fe7bd08..a279ba384da9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -555,6 +555,7 @@ do {						\
 
 #ifdef CONFIG_BLK_CGROUP
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
+int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg);
 void bio_disassociate_task(struct bio *bio);
 void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
 #else	/* CONFIG_BLK_CGROUP */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3c4f390aea4b..3364d42ebe08 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -179,8 +179,8 @@ struct bio {
 	 */
 	struct io_context	*bi_ioc;
 	struct cgroup_subsys_state *bi_css;
+	struct blkcg_gq		*bi_blkg;
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-	void			*bi_cg_private;
 	struct bio_issue	bi_issue;
 #endif
 #endif
-- 
cgit v1.2.3


From c7c98fd37653955d3a17dd4f1fa67aba070096a9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 3 Jul 2018 11:14:51 -0400
Subject: block: introduce bio_issue_as_root_blkg

Instead of forcing all file systems to get the right context on their
bio's, simply check for REQ_META to see if we need to issue as the root
blkg.  We don't want to force all bio's to have the root blkg associated
with them if REQ_META is set, as some controllers (blk-iolatency) need
to know who the originating cgroup is so it can backcharge them for the
work they are doing.  This helper will make sure that the controllers do
the proper thing wrt the IO priority and backcharging.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 6c666fd7de3c..69aa71dc0c04 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -238,6 +238,22 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 	return css_to_blkcg(task_css(current, io_cgrp_id));
 }
 
+/**
+ * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
+ * @return: true if this bio needs to be submitted with the root blkg context.
+ *
+ * In order to avoid priority inversions we sometimes need to issue a bio as if
+ * it were attached to the root blkg, and then backcharge to the actual owning
+ * blkg.  The idea is we do bio_blkcg() to look up the actual context for the
+ * bio and attach the appropriate blkg to the bio.  Then we call this helper and
+ * if it is true run with the root blkg for that queue and then do any
+ * backcharging to the originating cgroup once the io is complete.
+ */
+static inline bool bio_issue_as_root_blkg(struct bio *bio)
+{
+	return (bio->bi_opf & REQ_META);
+}
+
 /**
  * blkcg_parent - get the parent of a blkcg
  * @blkcg: blkcg of interest
-- 
cgit v1.2.3


From 903d23f0a354f226fa78f1c1c34b60aaf992e812 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 3 Jul 2018 11:14:52 -0400
Subject: blk-cgroup: allow controllers to output their own stats

blk-iolatency has a few stats that it would like to print out, and
instead of adding a bunch of crap to the generic code just provide a
helper so that controllers can add stuff to the stat line if they want
to.

Hide it behind a boot option since it changes the output of io.stat from
normal, and these stats are only interesting to developers.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c         | 47 +++++++++++++++++++++++++++++++++++++++++++---
 include/linux/blk-cgroup.h |  3 +++
 2 files changed, 47 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index eb85cb87c40f..7dc6f05cc44b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -50,6 +50,8 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
 static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
 
+static bool blkcg_debug_stats = false;
+
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
 {
@@ -954,13 +956,25 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 
 	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
 		const char *dname;
+		char *buf;
 		struct blkg_rwstat rwstat;
 		u64 rbytes, wbytes, rios, wios;
+		size_t size = seq_get_buf(sf, &buf), off = 0;
+		int i;
+		bool has_stats = false;
 
 		dname = blkg_dev_name(blkg);
 		if (!dname)
 			continue;
 
+		/*
+		 * Hooray string manipulation, count is the size written NOT
+		 * INCLUDING THE \0, so size is now count+1 less than what we
+		 * had before, but we want to start writing the next bit from
+		 * the \0 so we only add count to buf.
+		 */
+		off += scnprintf(buf+off, size-off, "%s ", dname);
+
 		spin_lock_irq(blkg->q->queue_lock);
 
 		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
@@ -975,9 +989,33 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 
 		spin_unlock_irq(blkg->q->queue_lock);
 
-		if (rbytes || wbytes || rios || wios)
-			seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
-				   dname, rbytes, wbytes, rios, wios);
+		if (rbytes || wbytes || rios || wios) {
+			has_stats = true;
+			off += scnprintf(buf+off, size-off,
+					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu",
+					 rbytes, wbytes, rios, wios);
+		}
+
+		if (!blkcg_debug_stats)
+			goto next;
+
+		for (i = 0; i < BLKCG_MAX_POLS; i++) {
+			struct blkcg_policy *pol = blkcg_policy[i];
+			size_t written;
+
+			if (!blkg->pd[i] || !pol->pd_stat_fn)
+				continue;
+
+			written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
+			if (written)
+				has_stats = true;
+			off += written;
+		}
+next:
+		if (has_stats) {
+			off += scnprintf(buf+off, size-off, "\n");
+			seq_commit(sf, off);
+		}
 	}
 
 	rcu_read_unlock();
@@ -1547,3 +1585,6 @@ out_unlock:
 	mutex_unlock(&blkcg_pol_register_mutex);
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
+
+module_param(blkcg_debug_stats, bool, 0644);
+MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 69aa71dc0c04..b41292726c0f 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -148,6 +148,8 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
+typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf,
+				      size_t size);
 
 struct blkcg_policy {
 	int				plid;
@@ -167,6 +169,7 @@ struct blkcg_policy {
 	blkcg_pol_offline_pd_fn		*pd_offline_fn;
 	blkcg_pol_free_pd_fn		*pd_free_fn;
 	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
+	blkcg_pol_stat_pd_fn		*pd_stat_fn;
 };
 
 extern struct blkcg blkcg_root;
-- 
cgit v1.2.3


From 0d1e0c7cd5909d6c6aa0957179318e13fcca971a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 3 Jul 2018 11:14:53 -0400
Subject: blk: introduce REQ_SWAP

Just like REQ_META, it's important to know the IO coming down is swap
in order to guard against potential IO priority inversion issues with
cgroups.  Add REQ_SWAP and use it for all swap IO, and add it to our
bio_issue_as_root_blkg helper.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 2 +-
 include/linux/blk_types.h  | 3 ++-
 mm/page_io.c               | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index b41292726c0f..a8f9ba8f33a4 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -254,7 +254,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
  */
 static inline bool bio_issue_as_root_blkg(struct bio *bio)
 {
-	return (bio->bi_opf & REQ_META);
+	return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
 }
 
 /**
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3364d42ebe08..0ffc34c5cc83 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -329,7 +329,7 @@ enum req_flag_bits {
 
 	/* for driver use */
 	__REQ_DRV,
-
+	__REQ_SWAP,		/* swapping request. */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -351,6 +351,7 @@ enum req_flag_bits {
 #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
 
 #define REQ_DRV			(1ULL << __REQ_DRV)
+#define REQ_SWAP		(1ULL << __REQ_SWAP)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
diff --git a/mm/page_io.c b/mm/page_io.c
index b41cf9644585..a552cb37e220 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -338,7 +338,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		ret = -ENOMEM;
 		goto out;
 	}
-	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+	bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
 	count_swpout_vm_event(page);
 	set_page_writeback(page);
 	unlock_page(page);
-- 
cgit v1.2.3


From 0d3bd88d54f513723602b361dccfc71639f50779 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 3 Jul 2018 11:14:54 -0400
Subject: swap,blkcg: issue swap io with the appropriate context

For backcharging we need to know who the page belongs to when swapping
it out.  We don't worry about things that do ->rw_page (zram etc) at the
moment, we're only worried about pages that actually go to a block
device.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 24 ++++++++++++++++++++++++
 include/linux/bio.h |  7 +++++++
 mm/page_io.c        |  1 +
 3 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 044571538574..5f84f5c3887b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2015,6 +2015,30 @@ EXPORT_SYMBOL(bioset_init_from_src);
 
 #ifdef CONFIG_BLK_CGROUP
 
+#ifdef CONFIG_MEMCG
+/**
+ * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
+ * @bio: target bio
+ * @page: the page to lookup the blkcg from
+ *
+ * Associate @bio with the blkcg from @page's owning memcg.  This works like
+ * every other associate function wrt references.
+ */
+int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
+{
+	struct cgroup_subsys_state *blkcg_css;
+
+	if (unlikely(bio->bi_css))
+		return -EBUSY;
+	if (!page->mem_cgroup)
+		return 0;
+	blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
+				     &io_cgrp_subsys);
+	bio->bi_css = blkcg_css;
+	return 0;
+}
+#endif /* CONFIG_MEMCG */
+
 /**
  * bio_associate_blkcg - associate a bio with the specified blkcg
  * @bio: target bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index a279ba384da9..a00dfff51aa5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -553,6 +553,13 @@ do {						\
 #define bio_dev(bio) \
 	disk_devt((bio)->bi_disk)
 
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+int bio_associate_blkcg_from_page(struct bio *bio, struct page *page);
+#else
+static inline int bio_associate_blkcg_from_page(struct bio *bio,
+						struct page *page) {  return 0; }
+#endif
+
 #ifdef CONFIG_BLK_CGROUP
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
 int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg);
diff --git a/mm/page_io.c b/mm/page_io.c
index a552cb37e220..aafd19ec1db4 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -339,6 +339,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		goto out;
 	}
 	bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
+	bio_associate_blkcg_from_page(bio, page);
 	count_swpout_vm_event(page);
 	set_page_writeback(page);
 	unlock_page(page);
-- 
cgit v1.2.3


From d09d8df3a29403693d9d20cc34ed101f2c558e2b Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 3 Jul 2018 11:14:55 -0400
Subject: blkcg: add generic throttling mechanism

Since IO can be issued from literally anywhere it's almost impossible to
do throttling without having some sort of adverse effect somewhere else
in the system because of locking or other dependencies.  The best way to
solve this is to do the throttling when we know we aren't holding any
other kernel resources.  Do this by tracking throttling in a per-blkg
basis, and if we require throttling flag the task that it needs to check
before it returns to user space and possibly sleep there.

This is to address the case where a process is doing work that is
generating IO that can't be throttled, whether that is directly with a
lot of REQ_META IO, or indirectly by allocating so much memory that it
is swamping the disk with REQ_SWAP.  We can't use task_add_work as we
don't want to induce a memory allocation in the IO path, so simply
saving the request queue in the task and flagging it to do the
notify_resume thing achieves the same result without the overhead of a
memory allocation.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c          | 220 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-cgroup.h  |  99 ++++++++++++++++++++
 include/linux/cgroup-defs.h |   3 +
 include/linux/sched.h       |   8 ++
 include/linux/tracehook.h   |   2 +
 5 files changed, 332 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7dc6f05cc44b..d3310ec96c2a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -27,6 +27,7 @@
 #include <linux/atomic.h>
 #include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
+#include <linux/tracehook.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
@@ -999,6 +1000,14 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 		if (!blkcg_debug_stats)
 			goto next;
 
+		if (atomic_read(&blkg->use_delay)) {
+			has_stats = true;
+			off += scnprintf(buf+off, size-off,
+					 " use_delay=%d delay_nsec=%llu",
+					 atomic_read(&blkg->use_delay),
+					(unsigned long long)atomic64_read(&blkg->delay_nsec));
+		}
+
 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
 			struct blkcg_policy *pol = blkcg_policy[i];
 			size_t written;
@@ -1326,6 +1335,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
 	mutex_unlock(&blkcg_pol_mutex);
 }
 
+static void blkcg_exit(struct task_struct *tsk)
+{
+	if (tsk->throttle_queue)
+		blk_put_queue(tsk->throttle_queue);
+	tsk->throttle_queue = NULL;
+}
+
 struct cgroup_subsys io_cgrp_subsys = {
 	.css_alloc = blkcg_css_alloc,
 	.css_offline = blkcg_css_offline,
@@ -1335,6 +1351,7 @@ struct cgroup_subsys io_cgrp_subsys = {
 	.dfl_cftypes = blkcg_files,
 	.legacy_cftypes = blkcg_legacy_files,
 	.legacy_name = "blkio",
+	.exit = blkcg_exit,
 #ifdef CONFIG_MEMCG
 	/*
 	 * This ensures that, if available, memcg is automatically enabled
@@ -1586,5 +1603,208 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+/*
+ * Scale the accumulated delay based on how long it has been since we updated
+ * the delay.  We only call this when we are adding delay, in case it's been a
+ * while since we added delay, and when we are checking to see if we need to
+ * delay a task, to account for any delays that may have occurred.
+ */
+static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
+{
+	u64 old = atomic64_read(&blkg->delay_start);
+
+	/*
+	 * We only want to scale down every second.  The idea here is that we
+	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
+	 * time window.  We only want to throttle tasks for recent delay that
+	 * has occurred, in 1 second time windows since that's the maximum
+	 * things can be throttled.  We save the current delay window in
+	 * blkg->last_delay so we know what amount is still left to be charged
+	 * to the blkg from this point onward.  blkg->last_use keeps track of
+	 * the use_delay counter.  The idea is if we're unthrottling the blkg we
+	 * are ok with whatever is happening now, and we can take away more of
+	 * the accumulated delay as we've already throttled enough that
+	 * everybody is happy with their IO latencies.
+	 */
+	if (time_before64(old + NSEC_PER_SEC, now) &&
+	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+		u64 cur = atomic64_read(&blkg->delay_nsec);
+		u64 sub = min_t(u64, blkg->last_delay, now - old);
+		int cur_use = atomic_read(&blkg->use_delay);
+
+		/*
+		 * We've been unthrottled, subtract a larger chunk of our
+		 * accumulated delay.
+		 */
+		if (cur_use < blkg->last_use)
+			sub = max_t(u64, sub, blkg->last_delay >> 1);
+
+		/*
+		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
+		 * should only ever be growing except here where we subtract out
+		 * min(last_delay, 1 second), but lord knows bugs happen and I'd
+		 * rather not end up with negative numbers.
+		 */
+		if (unlikely(cur < sub)) {
+			atomic64_set(&blkg->delay_nsec, 0);
+			blkg->last_delay = 0;
+		} else {
+			atomic64_sub(sub, &blkg->delay_nsec);
+			blkg->last_delay = cur - sub;
+		}
+		blkg->last_use = cur_use;
+	}
+}
+
+/*
+ * This is called when we want to actually walk up the hierarchy and check to
+ * see if we need to throttle, and then actually throttle if there is some
+ * accumulated delay.  This should only be called upon return to user space so
+ * we're not holding some lock that would induce a priority inversion.
+ */
+static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
+{
+	u64 now = ktime_to_ns(ktime_get());
+	u64 exp;
+	u64 delay_nsec = 0;
+	int tok;
+
+	while (blkg->parent) {
+		if (atomic_read(&blkg->use_delay)) {
+			blkcg_scale_delay(blkg, now);
+			delay_nsec = max_t(u64, delay_nsec,
+					   atomic64_read(&blkg->delay_nsec));
+		}
+		blkg = blkg->parent;
+	}
+
+	if (!delay_nsec)
+		return;
+
+	/*
+	 * Let's not sleep for all eternity if we've amassed a huge delay.
+	 * Swapping or metadata IO can accumulate 10's of seconds worth of
+	 * delay, and we want userspace to be able to do _something_ so cap the
+	 * delays at 1 second.  If there's 10's of seconds worth of delay then
+	 * the tasks will be delayed for 1 second for every syscall.
+	 */
+	delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+
+	/*
+	 * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
+	 * that hasn't landed upstream yet.  Once that stuff is in place we need
+	 * to do a psi_memstall_enter/leave if memdelay is set.
+	 */
+
+	exp = ktime_add_ns(now, delay_nsec);
+	tok = io_schedule_prepare();
+	do {
+		__set_current_state(TASK_KILLABLE);
+		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
+			break;
+	} while (!fatal_signal_pending(current));
+	io_schedule_finish(tok);
+}
+
+/**
+ * blkcg_maybe_throttle_current - throttle the current task if it has been marked
+ *
+ * This is only called if we've been marked with set_notify_resume().  Obviously
+ * we can be set_notify_resume() for reasons other than blkcg throttling, so we
+ * check to see if current->throttle_queue is set and if not this doesn't do
+ * anything.  This should only ever be called by the resume code, it's not meant
+ * to be called by people willy-nilly as it will actually do the work to
+ * throttle the task if it is setup for throttling.
+ */
+void blkcg_maybe_throttle_current(void)
+{
+	struct request_queue *q = current->throttle_queue;
+	struct cgroup_subsys_state *css;
+	struct blkcg *blkcg;
+	struct blkcg_gq *blkg;
+	bool use_memdelay = current->use_memdelay;
+
+	if (!q)
+		return;
+
+	current->throttle_queue = NULL;
+	current->use_memdelay = false;
+
+	rcu_read_lock();
+	css = kthread_blkcg();
+	if (css)
+		blkcg = css_to_blkcg(css);
+	else
+		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+
+	if (!blkcg)
+		goto out;
+	blkg = blkg_lookup(blkcg, q);
+	if (!blkg)
+		goto out;
+	blkg = blkg_try_get(blkg);
+	if (!blkg)
+		goto out;
+	rcu_read_unlock();
+	blk_put_queue(q);
+
+	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
+	blkg_put(blkg);
+	return;
+out:
+	rcu_read_unlock();
+	blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
+
+/**
+ * blkcg_schedule_throttle - this task needs to check for throttling
+ * @q - the request queue IO was submitted on
+ * @use_memdelay - do we charge this to memory delay for PSI
+ *
+ * This is called by the IO controller when we know there's delay accumulated
+ * for the blkg for this task.  We do not pass the blkg because there are places
+ * we call this that may not have that information, the swapping code for
+ * instance will only have a request_queue at that point.  This set's the
+ * notify_resume for the task to check and see if it requires throttling before
+ * returning to user space.
+ *
+ * We will only schedule once per syscall.  You can call this over and over
+ * again and it will only do the check once upon return to user space, and only
+ * throttle once.  If the task needs to be throttled again it'll need to be
+ * re-set at the next time we see the task.
+ */
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
+{
+	if (unlikely(current->flags & PF_KTHREAD))
+		return;
+
+	if (!blk_get_queue(q))
+		return;
+
+	if (current->throttle_queue)
+		blk_put_queue(current->throttle_queue);
+	current->throttle_queue = q;
+	if (use_memdelay)
+		current->use_memdelay = use_memdelay;
+	set_notify_resume(current);
+}
+EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
+
+/**
+ * blkcg_add_delay - add delay to this blkg
+ * @now - the current time in nanoseconds
+ * @delta - how many nanoseconds of delay to add
+ *
+ * Charge @delta to the blkg's current delay accumulation.  This is used to
+ * throttle tasks if an IO controller thinks we need more throttling.
+ */
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
+{
+	blkcg_scale_delay(blkg, now);
+	atomic64_add(delta, &blkg->delay_nsec);
+}
+EXPORT_SYMBOL_GPL(blkcg_add_delay);
+
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index a8f9ba8f33a4..de57de4831d5 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -136,6 +136,12 @@ struct blkcg_gq {
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
 	struct rcu_head			rcu_head;
+
+	atomic_t			use_delay;
+	atomic64_t			delay_nsec;
+	atomic64_t			delay_start;
+	u64				last_delay;
+	int				last_use;
 };
 
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -241,6 +247,26 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 	return css_to_blkcg(task_css(current, io_cgrp_id));
 }
 
+static inline bool blk_cgroup_congested(void)
+{
+	struct cgroup_subsys_state *css;
+	bool ret = false;
+
+	rcu_read_lock();
+	css = kthread_blkcg();
+	if (!css)
+		css = task_css(current, io_cgrp_id);
+	while (css) {
+		if (atomic_read(&css->cgroup->congestion_count)) {
+			ret = true;
+			break;
+		}
+		css = css->parent;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
 /**
  * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
  * @return: true if this bio needs to be submitted with the root blkg context.
@@ -374,6 +400,21 @@ static inline void blkg_get(struct blkcg_gq *blkg)
 	atomic_inc(&blkg->refcnt);
 }
 
+/**
+ * blkg_try_get - try and get a blkg reference
+ * @blkg: blkg to get
+ *
+ * This is for use when doing an RCU lookup of the blkg.  We may be in the midst
+ * of freeing this blkg, so we can only use it if the refcnt is not zero.
+ */
+static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
+{
+	if (atomic_inc_not_zero(&blkg->refcnt))
+		return blkg;
+	return NULL;
+}
+
+
 void __blkg_release_rcu(struct rcu_head *rcu);
 
 /**
@@ -734,6 +775,59 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	return !throtl;
 }
 
+static inline void blkcg_use_delay(struct blkcg_gq *blkg)
+{
+	if (atomic_add_return(1, &blkg->use_delay) == 1)
+		atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
+{
+	int old = atomic_read(&blkg->use_delay);
+
+	if (old == 0)
+		return 0;
+
+	/*
+	 * We do this song and dance because we can race with somebody else
+	 * adding or removing delay.  If we just did an atomic_dec we'd end up
+	 * negative and we'd already be in trouble.  We need to subtract 1 and
+	 * then check to see if we were the last delay so we can drop the
+	 * congestion count on the cgroup.
+	 */
+	while (old) {
+		int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
+		if (cur == old)
+			break;
+		old = cur;
+	}
+
+	if (old == 0)
+		return 0;
+	if (old == 1)
+		atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+	return 1;
+}
+
+static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
+{
+	int old = atomic_read(&blkg->use_delay);
+	if (!old)
+		return;
+	/* We only want 1 person clearing the congestion count for this blkg. */
+	while (old) {
+		int cur = atomic_cmpxchg(&blkg->use_delay, old, 0);
+		if (cur == old) {
+			atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+			break;
+		}
+		old = cur;
+	}
+}
+
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
+void blkcg_maybe_throttle_current(void);
 #else	/* CONFIG_BLK_CGROUP */
 
 struct blkcg {
@@ -753,8 +847,13 @@ struct blkcg_policy {
 
 #define blkcg_root_css	((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
 
+static inline void blkcg_maybe_throttle_current(void) { }
+static inline bool blk_cgroup_congested(void) { return false; }
+
 #ifdef CONFIG_BLOCK
 
+static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }
+
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c0e68f903011..ff20b677fb9f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -438,6 +438,9 @@ struct cgroup {
 	/* used to store eBPF programs */
 	struct cgroup_bpf bpf;
 
+	/* If there is block congestion on this cgroup. */
+	atomic_t congestion_count;
+
 	/* ids of the ancestors at each level including self */
 	int ancestor_ids[];
 };
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 43731fe51c97..c2e993de67ec 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -734,6 +734,10 @@ struct task_struct {
 	/* disallow userland-initiated cgroup migration */
 	unsigned			no_cgroup_migration:1;
 #endif
+#ifdef CONFIG_BLK_CGROUP
+	/* to be used once the psi infrastructure lands upstream. */
+	unsigned			use_memdelay:1;
+#endif
 
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 
@@ -1151,6 +1155,10 @@ struct task_struct {
 	unsigned int			memcg_nr_pages_over_high;
 #endif
 
+#ifdef CONFIG_BLK_CGROUP
+	struct request_queue		*throttle_queue;
+#endif
+
 #ifdef CONFIG_UPROBES
 	struct uprobe_task		*utask;
 #endif
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 4a8841963c2e..05589a3e37f4 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -51,6 +51,7 @@
 #include <linux/security.h>
 #include <linux/task_work.h>
 #include <linux/memcontrol.h>
+#include <linux/blk-cgroup.h>
 struct linux_binprm;
 
 /*
@@ -192,6 +193,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
 		task_work_run();
 
 	mem_cgroup_handle_over_high();
+	blkcg_maybe_throttle_current();
 }
 
 #endif	/* <linux/tracehook.h> */
-- 
cgit v1.2.3


From 2cf855837b89d92996cf264713f3bed2bf9b0b4f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 3 Jul 2018 11:14:56 -0400
Subject: memcontrol: schedule throttling if we are congested

Memory allocations can induce swapping via kswapd or direct reclaim.  If
we are having IO done for us by kswapd and don't actually go into direct
reclaim we may never get scheduled for throttling.  So instead check to
see if our cgroup is congested, and if so schedule the throttling.
Before we return to user space the throttling stuff will only throttle
if we actually required it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/memcontrol.h | 13 +++++++++++++
 include/linux/swap.h       | 11 ++++++++++-
 mm/huge_memory.c           |  6 +++---
 mm/memcontrol.c            | 13 +++++++++++++
 mm/memory.c                | 11 ++++++-----
 mm/shmem.c                 | 10 +++++-----
 mm/swapfile.c              | 31 +++++++++++++++++++++++++++++++
 7 files changed, 81 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c6fb116e925..680d3395fc83 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -317,6 +317,9 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
 			  bool compound);
+int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
+			  bool compound);
 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
 			      bool lrucare, bool compound);
 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
@@ -789,6 +792,16 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mem_cgroup_try_charge_delay(struct page *page,
+					      struct mm_struct *mm,
+					      gfp_t gfp_mask,
+					      struct mem_cgroup **memcgp,
+					      bool compound)
+{
+	*memcgp = NULL;
+	return 0;
+}
+
 static inline void mem_cgroup_commit_charge(struct page *page,
 					    struct mem_cgroup *memcg,
 					    bool lrucare, bool compound)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index c063443d8638..1a8bd05a335e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -629,7 +629,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 
 	return memcg->swappiness;
 }
-
 #else
 static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 {
@@ -637,6 +636,16 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 }
 #endif
 
+#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+					 gfp_t gfp_mask);
+#else
+static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg,
+						int node, gfp_t gfp_mask)
+{
+}
+#endif
+
 #ifdef CONFIG_MEMCG_SWAP
 extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
 extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1cd7c1a57a14..b87d5b151db2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -552,7 +552,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1142,7 +1142,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
 		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
 					       vmf->address, page_to_nid(page));
 		if (unlikely(!pages[i] ||
-			     mem_cgroup_try_charge(pages[i], vma->vm_mm,
+			     mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
 				     GFP_KERNEL, &memcg, false))) {
 			if (pages[i])
 				put_page(pages[i]);
@@ -1312,7 +1312,7 @@ alloc:
 		goto out;
 	}
 
-	if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
+	if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
 					huge_gfp, &memcg, true))) {
 		put_page(new_page);
 		split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6f0d5ef320a..64bd28d35388 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5593,6 +5593,19 @@ out:
 	return ret;
 }
 
+int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
+			  bool compound)
+{
+	struct mem_cgroup *memcg;
+	int ret;
+
+	ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
+	memcg = *memcgp;
+	mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
+	return ret;
+}
+
 /**
  * mem_cgroup_commit_charge - commit a page charge
  * @page: page to charge
diff --git a/mm/memory.c b/mm/memory.c
index 7206a634270b..dfe80c574282 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2503,7 +2503,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 		cow_user_page(new_page, old_page, vmf->address, vma);
 	}
 
-	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
+	if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
 		goto oom_free_new;
 
 	__SetPageUptodate(new_page);
@@ -3003,8 +3003,8 @@ int do_swap_page(struct vm_fault *vmf)
 		goto out_page;
 	}
 
-	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
-				&memcg, false)) {
+	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
+					&memcg, false)) {
 		ret = VM_FAULT_OOM;
 		goto out_page;
 	}
@@ -3165,7 +3165,8 @@ static int do_anonymous_page(struct vm_fault *vmf)
 	if (!page)
 		goto oom;
 
-	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
+					false))
 		goto oom_free_page;
 
 	/*
@@ -3661,7 +3662,7 @@ static int do_cow_fault(struct vm_fault *vmf)
 	if (!vmf->cow_page)
 		return VM_FAULT_OOM;
 
-	if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+	if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
 				&vmf->memcg, false)) {
 		put_page(vmf->cow_page);
 		return VM_FAULT_OOM;
diff --git a/mm/shmem.c b/mm/shmem.c
index 2cab84403055..6206ca3510cf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1239,8 +1239,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
 	 * Charged back to the user (not to caller) when swap account is used.
 	 */
-	error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
-			false);
+	error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
+					    &memcg, false);
 	if (error)
 		goto out;
 	/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -1712,7 +1712,7 @@ repeat:
 				goto failed;
 		}
 
-		error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+		error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
 				false);
 		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
@@ -1818,7 +1818,7 @@ alloc_nohuge:		page = shmem_alloc_and_acct_page(gfp, inode,
 		if (sgp == SGP_WRITE)
 			__SetPageReferenced(page);
 
-		error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+		error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
 				PageTransHuge(page));
 		if (error)
 			goto unacct;
@@ -2291,7 +2291,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 	__SetPageSwapBacked(page);
 	__SetPageUptodate(page);
 
-	ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
+	ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
 	if (ret)
 		goto out_release;
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2cc2972eedaf..db4ec8ae1c8c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3731,6 +3731,37 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
 	}
 }
 
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+				  gfp_t gfp_mask)
+{
+	struct swap_info_struct *si, *next;
+	if (!(gfp_mask & __GFP_IO) || !memcg)
+		return;
+
+	if (!blk_cgroup_congested())
+		return;
+
+	/*
+	 * We've already scheduled a throttle, avoid taking the global swap
+	 * lock.
+	 */
+	if (current->throttle_queue)
+		return;
+
+	spin_lock(&swap_avail_lock);
+	plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
+				  avail_lists[node]) {
+		if (si->bdev) {
+			blkcg_schedule_throttle(bdev_get_queue(si->bdev),
+						true);
+			break;
+		}
+	}
+	spin_unlock(&swap_avail_lock);
+}
+#endif
+
 static int __init swapfile_init(void)
 {
 	int nid;
-- 
cgit v1.2.3


From a79050434b45959f397042080fd1d70ffa9bd9df Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 3 Jul 2018 09:32:35 -0600
Subject: blk-rq-qos: refactor out common elements of blk-wbt

blkcg-qos is going to do essentially what wbt does, only on a cgroup
basis.  Break out the common code that will be shared between blkcg-qos
and wbt into blk-rq-qos.* so they can both utilize the same
infrastructure.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/Makefile         |   2 +-
 block/blk-core.c       |  12 +-
 block/blk-mq.c         |  12 +-
 block/blk-rq-qos.c     | 178 +++++++++++++++++++++++++++
 block/blk-rq-qos.h     | 106 ++++++++++++++++
 block/blk-settings.c   |   4 +-
 block/blk-sysfs.c      |  22 ++--
 block/blk-wbt.c        | 326 ++++++++++++++++++++++---------------------------
 block/blk-wbt.h        |  63 ++++------
 include/linux/blkdev.h |   4 +-
 10 files changed, 478 insertions(+), 251 deletions(-)
 create mode 100644 block/blk-rq-qos.c
 create mode 100644 block/blk-rq-qos.h

(limited to 'include/linux')

diff --git a/block/Makefile b/block/Makefile
index a8f94cdb75c3..57d0f47ab05f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o partition-generic.o ioprio.o \
-			badblocks.o partitions/
+			badblocks.o partitions/ blk-rq-qos.o
 
 obj-$(CONFIG_BOUNCE)		+= bounce.o
 obj-$(CONFIG_BLK_SCSI_REQUEST)	+= scsi_ioctl.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 2ff8e131a892..b33a73bcf2d0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1645,7 +1645,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
 	trace_block_rq_requeue(q, rq);
-	wbt_requeue(q->rq_wb, rq);
+	rq_qos_requeue(q, rq);
 
 	if (rq->rq_flags & RQF_QUEUED)
 		blk_queue_end_tag(q, rq);
@@ -1752,7 +1752,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 	/* this is a bio leak */
 	WARN_ON(req->bio != NULL);
 
-	wbt_done(q->rq_wb, req);
+	rq_qos_done(q, req);
 
 	/*
 	 * Request may not have originated from ll_rw_blk. if not,
@@ -2044,7 +2044,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 	}
 
 get_rq:
-	wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
+	wb_acct = rq_qos_throttle(q, bio, q->queue_lock);
 
 	/*
 	 * Grab a free request. This is might sleep but can not fail.
@@ -2054,7 +2054,7 @@ get_rq:
 	req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
 	if (IS_ERR(req)) {
 		blk_queue_exit(q);
-		__wbt_done(q->rq_wb, wb_acct);
+		rq_qos_cleanup(q, wb_acct);
 		if (PTR_ERR(req) == -ENOMEM)
 			bio->bi_status = BLK_STS_RESOURCE;
 		else
@@ -2983,7 +2983,7 @@ void blk_start_request(struct request *req)
 		req->throtl_size = blk_rq_sectors(req);
 #endif
 		req->rq_flags |= RQF_STATS;
-		wbt_issue(req->q->rq_wb, req);
+		rq_qos_issue(req->q, req);
 	}
 
 	BUG_ON(blk_rq_is_complete(req));
@@ -3207,7 +3207,7 @@ void blk_finish_request(struct request *req, blk_status_t error)
 	blk_account_io_done(req, now);
 
 	if (req->end_io) {
-		wbt_done(req->q->rq_wb, req);
+		rq_qos_done(q, req);
 		req->end_io(req, error);
 	} else {
 		if (blk_bidi_rq(req))
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 850fdd02c385..ea2a226457fa 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -504,7 +504,7 @@ void blk_mq_free_request(struct request *rq)
 	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
 		laptop_io_completion(q->backing_dev_info);
 
-	wbt_done(q->rq_wb, rq);
+	rq_qos_done(q, rq);
 
 	if (blk_rq_rl(rq))
 		blk_put_rl(blk_rq_rl(rq));
@@ -527,7 +527,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 	blk_account_io_done(rq, now);
 
 	if (rq->end_io) {
-		wbt_done(rq->q->rq_wb, rq);
+		rq_qos_done(rq->q, rq);
 		rq->end_io(rq, error);
 	} else {
 		if (unlikely(blk_bidi_rq(rq)))
@@ -641,7 +641,7 @@ void blk_mq_start_request(struct request *rq)
 		rq->throtl_size = blk_rq_sectors(rq);
 #endif
 		rq->rq_flags |= RQF_STATS;
-		wbt_issue(q->rq_wb, rq);
+		rq_qos_issue(q, rq);
 	}
 
 	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
@@ -667,7 +667,7 @@ static void __blk_mq_requeue_request(struct request *rq)
 	blk_mq_put_driver_tag(rq);
 
 	trace_block_rq_requeue(q, rq);
-	wbt_requeue(q->rq_wb, rq);
+	rq_qos_requeue(q, rq);
 
 	if (blk_mq_request_started(rq)) {
 		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
@@ -1806,13 +1806,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	if (blk_mq_sched_bio_merge(q, bio))
 		return BLK_QC_T_NONE;
 
-	wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+	wb_acct = rq_qos_throttle(q, bio, NULL);
 
 	trace_block_getrq(q, bio, bio->bi_opf);
 
 	rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
 	if (unlikely(!rq)) {
-		__wbt_done(q->rq_wb, wb_acct);
+		rq_qos_cleanup(q, wb_acct);
 		if (bio->bi_opf & REQ_NOWAIT)
 			bio_wouldblock_error(bio);
 		return BLK_QC_T_NONE;
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
new file mode 100644
index 000000000000..d2f2af8aa10c
--- /dev/null
+++ b/block/blk-rq-qos.c
@@ -0,0 +1,178 @@
+#include "blk-rq-qos.h"
+
+#include "blk-wbt.h"
+
+/*
+ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
+ * false if 'v' + 1 would be bigger than 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, int below)
+{
+	int cur = atomic_read(v);
+
+	for (;;) {
+		int old;
+
+		if (cur >= below)
+			return false;
+		old = atomic_cmpxchg(v, cur, cur + 1);
+		if (old == cur)
+			break;
+		cur = old;
+	}
+
+	return true;
+}
+
+bool rq_wait_inc_below(struct rq_wait *rq_wait, int limit)
+{
+	return atomic_inc_below(&rq_wait->inflight, limit);
+}
+
+void rq_qos_cleanup(struct request_queue *q, enum wbt_flags wb_acct)
+{
+	struct rq_qos *rqos;
+
+	for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+		if (rqos->ops->cleanup)
+			rqos->ops->cleanup(rqos, wb_acct);
+	}
+}
+
+void rq_qos_done(struct request_queue *q, struct request *rq)
+{
+	struct rq_qos *rqos;
+
+	for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+		if (rqos->ops->done)
+			rqos->ops->done(rqos, rq);
+	}
+}
+
+void rq_qos_issue(struct request_queue *q, struct request *rq)
+{
+	struct rq_qos *rqos;
+
+	for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+		if (rqos->ops->issue)
+			rqos->ops->issue(rqos, rq);
+	}
+}
+
+void rq_qos_requeue(struct request_queue *q, struct request *rq)
+{
+	struct rq_qos *rqos;
+
+	for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+		if (rqos->ops->requeue)
+			rqos->ops->requeue(rqos, rq);
+	}
+}
+
+enum wbt_flags rq_qos_throttle(struct request_queue *q, struct bio *bio,
+			       spinlock_t *lock)
+{
+	struct rq_qos *rqos;
+	enum wbt_flags flags = 0;
+
+	for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+		if (rqos->ops->throttle)
+			flags |= rqos->ops->throttle(rqos, bio, lock);
+	}
+	return flags;
+}
+
+/*
+ * Return true, if we can't increase the depth further by scaling
+ */
+bool rq_depth_calc_max_depth(struct rq_depth *rqd)
+{
+	unsigned int depth;
+	bool ret = false;
+
+	/*
+	 * For QD=1 devices, this is a special case. It's important for those
+	 * to have one request ready when one completes, so force a depth of
+	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
+	 * since the device can't have more than that in flight. If we're
+	 * scaling down, then keep a setting of 1/1/1.
+	 */
+	if (rqd->queue_depth == 1) {
+		if (rqd->scale_step > 0)
+			rqd->max_depth = 1;
+		else {
+			rqd->max_depth = 2;
+			ret = true;
+		}
+	} else {
+		/*
+		 * scale_step == 0 is our default state. If we have suffered
+		 * latency spikes, step will be > 0, and we shrink the
+		 * allowed write depths. If step is < 0, we're only doing
+		 * writes, and we allow a temporarily higher depth to
+		 * increase performance.
+		 */
+		depth = min_t(unsigned int, rqd->default_depth,
+			      rqd->queue_depth);
+		if (rqd->scale_step > 0)
+			depth = 1 + ((depth - 1) >> min(31, rqd->scale_step));
+		else if (rqd->scale_step < 0) {
+			unsigned int maxd = 3 * rqd->queue_depth / 4;
+
+			depth = 1 + ((depth - 1) << -rqd->scale_step);
+			if (depth > maxd) {
+				depth = maxd;
+				ret = true;
+			}
+		}
+
+		rqd->max_depth = depth;
+	}
+
+	return ret;
+}
+
+void rq_depth_scale_up(struct rq_depth *rqd)
+{
+	/*
+	 * Hit max in previous round, stop here
+	 */
+	if (rqd->scaled_max)
+		return;
+
+	rqd->scale_step--;
+
+	rqd->scaled_max = rq_depth_calc_max_depth(rqd);
+}
+
+/*
+ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
+ * had a latency violation.
+ */
+void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
+{
+	/*
+	 * Stop scaling down when we've hit the limit. This also prevents
+	 * ->scale_step from going to crazy values, if the device can't
+	 * keep up.
+	 */
+	if (rqd->max_depth == 1)
+		return;
+
+	if (rqd->scale_step < 0 && hard_throttle)
+		rqd->scale_step = 0;
+	else
+		rqd->scale_step++;
+
+	rqd->scaled_max = false;
+	rq_depth_calc_max_depth(rqd);
+}
+
+void rq_qos_exit(struct request_queue *q)
+{
+	while (q->rq_qos) {
+		struct rq_qos *rqos = q->rq_qos;
+		q->rq_qos = rqos->next;
+		rqos->ops->exit(rqos);
+	}
+}
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
new file mode 100644
index 000000000000..f9a39bd6ece3
--- /dev/null
+++ b/block/blk-rq-qos.h
@@ -0,0 +1,106 @@
+#ifndef RQ_QOS_H
+#define RQ_QOS_H
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blk_types.h>
+#include <linux/atomic.h>
+#include <linux/wait.h>
+
+enum rq_qos_id {
+	RQ_QOS_WBT,
+	RQ_QOS_CGROUP,
+};
+
+struct rq_wait {
+	wait_queue_head_t wait;
+	atomic_t inflight;
+};
+
+struct rq_qos {
+	struct rq_qos_ops *ops;
+	struct request_queue *q;
+	enum rq_qos_id id;
+	struct rq_qos *next;
+};
+
+struct rq_qos_ops {
+	enum wbt_flags (*throttle)(struct rq_qos *, struct bio *,
+				   spinlock_t *);
+	void (*issue)(struct rq_qos *, struct request *);
+	void (*requeue)(struct rq_qos *, struct request *);
+	void (*done)(struct rq_qos *, struct request *);
+	void (*cleanup)(struct rq_qos *, enum wbt_flags);
+	void (*exit)(struct rq_qos *);
+};
+
+struct rq_depth {
+	unsigned int max_depth;
+
+	int scale_step;
+	bool scaled_max;
+
+	unsigned int queue_depth;
+	unsigned int default_depth;
+};
+
+static inline struct rq_qos *rq_qos_id(struct request_queue *q,
+				       enum rq_qos_id id)
+{
+	struct rq_qos *rqos;
+	for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+		if (rqos->id == id)
+			break;
+	}
+	return rqos;
+}
+
+static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
+{
+	return rq_qos_id(q, RQ_QOS_WBT);
+}
+
+static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
+{
+	return rq_qos_id(q, RQ_QOS_CGROUP);
+}
+
+static inline void rq_wait_init(struct rq_wait *rq_wait)
+{
+	atomic_set(&rq_wait->inflight, 0);
+	init_waitqueue_head(&rq_wait->wait);
+}
+
+static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
+{
+	rqos->next = q->rq_qos;
+	q->rq_qos = rqos;
+}
+
+static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
+{
+	struct rq_qos *cur, *prev = NULL;
+	for (cur = q->rq_qos; cur; cur = cur->next) {
+		if (cur == rqos) {
+			if (prev)
+				prev->next = rqos->next;
+			else
+				q->rq_qos = cur;
+			break;
+		}
+		prev = cur;
+	}
+}
+
+bool rq_wait_inc_below(struct rq_wait *rq_wait, int limit);
+void rq_depth_scale_up(struct rq_depth *rqd);
+void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
+bool rq_depth_calc_max_depth(struct rq_depth *rqd);
+
+void rq_qos_cleanup(struct request_queue *, enum wbt_flags);
+void rq_qos_done(struct request_queue *, struct request *);
+void rq_qos_issue(struct request_queue *, struct request *);
+void rq_qos_requeue(struct request_queue *, struct request *);
+enum wbt_flags rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *);
+void rq_qos_exit(struct request_queue *);
+#endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d1de71124656..053de87d1fda 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -875,7 +875,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
 void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
 {
 	q->queue_depth = depth;
-	wbt_set_queue_depth(q->rq_wb, depth);
+	wbt_set_queue_depth(q, depth);
 }
 EXPORT_SYMBOL(blk_set_queue_depth);
 
@@ -900,7 +900,7 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
 		queue_flag_clear(QUEUE_FLAG_FUA, q);
 	spin_unlock_irq(q->queue_lock);
 
-	wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+	wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
 }
 EXPORT_SYMBOL_GPL(blk_queue_write_cache);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 94987b1f69e1..49c29a5d06bb 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -422,16 +422,16 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
 
 static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
 {
-	if (!q->rq_wb)
+	if (!wbt_rq_qos(q))
 		return -EINVAL;
 
-	return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
+	return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
 }
 
 static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
 				  size_t count)
 {
-	struct rq_wb *rwb;
+	struct rq_qos *rqos;
 	ssize_t ret;
 	s64 val;
 
@@ -441,23 +441,21 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
 	if (val < -1)
 		return -EINVAL;
 
-	rwb = q->rq_wb;
-	if (!rwb) {
+	rqos = wbt_rq_qos(q);
+	if (!rqos) {
 		ret = wbt_init(q);
 		if (ret)
 			return ret;
 	}
 
-	rwb = q->rq_wb;
 	if (val == -1)
-		rwb->min_lat_nsec = wbt_default_latency_nsec(q);
+		val = wbt_default_latency_nsec(q);
 	else if (val >= 0)
-		rwb->min_lat_nsec = val * 1000ULL;
+		val *= 1000ULL;
 
-	if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
-		rwb->enable_state = WBT_STATE_ON_MANUAL;
+	wbt_set_min_lat(q, val);
 
-	wbt_update_limits(rwb);
+	wbt_update_limits(q);
 	return count;
 }
 
@@ -964,7 +962,7 @@ void blk_unregister_queue(struct gendisk *disk)
 	kobject_del(&q->kobj);
 	blk_trace_remove_sysfs(disk_to_dev(disk));
 
-	wbt_exit(q);
+	rq_qos_exit(q);
 
 	mutex_lock(&q->sysfs_lock);
 	if (q->request_fn || (q->mq_ops && q->elevator))
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 4f89b28fa652..6fe20fb823e4 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -25,6 +25,7 @@
 #include <linux/swap.h>
 
 #include "blk-wbt.h"
+#include "blk-rq-qos.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/wbt.h>
@@ -78,28 +79,6 @@ static inline bool rwb_enabled(struct rq_wb *rwb)
 	return rwb && rwb->wb_normal != 0;
 }
 
-/*
- * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
- * false if 'v' + 1 would be bigger than 'below'.
- */
-static bool atomic_inc_below(atomic_t *v, int below)
-{
-	int cur = atomic_read(v);
-
-	for (;;) {
-		int old;
-
-		if (cur >= below)
-			return false;
-		old = atomic_cmpxchg(v, cur, cur + 1);
-		if (old == cur)
-			break;
-		cur = old;
-	}
-
-	return true;
-}
-
 static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
 {
 	if (rwb_enabled(rwb)) {
@@ -116,7 +95,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
  */
 static bool wb_recent_wait(struct rq_wb *rwb)
 {
-	struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;
+	struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb;
 
 	return time_before(jiffies, wb->dirty_sleep + HZ);
 }
@@ -144,8 +123,9 @@ static void rwb_wake_all(struct rq_wb *rwb)
 	}
 }
 
-void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
+static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
 {
+	struct rq_wb *rwb = RQWB(rqos);
 	struct rq_wait *rqw;
 	int inflight, limit;
 
@@ -194,10 +174,9 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
  * Called on completion of a request. Note that it's also called when
  * a request is merged, when the request gets freed.
  */
-void wbt_done(struct rq_wb *rwb, struct request *rq)
+static void wbt_done(struct rq_qos *rqos, struct request *rq)
 {
-	if (!rwb)
-		return;
+	struct rq_wb *rwb = RQWB(rqos);
 
 	if (!wbt_is_tracked(rq)) {
 		if (rwb->sync_cookie == rq) {
@@ -209,72 +188,11 @@ void wbt_done(struct rq_wb *rwb, struct request *rq)
 			wb_timestamp(rwb, &rwb->last_comp);
 	} else {
 		WARN_ON_ONCE(rq == rwb->sync_cookie);
-		__wbt_done(rwb, wbt_flags(rq));
+		__wbt_done(rqos, wbt_flags(rq));
 	}
 	wbt_clear_state(rq);
 }
 
-/*
- * Return true, if we can't increase the depth further by scaling
- */
-static bool calc_wb_limits(struct rq_wb *rwb)
-{
-	unsigned int depth;
-	bool ret = false;
-
-	if (!rwb->min_lat_nsec) {
-		rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
-		return false;
-	}
-
-	/*
-	 * For QD=1 devices, this is a special case. It's important for those
-	 * to have one request ready when one completes, so force a depth of
-	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
-	 * since the device can't have more than that in flight. If we're
-	 * scaling down, then keep a setting of 1/1/1.
-	 */
-	if (rwb->queue_depth == 1) {
-		if (rwb->scale_step > 0)
-			rwb->wb_max = rwb->wb_normal = 1;
-		else {
-			rwb->wb_max = rwb->wb_normal = 2;
-			ret = true;
-		}
-		rwb->wb_background = 1;
-	} else {
-		/*
-		 * scale_step == 0 is our default state. If we have suffered
-		 * latency spikes, step will be > 0, and we shrink the
-		 * allowed write depths. If step is < 0, we're only doing
-		 * writes, and we allow a temporarily higher depth to
-		 * increase performance.
-		 */
-		depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
-		if (rwb->scale_step > 0)
-			depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
-		else if (rwb->scale_step < 0) {
-			unsigned int maxd = 3 * rwb->queue_depth / 4;
-
-			depth = 1 + ((depth - 1) << -rwb->scale_step);
-			if (depth > maxd) {
-				depth = maxd;
-				ret = true;
-			}
-		}
-
-		/*
-		 * Set our max/normal/bg queue depths based on how far
-		 * we have scaled down (->scale_step).
-		 */
-		rwb->wb_max = depth;
-		rwb->wb_normal = (rwb->wb_max + 1) / 2;
-		rwb->wb_background = (rwb->wb_max + 3) / 4;
-	}
-
-	return ret;
-}
-
 static inline bool stat_sample_valid(struct blk_rq_stat *stat)
 {
 	/*
@@ -307,7 +225,8 @@ enum {
 
 static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 {
-	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
+	struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
+	struct rq_depth *rqd = &rwb->rq_depth;
 	u64 thislat;
 
 	/*
@@ -351,7 +270,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 		return LAT_EXCEEDED;
 	}
 
-	if (rwb->scale_step)
+	if (rqd->scale_step)
 		trace_wbt_stat(bdi, stat);
 
 	return LAT_OK;
@@ -359,58 +278,48 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 
 static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
 {
-	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
+	struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
+	struct rq_depth *rqd = &rwb->rq_depth;
 
-	trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
-			rwb->wb_background, rwb->wb_normal, rwb->wb_max);
+	trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
+			rwb->wb_background, rwb->wb_normal, rqd->max_depth);
 }
 
-static void scale_up(struct rq_wb *rwb)
+static void calc_wb_limits(struct rq_wb *rwb)
 {
-	/*
-	 * Hit max in previous round, stop here
-	 */
-	if (rwb->scaled_max)
-		return;
+	if (rwb->min_lat_nsec == 0) {
+		rwb->wb_normal = rwb->wb_background = 0;
+	} else if (rwb->rq_depth.max_depth <= 2) {
+		rwb->wb_normal = rwb->rq_depth.max_depth;
+		rwb->wb_background = 1;
+	} else {
+		rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
+		rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
+	}
+}
 
-	rwb->scale_step--;
+static void scale_up(struct rq_wb *rwb)
+{
+	rq_depth_scale_up(&rwb->rq_depth);
+	calc_wb_limits(rwb);
 	rwb->unknown_cnt = 0;
-
-	rwb->scaled_max = calc_wb_limits(rwb);
-
-	rwb_wake_all(rwb);
-
-	rwb_trace_step(rwb, "step up");
+	rwb_trace_step(rwb, "scale up");
 }
 
-/*
- * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
- * had a latency violation.
- */
 static void scale_down(struct rq_wb *rwb, bool hard_throttle)
 {
-	/*
-	 * Stop scaling down when we've hit the limit. This also prevents
-	 * ->scale_step from going to crazy values, if the device can't
-	 * keep up.
-	 */
-	if (rwb->wb_max == 1)
-		return;
-
-	if (rwb->scale_step < 0 && hard_throttle)
-		rwb->scale_step = 0;
-	else
-		rwb->scale_step++;
-
-	rwb->scaled_max = false;
-	rwb->unknown_cnt = 0;
+	rq_depth_scale_down(&rwb->rq_depth, hard_throttle);
 	calc_wb_limits(rwb);
-	rwb_trace_step(rwb, "step down");
+	rwb->unknown_cnt = 0;
+	rwb_wake_all(rwb);
+	rwb_trace_step(rwb, "scale down");
 }
 
 static void rwb_arm_timer(struct rq_wb *rwb)
 {
-	if (rwb->scale_step > 0) {
+	struct rq_depth *rqd = &rwb->rq_depth;
+
+	if (rqd->scale_step > 0) {
 		/*
 		 * We should speed this up, using some variant of a fast
 		 * integer inverse square root calculation. Since we only do
@@ -418,7 +327,7 @@ static void rwb_arm_timer(struct rq_wb *rwb)
 		 * though.
 		 */
 		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
-					int_sqrt((rwb->scale_step + 1) << 8));
+					int_sqrt((rqd->scale_step + 1) << 8));
 	} else {
 		/*
 		 * For step < 0, we don't want to increase/decrease the
@@ -433,12 +342,13 @@ static void rwb_arm_timer(struct rq_wb *rwb)
 static void wb_timer_fn(struct blk_stat_callback *cb)
 {
 	struct rq_wb *rwb = cb->data;
+	struct rq_depth *rqd = &rwb->rq_depth;
 	unsigned int inflight = wbt_inflight(rwb);
 	int status;
 
 	status = latency_exceeded(rwb, cb->stat);
 
-	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
+	trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step,
 			inflight);
 
 	/*
@@ -469,9 +379,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
 		 * currently don't have a valid read/write sample. For that
 		 * case, slowly return to center state (step == 0).
 		 */
-		if (rwb->scale_step > 0)
+		if (rqd->scale_step > 0)
 			scale_up(rwb);
-		else if (rwb->scale_step < 0)
+		else if (rqd->scale_step < 0)
 			scale_down(rwb, false);
 		break;
 	default:
@@ -481,19 +391,50 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
 	/*
 	 * Re-arm timer, if we have IO in flight
 	 */
-	if (rwb->scale_step || inflight)
+	if (rqd->scale_step || inflight)
 		rwb_arm_timer(rwb);
 }
 
-void wbt_update_limits(struct rq_wb *rwb)
+static void __wbt_update_limits(struct rq_wb *rwb)
 {
-	rwb->scale_step = 0;
-	rwb->scaled_max = false;
+	struct rq_depth *rqd = &rwb->rq_depth;
+
+	rqd->scale_step = 0;
+	rqd->scaled_max = false;
+
+	rq_depth_calc_max_depth(rqd);
 	calc_wb_limits(rwb);
 
 	rwb_wake_all(rwb);
 }
 
+void wbt_update_limits(struct request_queue *q)
+{
+	struct rq_qos *rqos = wbt_rq_qos(q);
+	if (!rqos)
+		return;
+	__wbt_update_limits(RQWB(rqos));
+}
+
+u64 wbt_get_min_lat(struct request_queue *q)
+{
+	struct rq_qos *rqos = wbt_rq_qos(q);
+	if (!rqos)
+		return 0;
+	return RQWB(rqos)->min_lat_nsec;
+}
+
+void wbt_set_min_lat(struct request_queue *q, u64 val)
+{
+	struct rq_qos *rqos = wbt_rq_qos(q);
+	if (!rqos)
+		return;
+	RQWB(rqos)->min_lat_nsec = val;
+	RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
+	__wbt_update_limits(RQWB(rqos));
+}
+
+
 static bool close_io(struct rq_wb *rwb)
 {
 	const unsigned long now = jiffies;
@@ -520,7 +461,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 	 * IO for a bit.
 	 */
 	if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
-		limit = rwb->wb_max;
+		limit = rwb->rq_depth.max_depth;
 	else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
 		/*
 		 * If less than 100ms since we completed unrelated IO,
@@ -554,7 +495,7 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
 	    rqw->wait.head.next != &wait->entry)
 		return false;
 
-	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
+	return rq_wait_inc_below(rqw, get_limit(rwb, rw));
 }
 
 /*
@@ -614,8 +555,10 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
  * in an irq held spinlock, if it holds one when calling this function.
  * If we do sleep, we'll release and re-grab it.
  */
-enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
+static enum wbt_flags wbt_wait(struct rq_qos *rqos, struct bio *bio,
+			       spinlock_t *lock)
 {
+	struct rq_wb *rwb = RQWB(rqos);
 	enum wbt_flags ret = 0;
 
 	if (!rwb_enabled(rwb))
@@ -643,8 +586,10 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
 	return ret | WBT_TRACKED;
 }
 
-void wbt_issue(struct rq_wb *rwb, struct request *rq)
+void wbt_issue(struct rq_qos *rqos, struct request *rq)
 {
+	struct rq_wb *rwb = RQWB(rqos);
+
 	if (!rwb_enabled(rwb))
 		return;
 
@@ -661,8 +606,9 @@ void wbt_issue(struct rq_wb *rwb, struct request *rq)
 	}
 }
 
-void wbt_requeue(struct rq_wb *rwb, struct request *rq)
+void wbt_requeue(struct rq_qos *rqos, struct request *rq)
 {
+	struct rq_wb *rwb = RQWB(rqos);
 	if (!rwb_enabled(rwb))
 		return;
 	if (rq == rwb->sync_cookie) {
@@ -671,39 +617,30 @@ void wbt_requeue(struct rq_wb *rwb, struct request *rq)
 	}
 }
 
-void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
 {
-	if (rwb) {
-		rwb->queue_depth = depth;
-		wbt_update_limits(rwb);
+	struct rq_qos *rqos = wbt_rq_qos(q);
+	if (rqos) {
+		RQWB(rqos)->rq_depth.queue_depth = depth;
+		__wbt_update_limits(RQWB(rqos));
 	}
 }
 
-void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
-{
-	if (rwb)
-		rwb->wc = write_cache_on;
-}
-
-/*
- * Disable wbt, if enabled by default.
- */
-void wbt_disable_default(struct request_queue *q)
+void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
 {
-	struct rq_wb *rwb = q->rq_wb;
-
-	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
-		wbt_exit(q);
+	struct rq_qos *rqos = wbt_rq_qos(q);
+	if (rqos)
+		RQWB(rqos)->wc = write_cache_on;
 }
-EXPORT_SYMBOL_GPL(wbt_disable_default);
 
 /*
  * Enable wbt if defaults are configured that way
  */
 void wbt_enable_default(struct request_queue *q)
 {
+	struct rq_qos *rqos = wbt_rq_qos(q);
 	/* Throttling already enabled? */
-	if (q->rq_wb)
+	if (rqos)
 		return;
 
 	/* Queue not registered? Maybe shutting down... */
@@ -741,6 +678,41 @@ static int wbt_data_dir(const struct request *rq)
 	return -1;
 }
 
+static void wbt_exit(struct rq_qos *rqos)
+{
+	struct rq_wb *rwb = RQWB(rqos);
+	struct request_queue *q = rqos->q;
+
+	blk_stat_remove_callback(q, rwb->cb);
+	blk_stat_free_callback(rwb->cb);
+	kfree(rwb);
+}
+
+/*
+ * Disable wbt, if enabled by default.
+ */
+void wbt_disable_default(struct request_queue *q)
+{
+	struct rq_qos *rqos = wbt_rq_qos(q);
+	struct rq_wb *rwb;
+	if (!rqos)
+		return;
+	rwb = RQWB(rqos);
+	if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
+		rwb->wb_normal = 0;
+}
+EXPORT_SYMBOL_GPL(wbt_disable_default);
+
+
+static struct rq_qos_ops wbt_rqos_ops = {
+	.throttle = wbt_wait,
+	.issue = wbt_issue,
+	.requeue = wbt_requeue,
+	.done = wbt_done,
+	.cleanup = __wbt_done,
+	.exit = wbt_exit,
+};
+
 int wbt_init(struct request_queue *q)
 {
 	struct rq_wb *rwb;
@@ -756,39 +728,29 @@ int wbt_init(struct request_queue *q)
 		return -ENOMEM;
 	}
 
-	for (i = 0; i < WBT_NUM_RWQ; i++) {
-		atomic_set(&rwb->rq_wait[i].inflight, 0);
-		init_waitqueue_head(&rwb->rq_wait[i].wait);
-	}
+	for (i = 0; i < WBT_NUM_RWQ; i++)
+		rq_wait_init(&rwb->rq_wait[i]);
 
+	rwb->rqos.id = RQ_QOS_WBT;
+	rwb->rqos.ops = &wbt_rqos_ops;
+	rwb->rqos.q = q;
 	rwb->last_comp = rwb->last_issue = jiffies;
-	rwb->queue = q;
 	rwb->win_nsec = RWB_WINDOW_NSEC;
 	rwb->enable_state = WBT_STATE_ON_DEFAULT;
-	wbt_update_limits(rwb);
+	rwb->wc = 1;
+	rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
+	__wbt_update_limits(rwb);
 
 	/*
 	 * Assign rwb and add the stats callback.
 	 */
-	q->rq_wb = rwb;
+	rq_qos_add(q, &rwb->rqos);
 	blk_stat_add_callback(q, rwb->cb);
 
 	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
 
-	wbt_set_queue_depth(rwb, blk_queue_depth(q));
-	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+	wbt_set_queue_depth(q, blk_queue_depth(q));
+	wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
 
 	return 0;
 }
-
-void wbt_exit(struct request_queue *q)
-{
-	struct rq_wb *rwb = q->rq_wb;
-
-	if (rwb) {
-		blk_stat_remove_callback(q, rwb->cb);
-		blk_stat_free_callback(rwb->cb);
-		q->rq_wb = NULL;
-		kfree(rwb);
-	}
-}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 300df531d0a6..53b20a58c0a2 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -9,6 +9,7 @@
 #include <linux/ktime.h>
 
 #include "blk-stat.h"
+#include "blk-rq-qos.h"
 
 enum wbt_flags {
 	WBT_TRACKED		= 1,	/* write, tracked for throttling */
@@ -35,20 +36,12 @@ enum {
 	WBT_STATE_ON_MANUAL	= 2,
 };
 
-struct rq_wait {
-	wait_queue_head_t wait;
-	atomic_t inflight;
-};
-
 struct rq_wb {
 	/*
 	 * Settings that govern how we throttle
 	 */
 	unsigned int wb_background;		/* background writeback */
 	unsigned int wb_normal;			/* normal writeback */
-	unsigned int wb_max;			/* max throughput writeback */
-	int scale_step;
-	bool scaled_max;
 
 	short enable_state;			/* WBT_STATE_* */
 
@@ -67,15 +60,20 @@ struct rq_wb {
 	void *sync_cookie;
 
 	unsigned int wc;
-	unsigned int queue_depth;
 
 	unsigned long last_issue;		/* last non-throttled issue */
 	unsigned long last_comp;		/* last non-throttled comp */
 	unsigned long min_lat_nsec;
-	struct request_queue *queue;
+	struct rq_qos rqos;
 	struct rq_wait rq_wait[WBT_NUM_RWQ];
+	struct rq_depth rq_depth;
 };
 
+static inline struct rq_wb *RQWB(struct rq_qos *rqos)
+{
+	return container_of(rqos, struct rq_wb, rqos);
+}
+
 static inline unsigned int wbt_inflight(struct rq_wb *rwb)
 {
 	unsigned int i, ret = 0;
@@ -86,6 +84,7 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
 	return ret;
 }
 
+
 #ifdef CONFIG_BLK_WBT
 
 static inline void wbt_track(struct request *rq, enum wbt_flags flags)
@@ -93,19 +92,16 @@ static inline void wbt_track(struct request *rq, enum wbt_flags flags)
 	rq->wbt_flags |= flags;
 }
 
-void __wbt_done(struct rq_wb *, enum wbt_flags);
-void wbt_done(struct rq_wb *, struct request *);
-enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *);
 int wbt_init(struct request_queue *);
-void wbt_exit(struct request_queue *);
-void wbt_update_limits(struct rq_wb *);
-void wbt_requeue(struct rq_wb *, struct request *);
-void wbt_issue(struct rq_wb *, struct request *);
+void wbt_update_limits(struct request_queue *);
 void wbt_disable_default(struct request_queue *);
 void wbt_enable_default(struct request_queue *);
 
-void wbt_set_queue_depth(struct rq_wb *, unsigned int);
-void wbt_set_write_cache(struct rq_wb *, bool);
+u64 wbt_get_min_lat(struct request_queue *q);
+void wbt_set_min_lat(struct request_queue *q, u64 val);
+
+void wbt_set_queue_depth(struct request_queue *, unsigned int);
+void wbt_set_write_cache(struct request_queue *, bool);
 
 u64 wbt_default_latency_nsec(struct request_queue *);
 
@@ -114,43 +110,30 @@ u64 wbt_default_latency_nsec(struct request_queue *);
 static inline void wbt_track(struct request *rq, enum wbt_flags flags)
 {
 }
-static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
-{
-}
-static inline void wbt_done(struct rq_wb *rwb, struct request *rq)
-{
-}
-static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio,
-				      spinlock_t *lock)
-{
-	return 0;
-}
 static inline int wbt_init(struct request_queue *q)
 {
 	return -EINVAL;
 }
-static inline void wbt_exit(struct request_queue *q)
-{
-}
-static inline void wbt_update_limits(struct rq_wb *rwb)
+static inline void wbt_update_limits(struct request_queue *q)
 {
 }
-static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq)
+static inline void wbt_disable_default(struct request_queue *q)
 {
 }
-static inline void wbt_issue(struct rq_wb *rwb, struct request *rq)
+static inline void wbt_enable_default(struct request_queue *q)
 {
 }
-static inline void wbt_disable_default(struct request_queue *q)
+static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
 {
 }
-static inline void wbt_enable_default(struct request_queue *q)
+static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
 {
 }
-static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+static inline u64 wbt_get_min_lat(struct request_queue *q)
 {
+	return 0;
 }
-static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc)
+static inline void wbt_set_min_lat(struct request_queue *q, u64 val)
 {
 }
 static inline u64 wbt_default_latency_nsec(struct request_queue *q)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9d05646d5059..137759862f07 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -42,7 +42,7 @@ struct bsg_job;
 struct blkcg_gq;
 struct blk_flush_queue;
 struct pr_ops;
-struct rq_wb;
+struct rq_qos;
 struct blk_queue_stats;
 struct blk_stat_callback;
 
@@ -443,7 +443,7 @@ struct request_queue {
 	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */
 
 	struct blk_queue_stats	*stats;
-	struct rq_wb		*rq_wb;
+	struct rq_qos		*rq_qos;
 
 	/*
 	 * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
-- 
cgit v1.2.3


From d70675121546c35feaceebf7ed9caed8716640f3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 3 Jul 2018 11:15:01 -0400
Subject: block: introduce blk-iolatency io controller

Current IO controllers for the block layer are less than ideal for our
use case.  The io.max controller is great at hard limiting, but it is
not work conserving.  This patch introduces io.latency.  You provide a
latency target for your group and we monitor the io in short windows to
make sure we are not exceeding those latency targets.  This makes use of
the rq-qos infrastructure and works much like the wbt stuff.  There are
a few differences from wbt

 - It's bio based, so the latency covers the whole block layer in addition to
   the actual io.
 - We will throttle all IO types that comes in here if we need to.
 - We use the mean latency over the 100ms window.  This is because writes can
   be particularly fast, which could give us a false sense of the impact of
   other workloads on our protected workload.
 - By default there's no throttling, we set the queue_depth to INT_MAX so that
   we can have as many outstanding bio's as we're allowed to.  Only at
   throttle time do we pay attention to the actual queue depth.
 - We backcharge cgroups for root cg issued IO and induce artificial
   delays in order to deal with cases like metadata only or swap heavy
   workloads.

In testing this has worked out relatively well.  Protected workloads
will throttle noisy workloads down to 1 io at time if they are doing
normal IO on their own, or induce up to a 1 second delay per syscall if
they are doing a lot of root issued IO (metadata/swap IO).

Our testing has revolved mostly around our production web servers where
we have hhvm (the web server application) in a protected group and
everything else in another group.  We see slightly higher requests per
second (RPS) on the test tier vs the control tier, and much more stable
RPS across all machines in the test tier vs the control tier.

Another test we run is a slow memory allocator in the unprotected group.
Before this would eventually push us into swap and cause the whole box
to die and not recover at all.  With these patches we see slight RPS
drops (usually 10-15%) before the memory consumer is properly killed and
things recover within seconds.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/Kconfig             |  12 +
 block/Makefile            |   1 +
 block/blk-cgroup.c        |   8 +
 block/blk-iolatency.c     | 930 ++++++++++++++++++++++++++++++++++++++++++++++
 block/blk.h               |   6 +
 include/linux/blk_types.h |   2 -
 6 files changed, 957 insertions(+), 2 deletions(-)
 create mode 100644 block/blk-iolatency.c

(limited to 'include/linux')

diff --git a/block/Kconfig b/block/Kconfig
index dfe7bc770fc9..1f2469a0123c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -149,6 +149,18 @@ config BLK_WBT
 	dynamically on an algorithm loosely based on CoDel, factoring in
 	the realtime performance of the disk.
 
+config BLK_CGROUP_IOLATENCY
+	bool "Enable support for latency based cgroup IO protection"
+	depends on BLK_CGROUP=y
+	default n
+	---help---
+	Enabling this option enables the .latency interface for IO throttling.
+	The IO controller will attempt to maintain average IO latencies below
+	the configured latency target, throttling anybody with a higher latency
+	target than the victimized group.
+
+	Note, this is an experimental interface and could be changed someday.
+
 config BLK_WBT_SQ
 	bool "Single queue writeback throttling"
 	default n
diff --git a/block/Makefile b/block/Makefile
index 57d0f47ab05f..572b33f32c07 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
 obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
+obj-$(CONFIG_BLK_CGROUP_IOLATENCY)	+= blk-iolatency.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d3310ec96c2a..7e2c19ce1a08 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1238,6 +1238,14 @@ int blkcg_init_queue(struct request_queue *q)
 	if (preloaded)
 		radix_tree_preload_end();
 
+	ret = blk_iolatency_init(q);
+	if (ret) {
+		spin_lock_irq(q->queue_lock);
+		blkg_destroy_all(q);
+		spin_unlock_irq(q->queue_lock);
+		return ret;
+	}
+
 	ret = blk_throtl_init(q);
 	if (ret) {
 		spin_lock_irq(q->queue_lock);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
new file mode 100644
index 000000000000..a35a1f580337
--- /dev/null
+++ b/block/blk-iolatency.c
@@ -0,0 +1,930 @@
+/*
+ * Block rq-qos base io controller
+ *
+ * This works similar to wbt with a few exceptions
+ *
+ * - It's bio based, so the latency covers the whole block layer in addition to
+ *   the actual io.
+ * - We will throttle all IO that comes in here if we need to.
+ * - We use the mean latency over the 100ms window.  This is because writes can
+ *   be particularly fast, which could give us a false sense of the impact of
+ *   other workloads on our protected workload.
+ * - By default there's no throttling, we set the queue_depth to INT_MAX so that
+ *   we can have as many outstanding bio's as we're allowed to.  Only at
+ *   throttle time do we pay attention to the actual queue depth.
+ *
+ * The hierarchy works like the cpu controller does, we track the latency at
+ * every configured node, and each configured node has it's own independent
+ * queue depth.  This means that we only care about our latency targets at the
+ * peer level.  Some group at the bottom of the hierarchy isn't going to affect
+ * a group at the end of some other path if we're only configred at leaf level.
+ *
+ * Consider the following
+ *
+ *                   root blkg
+ *             /                     \
+ *        fast (target=5ms)     slow (target=10ms)
+ *         /     \                  /        \
+ *       a        b          normal(15ms)   unloved
+ *
+ * "a" and "b" have no target, but their combined io under "fast" cannot exceed
+ * an average latency of 5ms.  If it does then we will throttle the "slow"
+ * group.  In the case of "normal", if it exceeds its 15ms target, we will
+ * throttle "unloved", but nobody else.
+ *
+ * In this example "fast", "slow", and "normal" will be the only groups actually
+ * accounting their io latencies.  We have to walk up the heirarchy to the root
+ * on every submit and complete so we can do the appropriate stat recording and
+ * adjust the queue depth of ourselves if needed.
+ *
+ * There are 2 ways we throttle IO.
+ *
+ * 1) Queue depth throttling.  As we throttle down we will adjust the maximum
+ * number of IO's we're allowed to have in flight.  This starts at (u64)-1 down
+ * to 1.  If the group is only ever submitting IO for itself then this is the
+ * only way we throttle.
+ *
+ * 2) Induced delay throttling.  This is for the case that a group is generating
+ * IO that has to be issued by the root cg to avoid priority inversion. So think
+ * REQ_META or REQ_SWAP.  If we are already at qd == 1 and we're getting a lot
+ * of work done for us on behalf of the root cg and are being asked to scale
+ * down more then we induce a latency at userspace return.  We accumulate the
+ * total amount of time we need to be punished by doing
+ *
+ * total_time += min_lat_nsec - actual_io_completion
+ *
+ * and then at throttle time will do
+ *
+ * throttle_time = min(total_time, NSEC_PER_SEC)
+ *
+ * This induced delay will throttle back the activity that is generating the
+ * root cg issued io's, wethere that's some metadata intensive operation or the
+ * group is using so much memory that it is pushing us into swap.
+ *
+ * Copyright (C) 2018 Josef Bacik
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/backing-dev.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/memcontrol.h>
+#include <linux/sched/signal.h>
+#include <trace/events/block.h>
+#include "blk-rq-qos.h"
+#include "blk-stat.h"
+
+#define DEFAULT_SCALE_COOKIE 1000000U
+
+static struct blkcg_policy blkcg_policy_iolatency;
+struct iolatency_grp;
+
+struct blk_iolatency {
+	struct rq_qos rqos;
+	struct timer_list timer;
+	atomic_t enabled;
+};
+
+static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
+{
+	return container_of(rqos, struct blk_iolatency, rqos);
+}
+
+static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat)
+{
+	return atomic_read(&blkiolat->enabled) > 0;
+}
+
+struct child_latency_info {
+	spinlock_t lock;
+
+	/* Last time we adjusted the scale of everybody. */
+	u64 last_scale_event;
+
+	/* The latency that we missed. */
+	u64 scale_lat;
+
+	/* Total io's from all of our children for the last summation. */
+	u64 nr_samples;
+
+	/* The guy who actually changed the latency numbers. */
+	struct iolatency_grp *scale_grp;
+
+	/* Cookie to tell if we need to scale up or down. */
+	atomic_t scale_cookie;
+};
+
+struct iolatency_grp {
+	struct blkg_policy_data pd;
+	struct blk_rq_stat __percpu *stats;
+	struct blk_iolatency *blkiolat;
+	struct rq_depth rq_depth;
+	struct rq_wait rq_wait;
+	atomic64_t window_start;
+	atomic_t scale_cookie;
+	u64 min_lat_nsec;
+	u64 cur_win_nsec;
+
+	/* total running average of our io latency. */
+	u64 total_lat_avg;
+	u64 total_lat_nr;
+
+	/* Our current number of IO's for the last summation. */
+	u64 nr_samples;
+
+	struct child_latency_info child_lat;
+};
+
+static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
+{
+	return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
+}
+
+static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
+{
+	return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency));
+}
+
+static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
+{
+	return pd_to_blkg(&iolat->pd);
+}
+
+static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
+				       wait_queue_entry_t *wait,
+				       bool first_block)
+{
+	struct rq_wait *rqw = &iolat->rq_wait;
+
+	if (first_block && waitqueue_active(&rqw->wait) &&
+	    rqw->wait.head.next != &wait->entry)
+		return false;
+	return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
+}
+
+static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
+				       struct iolatency_grp *iolat,
+				       spinlock_t *lock, bool issue_as_root,
+				       bool use_memdelay)
+	__releases(lock)
+	__acquires(lock)
+{
+	struct rq_wait *rqw = &iolat->rq_wait;
+	unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
+	DEFINE_WAIT(wait);
+	bool first_block = true;
+
+	if (use_delay)
+		blkcg_schedule_throttle(rqos->q, use_memdelay);
+
+	/*
+	 * To avoid priority inversions we want to just take a slot if we are
+	 * issuing as root.  If we're being killed off there's no point in
+	 * delaying things, we may have been killed by OOM so throttling may
+	 * make recovery take even longer, so just let the IO's through so the
+	 * task can go away.
+	 */
+	if (issue_as_root || fatal_signal_pending(current)) {
+		atomic_inc(&rqw->inflight);
+		return;
+	}
+
+	if (iolatency_may_queue(iolat, &wait, first_block))
+		return;
+
+	do {
+		prepare_to_wait_exclusive(&rqw->wait, &wait,
+					  TASK_UNINTERRUPTIBLE);
+
+		if (iolatency_may_queue(iolat, &wait, first_block))
+			break;
+		first_block = false;
+
+		if (lock) {
+			spin_unlock_irq(lock);
+			io_schedule();
+			spin_lock_irq(lock);
+		} else {
+			io_schedule();
+		}
+	} while (1);
+
+	finish_wait(&rqw->wait, &wait);
+}
+
+#define SCALE_DOWN_FACTOR 2
+#define SCALE_UP_FACTOR 4
+
+static inline unsigned long scale_amount(unsigned long qd, bool up)
+{
+	return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL);
+}
+
+/*
+ * We scale the qd down faster than we scale up, so we need to use this helper
+ * to adjust the scale_cookie accordingly so we don't prematurely get
+ * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
+ *
+ * Each group has their own local copy of the last scale cookie they saw, so if
+ * the global scale cookie goes up or down they know which way they need to go
+ * based on their last knowledge of it.
+ */
+static void scale_cookie_change(struct blk_iolatency *blkiolat,
+				struct child_latency_info *lat_info,
+				bool up)
+{
+	unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
+	unsigned long scale = scale_amount(qd, up);
+	unsigned long old = atomic_read(&lat_info->scale_cookie);
+	unsigned long max_scale = qd << 1;
+	unsigned long diff = 0;
+
+	if (old < DEFAULT_SCALE_COOKIE)
+		diff = DEFAULT_SCALE_COOKIE - old;
+
+	if (up) {
+		if (scale + old > DEFAULT_SCALE_COOKIE)
+			atomic_set(&lat_info->scale_cookie,
+				   DEFAULT_SCALE_COOKIE);
+		else if (diff > qd)
+			atomic_inc(&lat_info->scale_cookie);
+		else
+			atomic_add(scale, &lat_info->scale_cookie);
+	} else {
+		/*
+		 * We don't want to dig a hole so deep that it takes us hours to
+		 * dig out of it.  Just enough that we don't throttle/unthrottle
+		 * with jagged workloads but can still unthrottle once pressure
+		 * has sufficiently dissipated.
+		 */
+		if (diff > qd) {
+			if (diff < max_scale)
+				atomic_dec(&lat_info->scale_cookie);
+		} else {
+			atomic_sub(scale, &lat_info->scale_cookie);
+		}
+	}
+}
+
+/*
+ * Change the queue depth of the iolatency_grp.  We add/subtract 1/16th of the
+ * queue depth at a time so we don't get wild swings and hopefully dial in to
+ * fairer distribution of the overall queue depth.
+ */
+static void scale_change(struct iolatency_grp *iolat, bool up)
+{
+	unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
+	unsigned long scale = scale_amount(qd, up);
+	unsigned long old = iolat->rq_depth.max_depth;
+	bool changed = false;
+
+	if (old > qd)
+		old = qd;
+
+	if (up) {
+		if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
+			return;
+
+		if (old < qd) {
+			changed = true;
+			old += scale;
+			old = min(old, qd);
+			iolat->rq_depth.max_depth = old;
+			wake_up_all(&iolat->rq_wait.wait);
+		}
+	} else if (old > 1) {
+		old >>= 1;
+		changed = true;
+		iolat->rq_depth.max_depth = max(old, 1UL);
+	}
+}
+
+/* Check our parent and see if the scale cookie has changed. */
+static void check_scale_change(struct iolatency_grp *iolat)
+{
+	struct iolatency_grp *parent;
+	struct child_latency_info *lat_info;
+	unsigned int cur_cookie;
+	unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
+	u64 scale_lat;
+	unsigned int old;
+	int direction = 0;
+
+	if (lat_to_blkg(iolat)->parent == NULL)
+		return;
+
+	parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
+	if (!parent)
+		return;
+
+	lat_info = &parent->child_lat;
+	cur_cookie = atomic_read(&lat_info->scale_cookie);
+	scale_lat = READ_ONCE(lat_info->scale_lat);
+
+	if (cur_cookie < our_cookie)
+		direction = -1;
+	else if (cur_cookie > our_cookie)
+		direction = 1;
+	else
+		return;
+
+	old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);
+
+	/* Somebody beat us to the punch, just bail. */
+	if (old != our_cookie)
+		return;
+
+	if (direction < 0 && iolat->min_lat_nsec) {
+		u64 samples_thresh;
+
+		if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
+			return;
+
+		/*
+		 * Sometimes high priority groups are their own worst enemy, so
+		 * instead of taking it out on some poor other group that did 5%
+		 * or less of the IO's for the last summation just skip this
+		 * scale down event.
+		 */
+		samples_thresh = lat_info->nr_samples * 5;
+		samples_thresh = div64_u64(samples_thresh, 100);
+		if (iolat->nr_samples <= samples_thresh)
+			return;
+	}
+
+	/* We're as low as we can go. */
+	if (iolat->rq_depth.max_depth == 1 && direction < 0) {
+		blkcg_use_delay(lat_to_blkg(iolat));
+		return;
+	}
+
+	/* We're back to the default cookie, unthrottle all the things. */
+	if (cur_cookie == DEFAULT_SCALE_COOKIE) {
+		blkcg_clear_delay(lat_to_blkg(iolat));
+		iolat->rq_depth.max_depth = INT_MAX;
+		wake_up_all(&iolat->rq_wait.wait);
+		return;
+	}
+
+	scale_change(iolat, direction > 0);
+}
+
+static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
+				     spinlock_t *lock)
+{
+	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
+	struct blkcg *blkcg;
+	struct blkcg_gq *blkg;
+	struct request_queue *q = rqos->q;
+	bool issue_as_root = bio_issue_as_root_blkg(bio);
+
+	if (!blk_iolatency_enabled(blkiolat))
+		return;
+
+	rcu_read_lock();
+	blkcg = bio_blkcg(bio);
+	bio_associate_blkcg(bio, &blkcg->css);
+	blkg = blkg_lookup(blkcg, q);
+	if (unlikely(!blkg)) {
+		if (!lock)
+			spin_lock_irq(q->queue_lock);
+		blkg = blkg_lookup_create(blkcg, q);
+		if (IS_ERR(blkg))
+			blkg = NULL;
+		if (!lock)
+			spin_unlock_irq(q->queue_lock);
+	}
+	if (!blkg)
+		goto out;
+
+	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
+	bio_associate_blkg(bio, blkg);
+out:
+	rcu_read_unlock();
+	while (blkg && blkg->parent) {
+		struct iolatency_grp *iolat = blkg_to_lat(blkg);
+		if (!iolat) {
+			blkg = blkg->parent;
+			continue;
+		}
+
+		check_scale_change(iolat);
+		__blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root,
+				     (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
+		blkg = blkg->parent;
+	}
+	if (!timer_pending(&blkiolat->timer))
+		mod_timer(&blkiolat->timer, jiffies + HZ);
+}
+
+static void iolatency_record_time(struct iolatency_grp *iolat,
+				  struct bio_issue *issue, u64 now,
+				  bool issue_as_root)
+{
+	struct blk_rq_stat *rq_stat;
+	u64 start = bio_issue_time(issue);
+	u64 req_time;
+
+	if (now <= start)
+		return;
+
+	req_time = now - start;
+
+	/*
+	 * We don't want to count issue_as_root bio's in the cgroups latency
+	 * statistics as it could skew the numbers downwards.
+	 */
+	if (unlikely(issue_as_root && iolat->rq_depth.max_depth != (u64)-1)) {
+		u64 sub = iolat->min_lat_nsec;
+		if (req_time < sub)
+			blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
+		return;
+	}
+
+	rq_stat = get_cpu_ptr(iolat->stats);
+	blk_rq_stat_add(rq_stat, req_time);
+	put_cpu_ptr(rq_stat);
+}
+
+#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
+#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5
+
+static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
+{
+	struct blkcg_gq *blkg = lat_to_blkg(iolat);
+	struct iolatency_grp *parent;
+	struct child_latency_info *lat_info;
+	struct blk_rq_stat stat;
+	unsigned long flags;
+	int cpu;
+
+	blk_rq_stat_init(&stat);
+	preempt_disable();
+	for_each_online_cpu(cpu) {
+		struct blk_rq_stat *s;
+		s = per_cpu_ptr(iolat->stats, cpu);
+		blk_rq_stat_sum(&stat, s);
+		blk_rq_stat_init(s);
+	}
+	preempt_enable();
+
+	/*
+	 * Our average exceeded our window, scale up our window so we are more
+	 * accurate, but not more than the global timer.
+	 */
+	if (stat.mean > iolat->cur_win_nsec) {
+		iolat->cur_win_nsec <<= 1;
+		iolat->cur_win_nsec =
+			max_t(u64, iolat->cur_win_nsec, NSEC_PER_SEC);
+	}
+
+	parent = blkg_to_lat(blkg->parent);
+	if (!parent)
+		return;
+
+	lat_info = &parent->child_lat;
+
+	iolat->total_lat_avg =
+		div64_u64((iolat->total_lat_avg * iolat->total_lat_nr) +
+			  stat.mean, iolat->total_lat_nr + 1);
+
+	iolat->total_lat_nr++;
+
+	/* Everything is ok and we don't need to adjust the scale. */
+	if (stat.mean <= iolat->min_lat_nsec &&
+	    atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
+		return;
+
+	/* Somebody beat us to the punch, just bail. */
+	spin_lock_irqsave(&lat_info->lock, flags);
+	lat_info->nr_samples -= iolat->nr_samples;
+	lat_info->nr_samples += stat.nr_samples;
+	iolat->nr_samples = stat.nr_samples;
+
+	if ((lat_info->last_scale_event >= now ||
+	    now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
+	    lat_info->scale_lat <= iolat->min_lat_nsec)
+		goto out;
+
+	if (stat.mean <= iolat->min_lat_nsec &&
+	    stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
+		if (lat_info->scale_grp == iolat) {
+			lat_info->last_scale_event = now;
+			scale_cookie_change(iolat->blkiolat, lat_info, true);
+		}
+	} else if (stat.mean > iolat->min_lat_nsec) {
+		lat_info->last_scale_event = now;
+		if (!lat_info->scale_grp ||
+		    lat_info->scale_lat > iolat->min_lat_nsec) {
+			WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
+			lat_info->scale_grp = iolat;
+		}
+		scale_cookie_change(iolat->blkiolat, lat_info, false);
+	}
+out:
+	spin_unlock_irqrestore(&lat_info->lock, flags);
+}
+
+static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
+{
+	struct blkcg_gq *blkg;
+	struct rq_wait *rqw;
+	struct iolatency_grp *iolat;
+	u64 window_start;
+	u64 now = ktime_to_ns(ktime_get());
+	bool issue_as_root = bio_issue_as_root_blkg(bio);
+	bool enabled = false;
+
+	blkg = bio->bi_blkg;
+	if (!blkg)
+		return;
+
+	iolat = blkg_to_lat(bio->bi_blkg);
+	if (!iolat)
+		return;
+
+	enabled = blk_iolatency_enabled(iolat->blkiolat);
+	while (blkg && blkg->parent) {
+		iolat = blkg_to_lat(blkg);
+		if (!iolat) {
+			blkg = blkg->parent;
+			continue;
+		}
+		rqw = &iolat->rq_wait;
+
+		atomic_dec(&rqw->inflight);
+		if (!enabled || iolat->min_lat_nsec == 0)
+			goto next;
+		iolatency_record_time(iolat, &bio->bi_issue, now,
+				      issue_as_root);
+		window_start = atomic64_read(&iolat->window_start);
+		if (now > window_start &&
+		    (now - window_start) >= iolat->cur_win_nsec) {
+			if (atomic64_cmpxchg(&iolat->window_start,
+					window_start, now) == window_start)
+				iolatency_check_latencies(iolat, now);
+		}
+next:
+		wake_up(&rqw->wait);
+		blkg = blkg->parent;
+	}
+}
+
+static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio)
+{
+	struct blkcg_gq *blkg;
+
+	blkg = bio->bi_blkg;
+	while (blkg && blkg->parent) {
+		struct rq_wait *rqw;
+		struct iolatency_grp *iolat;
+
+		iolat = blkg_to_lat(blkg);
+		if (!iolat)
+			goto next;
+
+		rqw = &iolat->rq_wait;
+		atomic_dec(&rqw->inflight);
+		wake_up(&rqw->wait);
+next:
+		blkg = blkg->parent;
+	}
+}
+
+static void blkcg_iolatency_exit(struct rq_qos *rqos)
+{
+	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
+
+	del_timer_sync(&blkiolat->timer);
+	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency);
+	kfree(blkiolat);
+}
+
+static struct rq_qos_ops blkcg_iolatency_ops = {
+	.throttle = blkcg_iolatency_throttle,
+	.cleanup = blkcg_iolatency_cleanup,
+	.done_bio = blkcg_iolatency_done_bio,
+	.exit = blkcg_iolatency_exit,
+};
+
+static void blkiolatency_timer_fn(struct timer_list *t)
+{
+	struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
+	struct blkcg_gq *blkg;
+	struct cgroup_subsys_state *pos_css;
+	u64 now = ktime_to_ns(ktime_get());
+
+	rcu_read_lock();
+	blkg_for_each_descendant_pre(blkg, pos_css,
+				     blkiolat->rqos.q->root_blkg) {
+		struct iolatency_grp *iolat;
+		struct child_latency_info *lat_info;
+		unsigned long flags;
+		u64 cookie;
+
+		/*
+		 * We could be exiting, don't access the pd unless we have a
+		 * ref on the blkg.
+		 */
+		if (!blkg_try_get(blkg))
+			continue;
+
+		iolat = blkg_to_lat(blkg);
+		if (!iolat)
+			continue;
+
+		lat_info = &iolat->child_lat;
+		cookie = atomic_read(&lat_info->scale_cookie);
+
+		if (cookie >= DEFAULT_SCALE_COOKIE)
+			goto next;
+
+		spin_lock_irqsave(&lat_info->lock, flags);
+		if (lat_info->last_scale_event >= now)
+			goto next_lock;
+
+		/*
+		 * We scaled down but don't have a scale_grp, scale up and carry
+		 * on.
+		 */
+		if (lat_info->scale_grp == NULL) {
+			scale_cookie_change(iolat->blkiolat, lat_info, true);
+			goto next_lock;
+		}
+
+		/*
+		 * It's been 5 seconds since our last scale event, clear the
+		 * scale grp in case the group that needed the scale down isn't
+		 * doing any IO currently.
+		 */
+		if (now - lat_info->last_scale_event >=
+		    ((u64)NSEC_PER_SEC * 5))
+			lat_info->scale_grp = NULL;
+next_lock:
+		spin_unlock_irqrestore(&lat_info->lock, flags);
+next:
+		blkg_put(blkg);
+	}
+	rcu_read_unlock();
+}
+
+int blk_iolatency_init(struct request_queue *q)
+{
+	struct blk_iolatency *blkiolat;
+	struct rq_qos *rqos;
+	int ret;
+
+	blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
+	if (!blkiolat)
+		return -ENOMEM;
+
+	rqos = &blkiolat->rqos;
+	rqos->id = RQ_QOS_CGROUP;
+	rqos->ops = &blkcg_iolatency_ops;
+	rqos->q = q;
+
+	rq_qos_add(q, rqos);
+
+	ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
+	if (ret) {
+		rq_qos_del(q, rqos);
+		kfree(blkiolat);
+		return ret;
+	}
+
+	timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
+
+	return 0;
+}
+
+static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
+{
+	struct iolatency_grp *iolat = blkg_to_lat(blkg);
+	struct blk_iolatency *blkiolat = iolat->blkiolat;
+	u64 oldval = iolat->min_lat_nsec;
+
+	iolat->min_lat_nsec = val;
+	iolat->cur_win_nsec = max_t(u64, val << 4, 100 * NSEC_PER_MSEC);
+	iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, NSEC_PER_SEC);
+
+	if (!oldval && val)
+		atomic_inc(&blkiolat->enabled);
+	if (oldval && !val)
+		atomic_dec(&blkiolat->enabled);
+}
+
+static void iolatency_clear_scaling(struct blkcg_gq *blkg)
+{
+	if (blkg->parent) {
+		struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
+		struct child_latency_info *lat_info;
+		if (!iolat)
+			return;
+
+		lat_info = &iolat->child_lat;
+		spin_lock(&lat_info->lock);
+		atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
+		lat_info->last_scale_event = 0;
+		lat_info->scale_grp = NULL;
+		lat_info->scale_lat = 0;
+		spin_unlock(&lat_info->lock);
+	}
+}
+
+static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
+			     size_t nbytes, loff_t off)
+{
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
+	struct blkcg_gq *blkg;
+	struct blk_iolatency *blkiolat;
+	struct blkg_conf_ctx ctx;
+	struct iolatency_grp *iolat;
+	char *p, *tok;
+	u64 lat_val = 0;
+	u64 oldval;
+	int ret;
+
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
+	if (ret)
+		return ret;
+
+	iolat = blkg_to_lat(ctx.blkg);
+	blkiolat = iolat->blkiolat;
+	p = ctx.body;
+
+	ret = -EINVAL;
+	while ((tok = strsep(&p, " "))) {
+		char key[16];
+		char val[21];	/* 18446744073709551616 */
+
+		if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
+			goto out;
+
+		if (!strcmp(key, "target")) {
+			u64 v;
+
+			if (!strcmp(val, "max"))
+				lat_val = 0;
+			else if (sscanf(val, "%llu", &v) == 1)
+				lat_val = v * NSEC_PER_USEC;
+			else
+				goto out;
+		} else {
+			goto out;
+		}
+	}
+
+	/* Walk up the tree to see if our new val is lower than it should be. */
+	blkg = ctx.blkg;
+	oldval = iolat->min_lat_nsec;
+
+	iolatency_set_min_lat_nsec(blkg, lat_val);
+	if (oldval != iolat->min_lat_nsec) {
+		iolatency_clear_scaling(blkg);
+	}
+
+	ret = 0;
+out:
+	blkg_conf_finish(&ctx);
+	return ret ?: nbytes;
+}
+
+static u64 iolatency_prfill_limit(struct seq_file *sf,
+				  struct blkg_policy_data *pd, int off)
+{
+	struct iolatency_grp *iolat = pd_to_lat(pd);
+	const char *dname = blkg_dev_name(pd->blkg);
+
+	if (!dname || !iolat->min_lat_nsec)
+		return 0;
+	seq_printf(sf, "%s target=%llu\n",
+		   dname,
+		   (unsigned long long)iolat->min_lat_nsec / NSEC_PER_USEC);
+	return 0;
+}
+
+static int iolatency_print_limit(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  iolatency_prfill_limit,
+			  &blkcg_policy_iolatency, seq_cft(sf)->private, false);
+	return 0;
+}
+
+static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
+				size_t size)
+{
+	struct iolatency_grp *iolat = pd_to_lat(pd);
+	unsigned long long avg_lat = div64_u64(iolat->total_lat_avg, NSEC_PER_USEC);
+
+	if (iolat->rq_depth.max_depth == (u64)-1)
+		return scnprintf(buf, size, " depth=max avg_lat=%llu",
+				 avg_lat);
+
+	return scnprintf(buf, size, " depth=%u avg_lat=%llu",
+			 iolat->rq_depth.max_depth, avg_lat);
+}
+
+
+static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
+{
+	struct iolatency_grp *iolat;
+
+	iolat = kzalloc_node(sizeof(*iolat), gfp, node);
+	if (!iolat)
+		return NULL;
+	iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
+				       __alignof__(struct blk_rq_stat), gfp);
+	if (!iolat->stats) {
+		kfree(iolat);
+		return NULL;
+	}
+	return &iolat->pd;
+}
+
+static void iolatency_pd_init(struct blkg_policy_data *pd)
+{
+	struct iolatency_grp *iolat = pd_to_lat(pd);
+	struct blkcg_gq *blkg = lat_to_blkg(iolat);
+	struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
+	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
+	u64 now = ktime_to_ns(ktime_get());
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct blk_rq_stat *stat;
+		stat = per_cpu_ptr(iolat->stats, cpu);
+		blk_rq_stat_init(stat);
+	}
+
+	rq_wait_init(&iolat->rq_wait);
+	spin_lock_init(&iolat->child_lat.lock);
+	iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
+	iolat->rq_depth.max_depth = INT_MAX;
+	iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
+	iolat->blkiolat = blkiolat;
+	iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
+	atomic64_set(&iolat->window_start, now);
+
+	/*
+	 * We init things in list order, so the pd for the parent may not be
+	 * init'ed yet for whatever reason.
+	 */
+	if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
+		struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
+		atomic_set(&iolat->scale_cookie,
+			   atomic_read(&parent->child_lat.scale_cookie));
+	} else {
+		atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
+	}
+
+	atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
+}
+
+static void iolatency_pd_offline(struct blkg_policy_data *pd)
+{
+	struct iolatency_grp *iolat = pd_to_lat(pd);
+	struct blkcg_gq *blkg = lat_to_blkg(iolat);
+
+	iolatency_set_min_lat_nsec(blkg, 0);
+	iolatency_clear_scaling(blkg);
+}
+
+static void iolatency_pd_free(struct blkg_policy_data *pd)
+{
+	struct iolatency_grp *iolat = pd_to_lat(pd);
+	free_percpu(iolat->stats);
+	kfree(iolat);
+}
+
+static struct cftype iolatency_files[] = {
+	{
+		.name = "latency",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = iolatency_print_limit,
+		.write = iolatency_set_limit,
+	},
+	{}
+};
+
+static struct blkcg_policy blkcg_policy_iolatency = {
+	.dfl_cftypes	= iolatency_files,
+	.pd_alloc_fn	= iolatency_pd_alloc,
+	.pd_init_fn	= iolatency_pd_init,
+	.pd_offline_fn	= iolatency_pd_offline,
+	.pd_free_fn	= iolatency_pd_free,
+	.pd_stat_fn	= iolatency_pd_stat,
+};
+
+static int __init iolatency_init(void)
+{
+	return blkcg_policy_register(&blkcg_policy_iolatency);
+}
+
+static void __exit iolatency_exit(void)
+{
+	return blkcg_policy_unregister(&blkcg_policy_iolatency);
+}
+
+module_init(iolatency_init);
+module_exit(iolatency_exit);
diff --git a/block/blk.h b/block/blk.h
index 8d23aea96ce9..69b14cd2bb22 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -412,4 +412,10 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
 
 extern void blk_drain_queue(struct request_queue *q);
 
+#ifdef CONFIG_BLK_CGROUP_IOLATENCY
+extern int blk_iolatency_init(struct request_queue *q);
+#else
+static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
+#endif
+
 #endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0ffc34c5cc83..e13449a379a1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -180,9 +180,7 @@ struct bio {
 	struct io_context	*bi_ioc;
 	struct cgroup_subsys_state *bi_css;
 	struct blkcg_gq		*bi_blkg;
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	struct bio_issue	bi_issue;
-#endif
 #endif
 	union {
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
-- 
cgit v1.2.3


From ffcfe25bb50f27395e15fa999f1a7eb769f55360 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 9 Jul 2018 12:19:38 -0400
Subject: net: Add support for subordinate device traffic classes

This patch is meant to provide the basic tools needed to allow us to create
subordinate device traffic classes. The general idea here is to allow
subdividing the queues of a device into queue groups accessible through an
upper device such as a macvlan.

The idea here is to enforce the idea that an upper device has to be a
single queue device, ideally with IFF_NO_QUQUE set. With that being the
case we can pretty much guarantee that the tc_to_txq mappings and XPS maps
for the upper device are unused. As such we could reuse those in order to
support subdividing the lower device and distributing those queues between
the subordinate devices.

In order to distinguish between a regular set of traffic classes and if a
device is carrying subordinate traffic classes I changed num_tc from a u8
to a s16 value and use the negative values to represent the subordinate
pool values. So starting at -1 and running to -32768 we can encode those as
pool values, and the existing values of 0 to 15 can be maintained.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/netdevice.h | 16 ++++++++-
 net/core/dev.c            | 89 +++++++++++++++++++++++++++++++++++++++++++++++
 net/core/net-sysfs.c      | 21 ++++++++++-
 3 files changed, 124 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b683971e500d..b1ff77276bc4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -575,6 +575,9 @@ struct netdev_queue {
 	 * (/sys/class/net/DEV/Q/trans_timeout)
 	 */
 	unsigned long		trans_timeout;
+
+	/* Subordinate device that the queue has been assigned to */
+	struct net_device	*sb_dev;
 /*
  * write-mostly part
  */
@@ -1991,7 +1994,7 @@ struct net_device {
 #ifdef CONFIG_DCB
 	const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
-	u8			num_tc;
+	s16			num_tc;
 	struct netdev_tc_txq	tc_to_txq[TC_MAX_QUEUE];
 	u8			prio_tc_map[TC_BITMASK + 1];
 
@@ -2045,6 +2048,17 @@ int netdev_get_num_tc(struct net_device *dev)
 	return dev->num_tc;
 }
 
+void netdev_unbind_sb_channel(struct net_device *dev,
+			      struct net_device *sb_dev);
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+				 struct net_device *sb_dev,
+				 u8 tc, u16 count, u16 offset);
+int netdev_set_sb_channel(struct net_device *dev, u16 channel);
+static inline int netdev_get_sb_channel(struct net_device *dev)
+{
+	return max_t(int, -dev->num_tc, 0);
+}
+
 static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 					 unsigned int index)
diff --git a/net/core/dev.c b/net/core/dev.c
index 89825c1eccdc..cc1d6bba017a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2067,11 +2067,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 		int i;
 
+		/* walk through the TCs and see if it falls into any of them */
 		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 			if ((txq - tc->offset) < tc->count)
 				return i;
 		}
 
+		/* didn't find it, just return -1 to indicate no match */
 		return -1;
 	}
 
@@ -2260,7 +2262,14 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 	unsigned int nr_ids;
 
 	if (dev->num_tc) {
+		/* Do not allow XPS on subordinate device directly */
 		num_tc = dev->num_tc;
+		if (num_tc < 0)
+			return -EINVAL;
+
+		/* If queue belongs to subordinate dev use its map */
+		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
 		tc = netdev_txq_to_tc(dev, index);
 		if (tc < 0)
 			return -EINVAL;
@@ -2448,11 +2457,25 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 EXPORT_SYMBOL(netif_set_xps_queue);
 
 #endif
+static void netdev_unbind_all_sb_channels(struct net_device *dev)
+{
+	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+	/* Unbind any subordinate channels */
+	while (txq-- != &dev->_tx[0]) {
+		if (txq->sb_dev)
+			netdev_unbind_sb_channel(dev, txq->sb_dev);
+	}
+}
+
 void netdev_reset_tc(struct net_device *dev)
 {
 #ifdef CONFIG_XPS
 	netif_reset_xps_queues_gt(dev, 0);
 #endif
+	netdev_unbind_all_sb_channels(dev);
+
+	/* Reset TC configuration of device */
 	dev->num_tc = 0;
 	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
@@ -2481,11 +2504,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 #ifdef CONFIG_XPS
 	netif_reset_xps_queues_gt(dev, 0);
 #endif
+	netdev_unbind_all_sb_channels(dev);
+
 	dev->num_tc = num_tc;
 	return 0;
 }
 EXPORT_SYMBOL(netdev_set_num_tc);
 
+void netdev_unbind_sb_channel(struct net_device *dev,
+			      struct net_device *sb_dev)
+{
+	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+#ifdef CONFIG_XPS
+	netif_reset_xps_queues_gt(sb_dev, 0);
+#endif
+	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
+	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
+
+	while (txq-- != &dev->_tx[0]) {
+		if (txq->sb_dev == sb_dev)
+			txq->sb_dev = NULL;
+	}
+}
+EXPORT_SYMBOL(netdev_unbind_sb_channel);
+
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+				 struct net_device *sb_dev,
+				 u8 tc, u16 count, u16 offset)
+{
+	/* Make certain the sb_dev and dev are already configured */
+	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
+		return -EINVAL;
+
+	/* We cannot hand out queues we don't have */
+	if ((offset + count) > dev->real_num_tx_queues)
+		return -EINVAL;
+
+	/* Record the mapping */
+	sb_dev->tc_to_txq[tc].count = count;
+	sb_dev->tc_to_txq[tc].offset = offset;
+
+	/* Provide a way for Tx queue to find the tc_to_txq map or
+	 * XPS map for itself.
+	 */
+	while (count--)
+		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
+
+	return 0;
+}
+EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
+
+int netdev_set_sb_channel(struct net_device *dev, u16 channel)
+{
+	/* Do not use a multiqueue device to represent a subordinate channel */
+	if (netif_is_multiqueue(dev))
+		return -ENODEV;
+
+	/* We allow channels 1 - 32767 to be used for subordinate channels.
+	 * Channel 0 is meant to be "native" mode and used only to represent
+	 * the main root device. We allow writing 0 to reset the device back
+	 * to normal mode after being used as a subordinate channel.
+	 */
+	if (channel > S16_MAX)
+		return -EINVAL;
+
+	dev->num_tc = -channel;
+
+	return 0;
+}
+EXPORT_SYMBOL(netdev_set_sb_channel);
+
 /*
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
  * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index dce3ae0fbca2..ffa1d18f2c2c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1054,11 +1054,23 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
 		return -ENOENT;
 
 	index = get_netdev_queue_index(queue);
+
+	/* If queue belongs to subordinate dev use its TC mapping */
+	dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
 	tc = netdev_txq_to_tc(dev, index);
 	if (tc < 0)
 		return -EINVAL;
 
-	return sprintf(buf, "%u\n", tc);
+	/* We can report the traffic class one of two ways:
+	 * Subordinate device traffic classes are reported with the traffic
+	 * class first, and then the subordinate class so for example TC0 on
+	 * subordinate device 2 will be reported as "0-2". If the queue
+	 * belongs to the root device it will be reported with just the
+	 * traffic class, so just "0" for TC 0 for example.
+	 */
+	return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) :
+				 sprintf(buf, "%u\n", tc);
 }
 
 #ifdef CONFIG_XPS
@@ -1225,7 +1237,14 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
 	index = get_netdev_queue_index(queue);
 
 	if (dev->num_tc) {
+		/* Do not allow XPS on subordinate device directly */
 		num_tc = dev->num_tc;
+		if (num_tc < 0)
+			return -EINVAL;
+
+		/* If queue belongs to subordinate dev use its map */
+		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
 		tc = netdev_txq_to_tc(dev, index);
 		if (tc < 0)
 			return -EINVAL;
-- 
cgit v1.2.3


From eadec877ce9ca46a94e9036b5a44e7941d4fc501 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 9 Jul 2018 12:19:48 -0400
Subject: net: Add support for subordinate traffic classes to netdev_pick_tx

This change makes it so that we can support the concept of subordinate
device traffic classes to the core networking code. In doing this we can
start pulling out the driver specific bits needed to support selecting a
queue based on an upper device.

The solution at is currently stands is only partially implemented. I have
the start of some XPS bits in here, but I would still need to allow for
configuration of the XPS maps on the queues reserved for the subordinate
devices. For now I am using the reference to the sb_dev XPS map as just a
way to skip the lookup of the lower device XPS map for now as that would
result in the wrong queue being picked.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 19 +++------
 drivers/net/macvlan.c                         | 10 +----
 include/linux/netdevice.h                     |  4 +-
 net/core/dev.c                                | 58 ++++++++++++++++-----------
 4 files changed, 45 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 80225af2acb1..abb176df2e7f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8208,20 +8208,17 @@ static void ixgbe_atr(struct ixgbe_ring *ring,
 					      input, common, ring->queue_index);
 }
 
+#ifdef IXGBE_FCOE
 static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 			      void *accel_priv, select_queue_fallback_t fallback)
 {
-	struct ixgbe_fwd_adapter *fwd_adapter = accel_priv;
-#ifdef IXGBE_FCOE
 	struct ixgbe_adapter *adapter;
 	struct ixgbe_ring_feature *f;
-#endif
 	int txq;
 
-	if (fwd_adapter) {
-		u8 tc = netdev_get_num_tc(dev) ?
-			netdev_get_prio_tc_map(dev, skb->priority) : 0;
-		struct net_device *vdev = fwd_adapter->netdev;
+	if (accel_priv) {
+		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+		struct net_device *vdev = accel_priv;
 
 		txq = vdev->tc_to_txq[tc].offset;
 		txq += reciprocal_scale(skb_get_hash(skb),
@@ -8230,8 +8227,6 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 		return txq;
 	}
 
-#ifdef IXGBE_FCOE
-
 	/*
 	 * only execute the code below if protocol is FCoE
 	 * or FIP and we have FCoE enabled on the adapter
@@ -8257,11 +8252,9 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 		txq -= f->indices;
 
 	return txq + f->offset;
-#else
-	return fallback(dev, skb);
-#endif
 }
 
+#endif
 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
 			       struct xdp_frame *xdpf)
 {
@@ -10058,7 +10051,6 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_open		= ixgbe_open,
 	.ndo_stop		= ixgbe_close,
 	.ndo_start_xmit		= ixgbe_xmit_frame,
-	.ndo_select_queue	= ixgbe_select_queue,
 	.ndo_set_rx_mode	= ixgbe_set_rx_mode,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= ixgbe_set_mac,
@@ -10081,6 +10073,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_poll_controller	= ixgbe_netpoll,
 #endif
 #ifdef IXGBE_FCOE
+	.ndo_select_queue	= ixgbe_select_queue,
 	.ndo_fcoe_ddp_setup = ixgbe_fcoe_ddp_get,
 	.ndo_fcoe_ddp_target = ixgbe_fcoe_ddp_target,
 	.ndo_fcoe_ddp_done = ixgbe_fcoe_ddp_put,
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index adde8fc45588..401e1d1ce1ec 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -514,7 +514,6 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
 	const struct macvlan_dev *vlan = netdev_priv(dev);
 	const struct macvlan_port *port = vlan->port;
 	const struct macvlan_dev *dest;
-	void *accel_priv = NULL;
 
 	if (vlan->mode == MACVLAN_MODE_BRIDGE) {
 		const struct ethhdr *eth = (void *)skb->data;
@@ -533,15 +532,10 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
 			return NET_XMIT_SUCCESS;
 		}
 	}
-
-	/* For packets that are non-multicast and not bridged we will pass
-	 * the necessary information so that the lowerdev can distinguish
-	 * the source of the packets via the accel_priv value.
-	 */
-	accel_priv = vlan->accel_priv;
 xmit_world:
 	skb->dev = vlan->lowerdev;
-	return dev_queue_xmit_accel(skb, accel_priv);
+	return dev_queue_xmit_accel(skb,
+				    netdev_get_sb_channel(dev) ? dev : NULL);
 }
 
 static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b1ff77276bc4..fda0bcda7a42 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2103,7 +2103,7 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 
 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 				    struct sk_buff *skb,
-				    void *accel_priv);
+				    struct net_device *sb_dev);
 
 /* returns the headroom that the master device needs to take in account
  * when forwarding to this dev
@@ -2568,7 +2568,7 @@ void dev_close_many(struct list_head *head, bool unlink);
 void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
 int dev_queue_xmit(struct sk_buff *skb);
-int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv);
+int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
 int dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
 int register_netdevice(struct net_device *dev);
 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
diff --git a/net/core/dev.c b/net/core/dev.c
index cc1d6bba017a..09a7cc2f3c55 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2786,24 +2786,26 @@ EXPORT_SYMBOL(netif_device_attach);
  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
  * to be used as a distribution range.
  */
-static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
+static u16 skb_tx_hash(const struct net_device *dev,
+		       const struct net_device *sb_dev,
+		       struct sk_buff *skb)
 {
 	u32 hash;
 	u16 qoffset = 0;
 	u16 qcount = dev->real_num_tx_queues;
 
+	if (dev->num_tc) {
+		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+
+		qoffset = sb_dev->tc_to_txq[tc].offset;
+		qcount = sb_dev->tc_to_txq[tc].count;
+	}
+
 	if (skb_rx_queue_recorded(skb)) {
 		hash = skb_get_rx_queue(skb);
 		while (unlikely(hash >= qcount))
 			hash -= qcount;
-		return hash;
-	}
-
-	if (dev->num_tc) {
-		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
-
-		qoffset = dev->tc_to_txq[tc].offset;
-		qcount = dev->tc_to_txq[tc].count;
+		return hash + qoffset;
 	}
 
 	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
@@ -3573,7 +3575,8 @@ static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 }
 #endif
 
-static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
+			 struct sk_buff *skb)
 {
 #ifdef CONFIG_XPS
 	struct xps_dev_maps *dev_maps;
@@ -3587,7 +3590,7 @@ static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 	if (!static_key_false(&xps_rxqs_needed))
 		goto get_cpus_map;
 
-	dev_maps = rcu_dereference(dev->xps_rxqs_map);
+	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
 	if (dev_maps) {
 		int tci = sk_rx_queue_get(sk);
 
@@ -3598,7 +3601,7 @@ static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 
 get_cpus_map:
 	if (queue_index < 0) {
-		dev_maps = rcu_dereference(dev->xps_cpus_map);
+		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
 		if (dev_maps) {
 			unsigned int tci = skb->sender_cpu - 1;
 
@@ -3614,17 +3617,20 @@ get_cpus_map:
 #endif
 }
 
-static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
+			     struct net_device *sb_dev)
 {
 	struct sock *sk = skb->sk;
 	int queue_index = sk_tx_queue_get(sk);
 
+	sb_dev = sb_dev ? : dev;
+
 	if (queue_index < 0 || skb->ooo_okay ||
 	    queue_index >= dev->real_num_tx_queues) {
-		int new_index = get_xps_queue(dev, skb);
+		int new_index = get_xps_queue(dev, sb_dev, skb);
 
 		if (new_index < 0)
-			new_index = skb_tx_hash(dev, skb);
+			new_index = skb_tx_hash(dev, sb_dev, skb);
 
 		if (queue_index != new_index && sk &&
 		    sk_fullsock(sk) &&
@@ -3637,9 +3643,15 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 	return queue_index;
 }
 
+static u16 __netdev_pick_tx(struct net_device *dev,
+			    struct sk_buff *skb)
+{
+	return ___netdev_pick_tx(dev, skb, NULL);
+}
+
 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 				    struct sk_buff *skb,
-				    void *accel_priv)
+				    struct net_device *sb_dev)
 {
 	int queue_index = 0;
 
@@ -3654,10 +3666,10 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 		const struct net_device_ops *ops = dev->netdev_ops;
 
 		if (ops->ndo_select_queue)
-			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+			queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
 							    __netdev_pick_tx);
 		else
-			queue_index = __netdev_pick_tx(dev, skb);
+			queue_index = ___netdev_pick_tx(dev, skb, sb_dev);
 
 		queue_index = netdev_cap_txqueue(dev, queue_index);
 	}
@@ -3669,7 +3681,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 /**
  *	__dev_queue_xmit - transmit a buffer
  *	@skb: buffer to transmit
- *	@accel_priv: private data used for L2 forwarding offload
+ *	@sb_dev: suboordinate device used for L2 forwarding offload
  *
  *	Queue a buffer for transmission to a network device. The caller must
  *	have set the device and priority and built the buffer before calling
@@ -3692,7 +3704,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
  *      the BH enable code must have IRQs enabled so that it will not deadlock.
  *          --BLG
  */
-static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
+static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 {
 	struct net_device *dev = skb->dev;
 	struct netdev_queue *txq;
@@ -3731,7 +3743,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 	else
 		skb_dst_force(skb);
 
-	txq = netdev_pick_tx(dev, skb, accel_priv);
+	txq = netdev_pick_tx(dev, skb, sb_dev);
 	q = rcu_dereference_bh(txq->qdisc);
 
 	trace_net_dev_queue(skb);
@@ -3805,9 +3817,9 @@ int dev_queue_xmit(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(dev_queue_xmit);
 
-int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
+int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
 {
-	return __dev_queue_xmit(skb, accel_priv);
+	return __dev_queue_xmit(skb, sb_dev);
 }
 EXPORT_SYMBOL(dev_queue_xmit_accel);
 
-- 
cgit v1.2.3


From a4ea8a3dacc312c3402c78f6e4843afdda9b43a0 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 9 Jul 2018 12:19:54 -0400
Subject: net: Add generic ndo_select_queue functions

This patch adds a generic version of the ndo_select_queue functions for
either returning 0 or selecting a queue based on the processor ID. This is
generally meant to just reduce the number of functions we have to change
in the future when we have to deal with ndo_select_queue changes.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/lantiq_etop.c   | 10 +---------
 drivers/net/ethernet/ti/netcp_core.c |  9 +--------
 drivers/staging/netlogic/xlr_net.c   |  9 +--------
 include/linux/netdevice.h            |  4 ++++
 net/core/dev.c                       | 14 ++++++++++++++
 net/packet/af_packet.c               |  2 +-
 6 files changed, 22 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c
index afc810069440..7a637b51c7d2 100644
--- a/drivers/net/ethernet/lantiq_etop.c
+++ b/drivers/net/ethernet/lantiq_etop.c
@@ -563,14 +563,6 @@ ltq_etop_set_multicast_list(struct net_device *dev)
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
-static u16
-ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb,
-		      void *accel_priv, select_queue_fallback_t fallback)
-{
-	/* we are currently only using the first queue */
-	return 0;
-}
-
 static int
 ltq_etop_init(struct net_device *dev)
 {
@@ -641,7 +633,7 @@ static const struct net_device_ops ltq_eth_netdev_ops = {
 	.ndo_set_mac_address = ltq_etop_set_mac_address,
 	.ndo_validate_addr = eth_validate_addr,
 	.ndo_set_rx_mode = ltq_etop_set_multicast_list,
-	.ndo_select_queue = ltq_etop_select_queue,
+	.ndo_select_queue = dev_pick_tx_zero,
 	.ndo_init = ltq_etop_init,
 	.ndo_tx_timeout = ltq_etop_tx_timeout,
 };
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 6ebf110cd594..a1d335a3c5e4 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1889,13 +1889,6 @@ static int netcp_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vid)
 	return err;
 }
 
-static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb,
-			      void *accel_priv,
-			      select_queue_fallback_t fallback)
-{
-	return 0;
-}
-
 static int netcp_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			  void *type_data)
 {
@@ -1972,7 +1965,7 @@ static const struct net_device_ops netcp_netdev_ops = {
 	.ndo_vlan_rx_add_vid	= netcp_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= netcp_rx_kill_vid,
 	.ndo_tx_timeout		= netcp_ndo_tx_timeout,
-	.ndo_select_queue	= netcp_select_queue,
+	.ndo_select_queue	= dev_pick_tx_zero,
 	.ndo_setup_tc		= netcp_setup_tc,
 };
 
diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c
index e461168313bf..4e6611e4c59b 100644
--- a/drivers/staging/netlogic/xlr_net.c
+++ b/drivers/staging/netlogic/xlr_net.c
@@ -290,13 +290,6 @@ static netdev_tx_t xlr_net_start_xmit(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 }
 
-static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb,
-				void *accel_priv,
-				select_queue_fallback_t fallback)
-{
-	return (u16)smp_processor_id();
-}
-
 static void xlr_hw_set_mac_addr(struct net_device *ndev)
 {
 	struct xlr_net_priv *priv = netdev_priv(ndev);
@@ -403,7 +396,7 @@ static const struct net_device_ops xlr_netdev_ops = {
 	.ndo_open = xlr_net_open,
 	.ndo_stop = xlr_net_stop,
 	.ndo_start_xmit = xlr_net_start_xmit,
-	.ndo_select_queue = xlr_net_select_queue,
+	.ndo_select_queue = dev_pick_tx_cpu_id,
 	.ndo_set_mac_address = xlr_net_set_mac_addr,
 	.ndo_set_rx_mode = xlr_set_rx_mode,
 	.ndo_get_stats64 = xlr_stats,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fda0bcda7a42..46f4c44ce3e4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2567,6 +2567,10 @@ void dev_close(struct net_device *dev);
 void dev_close_many(struct list_head *head, bool unlink);
 void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
+u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
+		     void *accel_priv, select_queue_fallback_t fallback);
+u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
+		       void *accel_priv, select_queue_fallback_t fallback);
 int dev_queue_xmit(struct sk_buff *skb);
 int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
 int dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
diff --git a/net/core/dev.c b/net/core/dev.c
index 09a7cc2f3c55..b5e538032d5e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3617,6 +3617,20 @@ get_cpus_map:
 #endif
 }
 
+u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
+		     void *accel_priv, select_queue_fallback_t fallback)
+{
+	return 0;
+}
+EXPORT_SYMBOL(dev_pick_tx_zero);
+
+u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
+		       void *accel_priv, select_queue_fallback_t fallback)
+{
+	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
+}
+EXPORT_SYMBOL(dev_pick_tx_cpu_id);
+
 static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 			     struct net_device *sb_dev)
 {
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 47931ebfaef3..f37d087ae652 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -277,7 +277,7 @@ static bool packet_use_direct_xmit(const struct packet_sock *po)
 
 static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
 {
-	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
+	return dev_pick_tx_cpu_id(dev, skb, NULL, NULL);
 }
 
 static u16 packet_pick_tx_queue(struct sk_buff *skb)
-- 
cgit v1.2.3


From 4f49dec9075aa0277b8c9c657ec31e6361f88724 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 9 Jul 2018 12:19:59 -0400
Subject: net: allow ndo_select_queue to pass netdev

This patch makes it so that instead of passing a void pointer as the
accel_priv we instead pass a net_device pointer as sb_dev. Making this
change allows us to pass the subordinate device through to the fallback
function eventually so that we can keep the actual code in the
ndo_select_queue call as focused on possible on the exception cases.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/infiniband/hw/hfi1/vnic_main.c            |  2 +-
 drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c |  4 ++--
 drivers/net/bonding/bond_main.c                   |  3 ++-
 drivers/net/ethernet/amazon/ena/ena_netdev.c      |  3 ++-
 drivers/net/ethernet/broadcom/bcmsysport.c        |  2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   |  3 ++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 ++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c   |  3 ++-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c     |  3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  7 ++++---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c        |  3 ++-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h      |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |  3 ++-
 drivers/net/ethernet/renesas/ravb_main.c          |  3 ++-
 drivers/net/ethernet/sun/ldmvsw.c                 |  3 ++-
 drivers/net/ethernet/sun/sunvnet.c                |  3 ++-
 drivers/net/hyperv/netvsc_drv.c                   |  4 ++--
 drivers/net/net_failover.c                        |  5 +++--
 drivers/net/team/team.c                           |  3 ++-
 drivers/net/tun.c                                 |  3 ++-
 drivers/net/wireless/marvell/mwifiex/main.c       |  3 ++-
 drivers/net/xen-netback/interface.c               |  2 +-
 drivers/net/xen-netfront.c                        |  3 ++-
 drivers/staging/rtl8188eu/os_dep/os_intfs.c       |  3 ++-
 drivers/staging/rtl8723bs/os_dep/os_intfs.c       |  7 +++----
 include/linux/netdevice.h                         | 11 +++++++----
 net/core/dev.c                                    |  6 ++++--
 net/mac80211/iface.c                              |  4 ++--
 29 files changed, 66 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c
index 5d65582fe4d9..616fc9b6fad8 100644
--- a/drivers/infiniband/hw/hfi1/vnic_main.c
+++ b/drivers/infiniband/hw/hfi1/vnic_main.c
@@ -423,7 +423,7 @@ tx_finish:
 
 static u16 hfi1_vnic_select_queue(struct net_device *netdev,
 				  struct sk_buff *skb,
-				  void *accel_priv,
+				  struct net_device *sb_dev,
 				  select_queue_fallback_t fallback)
 {
 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
index 0c8aec62a425..61558788b3fa 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
@@ -95,7 +95,7 @@ static netdev_tx_t opa_netdev_start_xmit(struct sk_buff *skb,
 }
 
 static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb,
-				 void *accel_priv,
+				 struct net_device *sb_dev,
 				 select_queue_fallback_t fallback)
 {
 	struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev);
@@ -107,7 +107,7 @@ static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb,
 	mdata->entropy = opa_vnic_calc_entropy(skb);
 	mdata->vl = opa_vnic_get_vl(adapter, skb);
 	rc = adapter->rn_ops->ndo_select_queue(netdev, skb,
-					       accel_priv, fallback);
+					       sb_dev, fallback);
 	skb_pull(skb, sizeof(*mdata));
 	return rc;
 }
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 63e3844c5bec..9a2ea3c1f949 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4094,7 +4094,8 @@ static inline int bond_slave_override(struct bonding *bond,
 
 
 static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
-			     void *accel_priv, select_queue_fallback_t fallback)
+			     struct net_device *sb_dev,
+			     select_queue_fallback_t fallback)
 {
 	/* This helper function exists to help dev_pick_tx get the correct
 	 * destination queue.  Using a helper function skips a call to
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index f2af87d70594..e3befb1f9204 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2213,7 +2213,8 @@ static void ena_netpoll(struct net_device *netdev)
 #endif /* CONFIG_NET_POLL_CONTROLLER */
 
 static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv, select_queue_fallback_t fallback)
+			    struct net_device *sb_dev,
+			    select_queue_fallback_t fallback)
 {
 	u16 qid;
 	/* we suspect that this is good for in--kernel network services that
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index d5fca2e5a9bc..32f548e6431d 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2107,7 +2107,7 @@ static const struct ethtool_ops bcm_sysport_ethtool_ops = {
 };
 
 static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb,
-				    void *accel_priv,
+				    struct net_device *sb_dev,
 				    select_queue_fallback_t fallback)
 {
 	struct bcm_sysport_priv *priv = netdev_priv(dev);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index af7b5a4d8ba0..e4e1cf907ac6 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -1910,7 +1910,8 @@ void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw)
 }
 
 u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv, select_queue_fallback_t fallback)
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index a8ce5c55bbb0..0e508e5defce 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -497,7 +497,8 @@ int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos,
 
 /* select_queue callback */
 u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv, select_queue_fallback_t fallback);
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback);
 
 static inline void bnx2x_update_rx_prod(struct bnx2x *bp,
 					struct bnx2x_fastpath *fp,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 0d91716a2566..5dc5e5604f05 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -930,7 +930,8 @@ freeout:
 }
 
 static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb,
-			     void *accel_priv, select_queue_fallback_t fallback)
+			     struct net_device *sb_dev,
+			     select_queue_fallback_t fallback)
 {
 	int txq;
 
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index ef9ef703d13a..ff7a74ec8f11 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -2022,7 +2022,8 @@ static void hns_nic_get_stats64(struct net_device *ndev,
 
 static u16
 hns_nic_select_queue(struct net_device *ndev, struct sk_buff *skb,
-		     void *accel_priv, select_queue_fallback_t fallback)
+		     struct net_device *sb_dev,
+		     select_queue_fallback_t fallback)
 {
 	struct ethhdr *eth_hdr = (struct ethhdr *)skb->data;
 	struct hns_nic_priv *priv = netdev_priv(ndev);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index abb176df2e7f..8c7a68c57afa 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8210,15 +8210,16 @@ static void ixgbe_atr(struct ixgbe_ring *ring,
 
 #ifdef IXGBE_FCOE
 static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
-			      void *accel_priv, select_queue_fallback_t fallback)
+			      struct net_device *sb_dev,
+			      select_queue_fallback_t fallback)
 {
 	struct ixgbe_adapter *adapter;
 	struct ixgbe_ring_feature *f;
 	int txq;
 
-	if (accel_priv) {
+	if (sb_dev) {
 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
-		struct net_device *vdev = accel_priv;
+		struct net_device *vdev = sb_dev;
 
 		txq = vdev->tc_to_txq[tc].offset;
 		txq += reciprocal_scale(skb_get_hash(skb),
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 0227786308af..df2996618cd1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -688,7 +688,8 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
 }
 
 u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
-			 void *accel_priv, select_queue_fallback_t fallback)
+			 struct net_device *sb_dev,
+			 select_queue_fallback_t fallback)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	u16 rings_p_up = priv->num_tx_rings_p_up;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index ace6545f82e6..c3228b89df46 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -699,7 +699,8 @@ void mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
 
 void mlx4_en_tx_irq(struct mlx4_cq *mcq);
 u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
-			 void *accel_priv, select_queue_fallback_t fallback);
+			 struct net_device *sb_dev,
+			 select_queue_fallback_t fallback);
 netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev);
 netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
 			       struct mlx4_en_rx_alloc *frame,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index e2b7586ed7a0..e1b237ccdf56 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -865,7 +865,8 @@ struct mlx5e_profile {
 void mlx5e_build_ptys2ethtool_map(void);
 
 u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv, select_queue_fallback_t fallback);
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback);
 netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev);
 netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 			  struct mlx5e_tx_wqe *wqe, u16 pi);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index f0739dae7b56..dfcc3710b65f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -111,7 +111,8 @@ static inline int mlx5e_get_dscp_up(struct mlx5e_priv *priv, struct sk_buff *skb
 #endif
 
 u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv, select_queue_fallback_t fallback)
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	int channel_ix = fallback(dev, skb);
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index 68f122140966..4a7f54c8e7aa 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -1656,7 +1656,8 @@ drop:
 }
 
 static u16 ravb_select_queue(struct net_device *ndev, struct sk_buff *skb,
-			     void *accel_priv, select_queue_fallback_t fallback)
+			     struct net_device *sb_dev,
+			     select_queue_fallback_t fallback)
 {
 	/* If skb needs TX timestamp, it is handled in network control queue */
 	return (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) ? RAVB_NC :
diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c
index a5dd627fe2f9..d42f47f6c632 100644
--- a/drivers/net/ethernet/sun/ldmvsw.c
+++ b/drivers/net/ethernet/sun/ldmvsw.c
@@ -101,7 +101,8 @@ static struct vnet_port *vsw_tx_port_find(struct sk_buff *skb,
 }
 
 static u16 vsw_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv, select_queue_fallback_t fallback)
+			    struct net_device *sb_dev,
+			    select_queue_fallback_t fallback)
 {
 	struct vnet_port *port = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index a94f50442613..12539b357a78 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -234,7 +234,8 @@ static struct vnet_port *vnet_tx_port_find(struct sk_buff *skb,
 }
 
 static u16 vnet_select_queue(struct net_device *dev, struct sk_buff *skb,
-			     void *accel_priv, select_queue_fallback_t fallback)
+			     struct net_device *sb_dev,
+			     select_queue_fallback_t fallback)
 {
 	struct vnet *vp = netdev_priv(dev);
 	struct vnet_port *port = __tx_port_find(vp, skb);
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index dd1d6e115145..98c0107d6ca1 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -329,7 +329,7 @@ static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb)
 }
 
 static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
-			       void *accel_priv,
+			       struct net_device *sb_dev,
 			       select_queue_fallback_t fallback)
 {
 	struct net_device_context *ndc = netdev_priv(ndev);
@@ -343,7 +343,7 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
 
 		if (vf_ops->ndo_select_queue)
 			txq = vf_ops->ndo_select_queue(vf_netdev, skb,
-						       accel_priv, fallback);
+						       sb_dev, fallback);
 		else
 			txq = fallback(vf_netdev, skb);
 
diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
index 4f390fa557e4..78b549698b7b 100644
--- a/drivers/net/net_failover.c
+++ b/drivers/net/net_failover.c
@@ -115,7 +115,8 @@ static netdev_tx_t net_failover_start_xmit(struct sk_buff *skb,
 }
 
 static u16 net_failover_select_queue(struct net_device *dev,
-				     struct sk_buff *skb, void *accel_priv,
+				     struct sk_buff *skb,
+				     struct net_device *sb_dev,
 				     select_queue_fallback_t fallback)
 {
 	struct net_failover_info *nfo_info = netdev_priv(dev);
@@ -128,7 +129,7 @@ static u16 net_failover_select_queue(struct net_device *dev,
 
 		if (ops->ndo_select_queue)
 			txq = ops->ndo_select_queue(primary_dev, skb,
-						    accel_priv, fallback);
+						    sb_dev, fallback);
 		else
 			txq = fallback(primary_dev, skb);
 
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index b070959737ff..3a95eaae0c98 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1707,7 +1707,8 @@ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb,
-			     void *accel_priv, select_queue_fallback_t fallback)
+			     struct net_device *sb_dev,
+			     select_queue_fallback_t fallback)
 {
 	/*
 	 * This helper function exists to help dev_pick_tx get the correct
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a192a017cc68..76f0f4131197 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -607,7 +607,8 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
 }
 
 static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv, select_queue_fallback_t fallback)
+			    struct net_device *sb_dev,
+			    select_queue_fallback_t fallback)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	u16 ret;
diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index 510f6b8e717d..fa3e8ddfe9a9 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -1279,7 +1279,8 @@ static struct net_device_stats *mwifiex_get_stats(struct net_device *dev)
 
 static u16
 mwifiex_netdev_select_wmm_queue(struct net_device *dev, struct sk_buff *skb,
-				void *accel_priv, select_queue_fallback_t fallback)
+				struct net_device *sb_dev,
+				select_queue_fallback_t fallback)
 {
 	skb->priority = cfg80211_classify8021d(skb, NULL);
 	return mwifiex_1d_to_wmm_queue[skb->priority];
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 78ebe494fef0..19c4c585f472 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -148,7 +148,7 @@ void xenvif_wake_queue(struct xenvif_queue *queue)
 }
 
 static u16 xenvif_select_queue(struct net_device *dev, struct sk_buff *skb,
-			       void *accel_priv,
+			       struct net_device *sb_dev,
 			       select_queue_fallback_t fallback)
 {
 	struct xenvif *vif = netdev_priv(dev);
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index a57daecf1d57..d67cd379d156 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -545,7 +545,8 @@ static int xennet_count_skb_slots(struct sk_buff *skb)
 }
 
 static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb,
-			       void *accel_priv, select_queue_fallback_t fallback)
+			       struct net_device *sb_dev,
+			       select_queue_fallback_t fallback)
 {
 	unsigned int num_queues = dev->real_num_tx_queues;
 	u32 hash;
diff --git a/drivers/staging/rtl8188eu/os_dep/os_intfs.c b/drivers/staging/rtl8188eu/os_dep/os_intfs.c
index add1ba00f3e9..38e85c8a85c8 100644
--- a/drivers/staging/rtl8188eu/os_dep/os_intfs.c
+++ b/drivers/staging/rtl8188eu/os_dep/os_intfs.c
@@ -253,7 +253,8 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb)
 }
 
 static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv, select_queue_fallback_t fallback)
+			    struct net_device *sb_dev,
+			    select_queue_fallback_t fallback)
 {
 	struct adapter	*padapter = rtw_netdev_priv(dev);
 	struct mlme_priv *pmlmepriv = &padapter->mlmepriv;
diff --git a/drivers/staging/rtl8723bs/os_dep/os_intfs.c b/drivers/staging/rtl8723bs/os_dep/os_intfs.c
index ace68f023b49..181642358e3f 100644
--- a/drivers/staging/rtl8723bs/os_dep/os_intfs.c
+++ b/drivers/staging/rtl8723bs/os_dep/os_intfs.c
@@ -403,10 +403,9 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb)
 }
 
 
-static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb
-				, void *accel_priv
-				, select_queue_fallback_t fallback
-)
+static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb,
+			    struct net_device *sb_dev,
+			    select_queue_fallback_t fallback)
 {
 	struct adapter	*padapter = rtw_netdev_priv(dev);
 	struct mlme_priv *pmlmepriv = &padapter->mlmepriv;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 46f4c44ce3e4..bbf062c1ca8a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -957,7 +957,8 @@ struct dev_ifalias {
  *	those the driver believes to be appropriate.
  *
  * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
- *                         void *accel_priv, select_queue_fallback_t fallback);
+ *                         struct net_device *sb_dev,
+ *                         select_queue_fallback_t fallback);
  *	Called to decide which queue to use when device supports multiple
  *	transmit queues.
  *
@@ -1229,7 +1230,7 @@ struct net_device_ops {
 						      netdev_features_t features);
 	u16			(*ndo_select_queue)(struct net_device *dev,
 						    struct sk_buff *skb,
-						    void *accel_priv,
+						    struct net_device *sb_dev,
 						    select_queue_fallback_t fallback);
 	void			(*ndo_change_rx_flags)(struct net_device *dev,
 						       int flags);
@@ -2568,9 +2569,11 @@ void dev_close_many(struct list_head *head, bool unlink);
 void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
-		     void *accel_priv, select_queue_fallback_t fallback);
+		     struct net_device *sb_dev,
+		     select_queue_fallback_t fallback);
 u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv, select_queue_fallback_t fallback);
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback);
 int dev_queue_xmit(struct sk_buff *skb);
 int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
 int dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
diff --git a/net/core/dev.c b/net/core/dev.c
index b5e538032d5e..a051ce27198b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3618,14 +3618,16 @@ get_cpus_map:
 }
 
 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
-		     void *accel_priv, select_queue_fallback_t fallback)
+		     struct net_device *sb_dev,
+		     select_queue_fallback_t fallback)
 {
 	return 0;
 }
 EXPORT_SYMBOL(dev_pick_tx_zero);
 
 u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
-		       void *accel_priv, select_queue_fallback_t fallback)
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback)
 {
 	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 }
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 555e389b7dfa..5e6cf2cee965 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1130,7 +1130,7 @@ static void ieee80211_uninit(struct net_device *dev)
 
 static u16 ieee80211_netdev_select_queue(struct net_device *dev,
 					 struct sk_buff *skb,
-					 void *accel_priv,
+					 struct net_device *sb_dev,
 					 select_queue_fallback_t fallback)
 {
 	return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
@@ -1176,7 +1176,7 @@ static const struct net_device_ops ieee80211_dataif_ops = {
 
 static u16 ieee80211_monitor_select_queue(struct net_device *dev,
 					  struct sk_buff *skb,
-					  void *accel_priv,
+					  struct net_device *sb_dev,
 					  select_queue_fallback_t fallback)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-- 
cgit v1.2.3


From 8ec56fc3c5ee6f9700adac190e9ce5b8859a58b6 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 9 Jul 2018 12:20:04 -0400
Subject: net: allow fallback function to pass netdev

For most of these calls we can just pass NULL through to the fallback
function as the sb_dev. The only cases where we cannot are the cases where
we might be dealing with either an upper device or a driver that would
have configured things to support an sb_dev itself.

The only driver that has any significant change in this patch set should be
ixgbe as we can drop the redundant functionality that existed in both the
ndo_select_queue function and the fallback function that was passed through
to us.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c    |  2 +-
 drivers/net/ethernet/broadcom/bcmsysport.c      |  4 ++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |  3 ++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |  2 +-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c   |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |  4 ++--
 drivers/net/ethernet/mellanox/mlx4/en_tx.c      |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c |  2 +-
 drivers/net/hyperv/netvsc_drv.c                 |  2 +-
 drivers/net/net_failover.c                      |  2 +-
 drivers/net/xen-netback/interface.c             |  2 +-
 include/linux/netdevice.h                       |  3 ++-
 net/core/dev.c                                  | 12 +++---------
 net/packet/af_packet.c                          |  7 ++++---
 14 files changed, 24 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index e3befb1f9204..c673ac2df65b 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2224,7 +2224,7 @@ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb,
 	if (skb_rx_queue_recorded(skb))
 		qid = skb_get_rx_queue(skb);
 	else
-		qid = fallback(dev, skb);
+		qid = fallback(dev, skb, NULL);
 
 	return qid;
 }
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index 32f548e6431d..eb890c4b3b2d 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2116,7 +2116,7 @@ static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb,
 	unsigned int q, port;
 
 	if (!netdev_uses_dsa(dev))
-		return fallback(dev, skb);
+		return fallback(dev, skb, NULL);
 
 	/* DSA tagging layer will have configured the correct queue */
 	q = BRCM_TAG_GET_QUEUE(queue);
@@ -2124,7 +2124,7 @@ static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb,
 	tx_ring = priv->ring_map[q + port * priv->per_port_num_tx_queues];
 
 	if (unlikely(!tx_ring))
-		return fallback(dev, skb);
+		return fallback(dev, skb, NULL);
 
 	return tx_ring->index;
 }
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index e4e1cf907ac6..5a727d4729da 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -1933,7 +1933,8 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
 	}
 
 	/* select a non-FCoE queue */
-	return fallback(dev, skb) % (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos);
+	return fallback(dev, skb, NULL) %
+	       (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos);
 }
 
 void bnx2x_set_num_queues(struct bnx2x *bp)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 5dc5e5604f05..40cf8dc9f163 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -973,7 +973,7 @@ static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb,
 		return txq;
 	}
 
-	return fallback(dev, skb) % dev->real_num_tx_queues;
+	return fallback(dev, skb, NULL) % dev->real_num_tx_queues;
 }
 
 static int closest_timer(const struct sge *s, int time)
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index ff7a74ec8f11..948b3e0d18f4 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -2033,7 +2033,7 @@ hns_nic_select_queue(struct net_device *ndev, struct sk_buff *skb,
 	    is_multicast_ether_addr(eth_hdr->h_dest))
 		return 0;
 	else
-		return fallback(ndev, skb);
+		return fallback(ndev, skb, NULL);
 }
 
 static const struct net_device_ops hns_nic_netdev_ops = {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 8c7a68c57afa..bd6d9ea27b4b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8237,11 +8237,11 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 	case htons(ETH_P_FIP):
 		adapter = netdev_priv(dev);
 
-		if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED)
+		if (!sb_dev && (adapter->flags & IXGBE_FLAG_FCOE_ENABLED))
 			break;
 		/* fall through */
 	default:
-		return fallback(dev, skb);
+		return fallback(dev, skb, sb_dev);
 	}
 
 	f = &adapter->ring_feature[RING_F_FCOE];
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index df2996618cd1..1857ee0f0871 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -695,9 +695,9 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
 	u16 rings_p_up = priv->num_tx_rings_p_up;
 
 	if (netdev_get_num_tc(dev))
-		return fallback(dev, skb);
+		return fallback(dev, skb, NULL);
 
-	return fallback(dev, skb) % rings_p_up;
+	return fallback(dev, skb, NULL) % rings_p_up;
 }
 
 static void mlx4_bf_copy(void __iomem *dst, const void *src,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index dfcc3710b65f..9106ea45e3cb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -115,7 +115,7 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
 		       select_queue_fallback_t fallback)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
-	int channel_ix = fallback(dev, skb);
+	int channel_ix = fallback(dev, skb, NULL);
 	u16 num_channels;
 	int up = 0;
 
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 98c0107d6ca1..cf4f40a04194 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -345,7 +345,7 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
 			txq = vf_ops->ndo_select_queue(vf_netdev, skb,
 						       sb_dev, fallback);
 		else
-			txq = fallback(vf_netdev, skb);
+			txq = fallback(vf_netdev, skb, NULL);
 
 		/* Record the queue selected by VF so that it can be
 		 * used for common case where VF has more queues than
diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
index 78b549698b7b..d00d42c845b7 100644
--- a/drivers/net/net_failover.c
+++ b/drivers/net/net_failover.c
@@ -131,7 +131,7 @@ static u16 net_failover_select_queue(struct net_device *dev,
 			txq = ops->ndo_select_queue(primary_dev, skb,
 						    sb_dev, fallback);
 		else
-			txq = fallback(primary_dev, skb);
+			txq = fallback(primary_dev, skb, NULL);
 
 		qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
 
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 19c4c585f472..92274c237200 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -155,7 +155,7 @@ static u16 xenvif_select_queue(struct net_device *dev, struct sk_buff *skb,
 	unsigned int size = vif->hash.size;
 
 	if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE)
-		return fallback(dev, skb) % dev->real_num_tx_queues;
+		return fallback(dev, skb, NULL) % dev->real_num_tx_queues;
 
 	xenvif_set_skb_hash(vif, skb);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bbf062c1ca8a..2daf2fa6554f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -793,7 +793,8 @@ static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a,
 }
 
 typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
-				       struct sk_buff *skb);
+				       struct sk_buff *skb,
+				       struct net_device *sb_dev);
 
 enum tc_setup_type {
 	TC_SETUP_QDISC_MQPRIO,
diff --git a/net/core/dev.c b/net/core/dev.c
index a051ce27198b..e18d81837a6c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3633,8 +3633,8 @@ u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(dev_pick_tx_cpu_id);
 
-static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
-			     struct net_device *sb_dev)
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
+			    struct net_device *sb_dev)
 {
 	struct sock *sk = skb->sk;
 	int queue_index = sk_tx_queue_get(sk);
@@ -3659,12 +3659,6 @@ static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 	return queue_index;
 }
 
-static u16 __netdev_pick_tx(struct net_device *dev,
-			    struct sk_buff *skb)
-{
-	return ___netdev_pick_tx(dev, skb, NULL);
-}
-
 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 				    struct sk_buff *skb,
 				    struct net_device *sb_dev)
@@ -3685,7 +3679,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 			queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
 							    __netdev_pick_tx);
 		else
-			queue_index = ___netdev_pick_tx(dev, skb, sb_dev);
+			queue_index = __netdev_pick_tx(dev, skb, sb_dev);
 
 		queue_index = netdev_cap_txqueue(dev, queue_index);
 	}
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f37d087ae652..00189a3b07f2 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -275,9 +275,10 @@ static bool packet_use_direct_xmit(const struct packet_sock *po)
 	return po->xmit == packet_direct_xmit;
 }
 
-static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
+				  struct net_device *sb_dev)
 {
-	return dev_pick_tx_cpu_id(dev, skb, NULL, NULL);
+	return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
 }
 
 static u16 packet_pick_tx_queue(struct sk_buff *skb)
@@ -291,7 +292,7 @@ static u16 packet_pick_tx_queue(struct sk_buff *skb)
 						    __packet_pick_tx_queue);
 		queue_index = netdev_cap_txqueue(dev, queue_index);
 	} else {
-		queue_index = __packet_pick_tx_queue(dev, skb);
+		queue_index = __packet_pick_tx_queue(dev, skb, NULL);
 	}
 
 	return queue_index;
-- 
cgit v1.2.3


From 9f17dbf04ddf55ae48f5bbafea4c4920ea943215 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Mon, 9 Jul 2018 18:10:02 +0100
Subject: netfilter: fix use-after-free in NF_HOOK_LIST

nf_hook() can free the skb, so we need to remove it from the list before
 calling, and add passed skbs to a sublist afterwards.

Fixes: 17266ee93984 ("net: ipv4: listified version of ip_rcv")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 5a5e0a2ab2a3..23b48de8c2e2 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -294,12 +294,16 @@ NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 	     int (*okfn)(struct net *, struct sock *, struct sk_buff *))
 {
 	struct sk_buff *skb, *next;
+	struct list_head sublist;
 
+	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
-		int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
-		if (ret != 1)
-			list_del(&skb->list);
+		list_del(&skb->list);
+		if (nf_hook(pf, hook, net, sk, skb, in, out, okfn) == 1)
+			list_add_tail(&skb->list, &sublist);
 	}
+	/* Put passed packets back on main list */
+	list_splice(&sublist, head);
 }
 
 /* Call setsockopt() */
-- 
cgit v1.2.3


From aef92a8bed25d03b8f03ce499a56e8e8e5e2c05e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 10 Jul 2018 13:42:10 +0200
Subject: watchdog/softlockup: Fix the SOFTLOCKUP_DETECTOR=n build

I got confused by all the various CONFIG options here about and
conflated CONFIG_LOCKUP_DETECTOR and CONFIG_SOFTLOCKUP_DETECTOR.

This results in a build failure for:

   CONFIG_LOCKUP_DETECTOR=y && CONFIG_SOFTLOCKUP_DETECTOR=n

As reported by Abdul.

Reported-and-tested-by: Abdul Haleem <abdhalee@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-next <linux-next@vger.kernel.org>
Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: mpe <mpe@ellerman.id.au>
Cc: sachinp <sachinp@linux.vnet.ibm.com>
Cc: stephen Rothwell <sfr@canb.auug.org.au>
Fixes: 9cf57731b63e ("watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work")
Link: http://lkml.kernel.org/r/20180710114210.GI2476@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 80664bbeca43..08f9247e9827 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -33,15 +33,10 @@ extern int sysctl_hardlockup_all_cpu_backtrace;
 #define sysctl_hardlockup_all_cpu_backtrace 0
 #endif /* !CONFIG_SMP */
 
-extern int lockup_detector_online_cpu(unsigned int cpu);
-extern int lockup_detector_offline_cpu(unsigned int cpu);
-
 #else /* CONFIG_LOCKUP_DETECTOR */
 static inline void lockup_detector_init(void) { }
 static inline void lockup_detector_soft_poweroff(void) { }
 static inline void lockup_detector_cleanup(void) { }
-#define lockup_detector_online_cpu	NULL
-#define lockup_detector_offline_cpu	NULL
 #endif /* !CONFIG_LOCKUP_DETECTOR */
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
@@ -50,12 +45,18 @@ extern void touch_softlockup_watchdog(void);
 extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int  softlockup_panic;
-#else
+
+extern int lockup_detector_online_cpu(unsigned int cpu);
+extern int lockup_detector_offline_cpu(unsigned int cpu);
+#else /* CONFIG_SOFTLOCKUP_DETECTOR */
 static inline void touch_softlockup_watchdog_sched(void) { }
 static inline void touch_softlockup_watchdog(void) { }
 static inline void touch_softlockup_watchdog_sync(void) { }
 static inline void touch_all_softlockup_watchdogs(void) { }
-#endif
+
+#define lockup_detector_online_cpu	NULL
+#define lockup_detector_offline_cpu	NULL
+#endif /* CONFIG_SOFTLOCKUP_DETECTOR */
 
 #ifdef CONFIG_DETECT_HUNG_TASK
 void reset_hung_task_detector(void);
-- 
cgit v1.2.3


From 8d3e994241e6bcc7ead2b918c4f15b7683afa90a Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Tue, 10 Jul 2018 09:57:58 +0100
Subject: arm_pmu: Clean up maximum period handling

Each PMU defines their max_period of the counter as the maximum
value that can be counted. Since all the PMU backends support
32bit counters by default, let us remove the redundant field.

No functional changes.

Cc: Will Deacon <will.deacon@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Julien Thierry <julien.thierry@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm/kernel/perf_event_v6.c     |  2 --
 arch/arm/kernel/perf_event_v7.c     |  1 -
 arch/arm/kernel/perf_event_xscale.c |  2 --
 arch/arm64/kernel/perf_event.c      |  1 -
 drivers/perf/arm_pmu.c              | 16 ++++++++++++----
 include/linux/perf/arm_pmu.h        |  1 -
 6 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c
index be42c4f66a40..f64a6bfebcec 100644
--- a/arch/arm/kernel/perf_event_v6.c
+++ b/arch/arm/kernel/perf_event_v6.c
@@ -495,7 +495,6 @@ static void armv6pmu_init(struct arm_pmu *cpu_pmu)
 	cpu_pmu->stop		= armv6pmu_stop;
 	cpu_pmu->map_event	= armv6_map_event;
 	cpu_pmu->num_events	= 3;
-	cpu_pmu->max_period	= (1LLU << 32) - 1;
 }
 
 static int armv6_1136_pmu_init(struct arm_pmu *cpu_pmu)
@@ -546,7 +545,6 @@ static int armv6mpcore_pmu_init(struct arm_pmu *cpu_pmu)
 	cpu_pmu->stop		= armv6pmu_stop;
 	cpu_pmu->map_event	= armv6mpcore_map_event;
 	cpu_pmu->num_events	= 3;
-	cpu_pmu->max_period	= (1LLU << 32) - 1;
 
 	return 0;
 }
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index 5a5116794440..2cf1ca2925c8 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -1170,7 +1170,6 @@ static void armv7pmu_init(struct arm_pmu *cpu_pmu)
 	cpu_pmu->start		= armv7pmu_start;
 	cpu_pmu->stop		= armv7pmu_stop;
 	cpu_pmu->reset		= armv7pmu_reset;
-	cpu_pmu->max_period	= (1LLU << 32) - 1;
 };
 
 static void armv7_read_num_pmnc_events(void *info)
diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c
index 88d1a76f5367..c4f029458b52 100644
--- a/arch/arm/kernel/perf_event_xscale.c
+++ b/arch/arm/kernel/perf_event_xscale.c
@@ -374,7 +374,6 @@ static int xscale1pmu_init(struct arm_pmu *cpu_pmu)
 	cpu_pmu->stop		= xscale1pmu_stop;
 	cpu_pmu->map_event	= xscale_map_event;
 	cpu_pmu->num_events	= 3;
-	cpu_pmu->max_period	= (1LLU << 32) - 1;
 
 	return 0;
 }
@@ -743,7 +742,6 @@ static int xscale2pmu_init(struct arm_pmu *cpu_pmu)
 	cpu_pmu->stop		= xscale2pmu_stop;
 	cpu_pmu->map_event	= xscale_map_event;
 	cpu_pmu->num_events	= 5;
-	cpu_pmu->max_period	= (1LLU << 32) - 1;
 
 	return 0;
 }
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 33147aacdafd..678ecffd3724 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -960,7 +960,6 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu)
 	cpu_pmu->start			= armv8pmu_start,
 	cpu_pmu->stop			= armv8pmu_stop,
 	cpu_pmu->reset			= armv8pmu_reset,
-	cpu_pmu->max_period		= (1LLU << 32) - 1,
 	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
 
 	return 0;
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index a6347d487635..6ddc00da5373 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -28,6 +28,11 @@
 static DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu);
 static DEFINE_PER_CPU(int, cpu_irq);
 
+static inline u64 arm_pmu_max_period(void)
+{
+	return (1ULL << 32) - 1;
+}
+
 static int
 armpmu_map_cache_event(const unsigned (*cache_map)
 				      [PERF_COUNT_HW_CACHE_MAX]
@@ -114,8 +119,10 @@ int armpmu_event_set_period(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	s64 left = local64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
+	u64 max_period;
 	int ret = 0;
 
+	max_period = arm_pmu_max_period();
 	if (unlikely(left <= -period)) {
 		left = period;
 		local64_set(&hwc->period_left, left);
@@ -136,8 +143,8 @@ int armpmu_event_set_period(struct perf_event *event)
 	 * effect we are reducing max_period to account for
 	 * interrupt latency (and we are being very conservative).
 	 */
-	if (left > (armpmu->max_period >> 1))
-		left = armpmu->max_period >> 1;
+	if (left > (max_period >> 1))
+		left = (max_period >> 1);
 
 	local64_set(&hwc->prev_count, (u64)-left);
 
@@ -153,6 +160,7 @@ u64 armpmu_event_update(struct perf_event *event)
 	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
 	struct hw_perf_event *hwc = &event->hw;
 	u64 delta, prev_raw_count, new_raw_count;
+	u64 max_period = arm_pmu_max_period();
 
 again:
 	prev_raw_count = local64_read(&hwc->prev_count);
@@ -162,7 +170,7 @@ again:
 			     new_raw_count) != prev_raw_count)
 		goto again;
 
-	delta = (new_raw_count - prev_raw_count) & armpmu->max_period;
+	delta = (new_raw_count - prev_raw_count) & max_period;
 
 	local64_add(delta, &event->count);
 	local64_sub(delta, &hwc->period_left);
@@ -402,7 +410,7 @@ __hw_perf_event_init(struct perf_event *event)
 		 * is far less likely to overtake the previous one unless
 		 * you have some serious IRQ latency issues.
 		 */
-		hwc->sample_period  = armpmu->max_period >> 1;
+		hwc->sample_period  = arm_pmu_max_period() >> 1;
 		hwc->last_period    = hwc->sample_period;
 		local64_set(&hwc->period_left, hwc->sample_period);
 	}
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index ad5444491975..12c30a22fc8d 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -94,7 +94,6 @@ struct arm_pmu {
 	void		(*reset)(void *);
 	int		(*map_event)(struct perf_event *event);
 	int		num_events;
-	u64		max_period;
 	bool		secure_access; /* 32-bit ARM only */
 #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40
 	DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
-- 
cgit v1.2.3


From 3a95200d3f89afd8b67f39d88d36cc7ec96ce385 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Tue, 10 Jul 2018 09:57:59 +0100
Subject: arm_pmu: Change API to support 64bit counter values

Convert the {read/write}_counter APIs to handle 64bit values
to enable supporting chained event counters. The backends still
use 32bit values and we pass them 32bit values only. So in effect
there are no functional changes.

Cc: Will Deacon <will.deacon@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Julien Thierry <julien.thierry@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm/kernel/perf_event_v6.c     | 4 ++--
 arch/arm/kernel/perf_event_v7.c     | 4 ++--
 arch/arm/kernel/perf_event_xscale.c | 8 ++++----
 arch/arm64/kernel/perf_event.c      | 9 ++++-----
 include/linux/perf/arm_pmu.h        | 4 ++--
 5 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c
index f64a6bfebcec..0729f9841ef4 100644
--- a/arch/arm/kernel/perf_event_v6.c
+++ b/arch/arm/kernel/perf_event_v6.c
@@ -233,7 +233,7 @@ armv6_pmcr_counter_has_overflowed(unsigned long pmcr,
 	return ret;
 }
 
-static inline u32 armv6pmu_read_counter(struct perf_event *event)
+static inline u64 armv6pmu_read_counter(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	int counter = hwc->idx;
@@ -251,7 +251,7 @@ static inline u32 armv6pmu_read_counter(struct perf_event *event)
 	return value;
 }
 
-static inline void armv6pmu_write_counter(struct perf_event *event, u32 value)
+static inline void armv6pmu_write_counter(struct perf_event *event, u64 value)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	int counter = hwc->idx;
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index 2cf1ca2925c8..973043dd6187 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -743,7 +743,7 @@ static inline void armv7_pmnc_select_counter(int idx)
 	isb();
 }
 
-static inline u32 armv7pmu_read_counter(struct perf_event *event)
+static inline u64 armv7pmu_read_counter(struct perf_event *event)
 {
 	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
 	struct hw_perf_event *hwc = &event->hw;
@@ -763,7 +763,7 @@ static inline u32 armv7pmu_read_counter(struct perf_event *event)
 	return value;
 }
 
-static inline void armv7pmu_write_counter(struct perf_event *event, u32 value)
+static inline void armv7pmu_write_counter(struct perf_event *event, u64 value)
 {
 	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
 	struct hw_perf_event *hwc = &event->hw;
diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c
index c4f029458b52..942230fe0426 100644
--- a/arch/arm/kernel/perf_event_xscale.c
+++ b/arch/arm/kernel/perf_event_xscale.c
@@ -316,7 +316,7 @@ static void xscale1pmu_stop(struct arm_pmu *cpu_pmu)
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static inline u32 xscale1pmu_read_counter(struct perf_event *event)
+static inline u64 xscale1pmu_read_counter(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	int counter = hwc->idx;
@@ -337,7 +337,7 @@ static inline u32 xscale1pmu_read_counter(struct perf_event *event)
 	return val;
 }
 
-static inline void xscale1pmu_write_counter(struct perf_event *event, u32 val)
+static inline void xscale1pmu_write_counter(struct perf_event *event, u64 val)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	int counter = hwc->idx;
@@ -678,7 +678,7 @@ static void xscale2pmu_stop(struct arm_pmu *cpu_pmu)
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static inline u32 xscale2pmu_read_counter(struct perf_event *event)
+static inline u64 xscale2pmu_read_counter(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	int counter = hwc->idx;
@@ -705,7 +705,7 @@ static inline u32 xscale2pmu_read_counter(struct perf_event *event)
 	return val;
 }
 
-static inline void xscale2pmu_write_counter(struct perf_event *event, u32 val)
+static inline void xscale2pmu_write_counter(struct perf_event *event, u64 val)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	int counter = hwc->idx;
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 678ecffd3724..66a2ffdca6dd 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -512,7 +512,7 @@ static inline int armv8pmu_select_counter(int idx)
 	return idx;
 }
 
-static inline u32 armv8pmu_read_counter(struct perf_event *event)
+static inline u64 armv8pmu_read_counter(struct perf_event *event)
 {
 	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
 	struct hw_perf_event *hwc = &event->hw;
@@ -530,7 +530,7 @@ static inline u32 armv8pmu_read_counter(struct perf_event *event)
 	return value;
 }
 
-static inline void armv8pmu_write_counter(struct perf_event *event, u32 value)
+static inline void armv8pmu_write_counter(struct perf_event *event, u64 value)
 {
 	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
 	struct hw_perf_event *hwc = &event->hw;
@@ -545,9 +545,8 @@ static inline void armv8pmu_write_counter(struct perf_event *event, u32 value)
 		 * count using the lower 32bits and we want an interrupt when
 		 * it overflows.
 		 */
-		u64 value64 = 0xffffffff00000000ULL | value;
-
-		write_sysreg(value64, pmccntr_el0);
+		value |= 0xffffffff00000000ULL;
+		write_sysreg(value, pmccntr_el0);
 	} else if (armv8pmu_select_counter(idx) == idx)
 		write_sysreg(value, pmxevcntr_el0);
 }
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 12c30a22fc8d..f7126a21df30 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -87,8 +87,8 @@ struct arm_pmu {
 					 struct perf_event *event);
 	int		(*set_event_filter)(struct hw_perf_event *evt,
 					    struct perf_event_attr *attr);
-	u32		(*read_counter)(struct perf_event *event);
-	void		(*write_counter)(struct perf_event *event, u32 val);
+	u64		(*read_counter)(struct perf_event *event);
+	void		(*write_counter)(struct perf_event *event, u64 val);
 	void		(*start)(struct arm_pmu *);
 	void		(*stop)(struct arm_pmu *);
 	void		(*reset)(void *);
-- 
cgit v1.2.3


From e2da97d328d4951d25f6634eda7213f7257417b6 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Tue, 10 Jul 2018 09:58:00 +0100
Subject: arm_pmu: Add support for 64bit event counters

Each PMU has a set of 32bit event counters. But in some
special cases, the events could be counted using counters
which are effectively 64bit wide.

e.g, Arm V8 PMUv3 has a 64 bit cycle counter which can count
only the CPU cycles. Also, the PMU can chain the event counters
to effectively count as a 64bit counter.

Add support for tracking the events that uses 64bit counters.
This only affects the periods set for each counter in the core
driver.

Cc: Will Deacon <will.deacon@arm.com>
Reviewed-by: Julien Thierry <julien.thierry@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/perf/arm_pmu.c       | 16 ++++++++++------
 include/linux/perf/arm_pmu.h |  6 ++++++
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 6ddc00da5373..8cad6b535a2c 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -28,9 +28,12 @@
 static DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu);
 static DEFINE_PER_CPU(int, cpu_irq);
 
-static inline u64 arm_pmu_max_period(void)
+static inline u64 arm_pmu_event_max_period(struct perf_event *event)
 {
-	return (1ULL << 32) - 1;
+	if (event->hw.flags & ARMPMU_EVT_64BIT)
+		return GENMASK_ULL(63, 0);
+	else
+		return GENMASK_ULL(31, 0);
 }
 
 static int
@@ -122,7 +125,7 @@ int armpmu_event_set_period(struct perf_event *event)
 	u64 max_period;
 	int ret = 0;
 
-	max_period = arm_pmu_max_period();
+	max_period = arm_pmu_event_max_period(event);
 	if (unlikely(left <= -period)) {
 		left = period;
 		local64_set(&hwc->period_left, left);
@@ -148,7 +151,7 @@ int armpmu_event_set_period(struct perf_event *event)
 
 	local64_set(&hwc->prev_count, (u64)-left);
 
-	armpmu->write_counter(event, (u64)(-left) & 0xffffffff);
+	armpmu->write_counter(event, (u64)(-left) & max_period);
 
 	perf_event_update_userpage(event);
 
@@ -160,7 +163,7 @@ u64 armpmu_event_update(struct perf_event *event)
 	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
 	struct hw_perf_event *hwc = &event->hw;
 	u64 delta, prev_raw_count, new_raw_count;
-	u64 max_period = arm_pmu_max_period();
+	u64 max_period = arm_pmu_event_max_period(event);
 
 again:
 	prev_raw_count = local64_read(&hwc->prev_count);
@@ -368,6 +371,7 @@ __hw_perf_event_init(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	int mapping;
 
+	hwc->flags = 0;
 	mapping = armpmu->map_event(event);
 
 	if (mapping < 0) {
@@ -410,7 +414,7 @@ __hw_perf_event_init(struct perf_event *event)
 		 * is far less likely to overtake the previous one unless
 		 * you have some serious IRQ latency issues.
 		 */
-		hwc->sample_period  = arm_pmu_max_period() >> 1;
+		hwc->sample_period  = arm_pmu_event_max_period(event) >> 1;
 		hwc->last_period    = hwc->sample_period;
 		local64_set(&hwc->period_left, hwc->sample_period);
 	}
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index f7126a21df30..10f92e1d8e7b 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -25,6 +25,12 @@
  */
 #define ARMPMU_MAX_HWEVENTS		32
 
+/*
+ * ARM PMU hw_event flags
+ */
+/* Event uses a 64bit counter */
+#define ARMPMU_EVT_64BIT		1
+
 #define HW_OP_UNSUPPORTED		0xFFFF
 #define C(_x)				PERF_COUNT_HW_CACHE_##_x
 #define CACHE_OP_UNSUPPORTED		0xFFFF
-- 
cgit v1.2.3


From 14b470b56840dbb093abd71f214e9d63770c87b8 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 6 Jul 2018 22:19:07 +0200
Subject: scsi: target: sbitmap: add seq_file forward declaration

The target core runs into a warning in the linux/sbitmap.h
file in some configurations:

In file included from include/target/target_core_base.h:7,
                 from drivers/target/target_core_fabric_lib.c:41:
include/linux/sbitmap.h:331:46: error: 'struct seq_file' declared inside parameter list will not be visible outside of this definition or declaration [-Werror]
 void sbitmap_show(struct sbitmap *sb, struct seq_file *m);
                                              ^~~~~~~~

In general, headers should not depend on others being included first,
so this fixes it with a forward declaration for that struct name, but
we probably want to merge the patch through the scsi tree to help
bisection.

Fixes: 10e9cbb6b531 ("scsi: target: Convert target drivers to use sbitmap")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/sbitmap.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index e6539536dea9..804a50983ec5 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -23,6 +23,8 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 
+struct seq_file;
+
 /**
  * struct sbitmap_word - Word in a &struct sbitmap.
  */
-- 
cgit v1.2.3


From b60a60405fb95a688eb2ef4ef20f5fcaa7b64f68 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Fri, 6 Jul 2018 17:37:19 +0200
Subject: netfilter: Add nf_ct_get_tuple_skb global lookup function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds a global netfilter function to extract a conntrack tuple from an
skb. The function uses a new function added to nf_ct_hook, which will try
to get the tuple from skb->_nfct, and do a full lookup if that fails. This
makes it possible to use the lookup function before the skb has passed
through the conntrack init hooks (e.g., in an ingress qdisc). The tuple is
copied to the caller to avoid issues with reference counting.

The function returns false if conntrack is not loaded, allowing it to be
used without incurring a module dependency on conntrack. This is used by
the NAT mode in sch_cake.

Cc: netfilter-devel@vger.kernel.org
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h         | 11 +++++++++++
 net/netfilter/core.c              | 15 +++++++++++++++
 net/netfilter/nf_conntrack_core.c | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 23b48de8c2e2..07efffd0c759 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -414,8 +414,17 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 
 extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu;
 void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
+struct nf_conntrack_tuple;
+bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+			 const struct sk_buff *skb);
 #else
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
+struct nf_conntrack_tuple;
+static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+				       const struct sk_buff *skb)
+{
+	return false;
+}
 #endif
 
 struct nf_conn;
@@ -424,6 +433,8 @@ enum ip_conntrack_info;
 struct nf_ct_hook {
 	int (*update)(struct net *net, struct sk_buff *skb);
 	void (*destroy)(struct nf_conntrack *);
+	bool (*get_tuple_skb)(struct nf_conntrack_tuple *,
+			      const struct sk_buff *);
 };
 extern struct nf_ct_hook __rcu *nf_ct_hook;
 
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 168af54db975..dc240cb47ddf 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -603,6 +603,21 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct)
 }
 EXPORT_SYMBOL(nf_conntrack_destroy);
 
+bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+			 const struct sk_buff *skb)
+{
+	struct nf_ct_hook *ct_hook;
+	bool ret = false;
+
+	rcu_read_lock();
+	ct_hook = rcu_dereference(nf_ct_hook);
+	if (ct_hook)
+		ret = ct_hook->get_tuple_skb(dst_tuple, skb);
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(nf_ct_get_tuple_skb);
+
 /* Built-in default zone used e.g. by modules. */
 const struct nf_conntrack_zone nf_ct_zone_dflt = {
 	.id	= NF_CT_DEFAULT_ZONE_ID,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3465da2a98bd..85ab2fd6a665 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1683,6 +1683,41 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
 	return 0;
 }
 
+static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+				       const struct sk_buff *skb)
+{
+	const struct nf_conntrack_tuple *src_tuple;
+	const struct nf_conntrack_tuple_hash *hash;
+	struct nf_conntrack_tuple srctuple;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct) {
+		src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
+		memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
+		return true;
+	}
+
+	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+			       NFPROTO_IPV4, dev_net(skb->dev),
+			       &srctuple))
+		return false;
+
+	hash = nf_conntrack_find_get(dev_net(skb->dev),
+				     &nf_ct_zone_dflt,
+				     &srctuple);
+	if (!hash)
+		return false;
+
+	ct = nf_ct_tuplehash_to_ctrack(hash);
+	src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
+	memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
+	nf_ct_put(ct);
+
+	return true;
+}
+
 /* Bring out ya dead! */
 static struct nf_conn *
 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
@@ -2204,6 +2239,7 @@ err_cachep:
 static struct nf_ct_hook nf_conntrack_hook = {
 	.update		= nf_conntrack_update,
 	.destroy	= destroy_conntrack,
+	.get_tuple_skb  = nf_conntrack_get_tuple_skb,
 };
 
 void nf_conntrack_init_end(void)
-- 
cgit v1.2.3


From 19f391eb05b8b005f2907ddc8f284487b446abf3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 Jun 2018 11:19:32 -0400
Subject: turn filp_clone_open() into inline wrapper for dentry_open()

it's exactly the same thing as
	dentry_open(&file->f_path, file->f_flags, file->f_cred)

... and rename it to file_clone_open(), while we are at it.
'filp' naming convention is bogus; sure, it's "file pointer",
but we generally don't do that kind of Hungarian notation.
Some of the instances have too many callers to touch, but this
one has only two, so let's sanitize it while we can...

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/gpu/drm/drm_lease.c |  2 +-
 fs/binfmt_misc.c            |  2 +-
 fs/open.c                   | 20 --------------------
 include/linux/fs.h          |  5 ++++-
 4 files changed, 6 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/drm_lease.c b/drivers/gpu/drm/drm_lease.c
index d638c0fb3418..b54fb78a283c 100644
--- a/drivers/gpu/drm/drm_lease.c
+++ b/drivers/gpu/drm/drm_lease.c
@@ -553,7 +553,7 @@ int drm_mode_create_lease_ioctl(struct drm_device *dev,
 
 	/* Clone the lessor file to create a new file for us */
 	DRM_DEBUG_LEASE("Allocating lease file\n");
-	lessee_file = filp_clone_open(lessor_file);
+	lessee_file = file_clone_open(lessor_file);
 	if (IS_ERR(lessee_file)) {
 		ret = PTR_ERR(lessee_file);
 		goto out_lessee;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 4b5fff31ef27..aa4a7a23ff99 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -205,7 +205,7 @@ static int load_misc_binary(struct linux_binprm *bprm)
 		goto error;
 
 	if (fmt->flags & MISC_FMT_OPEN_FILE) {
-		interp_file = filp_clone_open(fmt->interp_file);
+		interp_file = file_clone_open(fmt->interp_file);
 		if (!IS_ERR(interp_file))
 			deny_write_access(interp_file);
 	} else {
diff --git a/fs/open.c b/fs/open.c
index d0e955b558ad..76c56966e297 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1063,26 +1063,6 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 }
 EXPORT_SYMBOL(file_open_root);
 
-struct file *filp_clone_open(struct file *oldfile)
-{
-	struct file *file;
-	int retval;
-
-	file = get_empty_filp();
-	if (IS_ERR(file))
-		return file;
-
-	file->f_flags = oldfile->f_flags;
-	retval = vfs_open(&oldfile->f_path, file, oldfile->f_cred);
-	if (retval) {
-		put_filp(file);
-		return ERR_PTR(retval);
-	}
-
-	return file;
-}
-EXPORT_SYMBOL(filp_clone_open);
-
 long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
 {
 	struct open_flags op;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index aa9b4c169ed2..c4ca4c9c1130 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2422,7 +2422,10 @@ extern struct file *filp_open(const char *, int, umode_t);
 extern struct file *file_open_root(struct dentry *, struct vfsmount *,
 				   const char *, int, umode_t);
 extern struct file * dentry_open(const struct path *, int, const struct cred *);
-extern struct file *filp_clone_open(struct file *);
+static inline struct file *file_clone_open(struct file *file)
+{
+	return dentry_open(&file->f_path, file->f_flags, file->f_cred);
+}
 extern int filp_close(struct file *, fl_owner_t id);
 
 extern struct filename *getname_flags(const char __user *, int, int *);
-- 
cgit v1.2.3


From 9dc55f1389f9569acf9659e58dd836a9c70df217 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 Jul 2018 22:26:05 -0700
Subject: iomap: add support for sub-pagesize buffered I/O without buffer heads

After already supporting a simple implementation of buffered writes for
the blocksize == PAGE_SIZE case in the last commit this adds full support
even for smaller block sizes.   There are three bits of per-block
information in the buffer_head structure that really matter for the iomap
read and write path:

 - uptodate status (BH_uptodate)
 - marked as currently under read I/O (BH_Async_Read)
 - marked as currently under write I/O (BH_Async_Write)

Instead of having new per-block structures this now adds a per-page
structure called struct iomap_page to track this information in a slightly
different form:

 - a bitmap for the per-block uptodate status.  For worst case of a 64k
   page size system this bitmap needs to contain 128 bits.  For the
   typical 4k page size case it only needs 8 bits, although we still
   need a full unsigned long due to the way the atomic bitmap API works.
 - two atomic_t counters are used to track the outstanding read and write
   counts

There is quite a bit of boilerplate code as the buffered I/O path uses
various helper methods, but the actual code is very straight forward.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c            | 280 ++++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/iomap.h |  31 ++++++
 2 files changed, 290 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/iomap.c b/fs/iomap.c
index 13cdcf33e6c0..07501a647d13 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -17,6 +17,7 @@
 #include <linux/iomap.h>
 #include <linux/uaccess.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
 #include <linux/mm.h>
 #include <linux/mm_inline.h>
 #include <linux/swap.h>
@@ -104,6 +105,138 @@ iomap_sector(struct iomap *iomap, loff_t pos)
 	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
 }
 
+static struct iomap_page *
+iomap_page_create(struct inode *inode, struct page *page)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (iop || i_blocksize(inode) == PAGE_SIZE)
+		return iop;
+
+	iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
+	atomic_set(&iop->read_count, 0);
+	atomic_set(&iop->write_count, 0);
+	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+	set_page_private(page, (unsigned long)iop);
+	SetPagePrivate(page);
+	return iop;
+}
+
+static void
+iomap_page_release(struct page *page)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (!iop)
+		return;
+	WARN_ON_ONCE(atomic_read(&iop->read_count));
+	WARN_ON_ONCE(atomic_read(&iop->write_count));
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	kfree(iop);
+}
+
+/*
+ * Calculate the range inside the page that we actually need to read.
+ */
+static void
+iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
+		loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
+{
+	unsigned block_bits = inode->i_blkbits;
+	unsigned block_size = (1 << block_bits);
+	unsigned poff = *pos & (PAGE_SIZE - 1);
+	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+	unsigned first = poff >> block_bits;
+	unsigned last = (poff + plen - 1) >> block_bits;
+	unsigned end = (i_size_read(inode) & (PAGE_SIZE - 1)) >> block_bits;
+
+	/*
+	 * If the block size is smaller than the page size we need to check the
+	 * per-block uptodate status and adjust the offset and length if needed
+	 * to avoid reading in already uptodate ranges.
+	 */
+	if (iop) {
+		unsigned int i;
+
+		/* move forward for each leading block marked uptodate */
+		for (i = first; i <= last; i++) {
+			if (!test_bit(i, iop->uptodate))
+				break;
+			*pos += block_size;
+			poff += block_size;
+			plen -= block_size;
+			first++;
+		}
+
+		/* truncate len if we find any trailing uptodate block(s) */
+		for ( ; i <= last; i++) {
+			if (test_bit(i, iop->uptodate)) {
+				plen -= (last - i + 1) * block_size;
+				last = i - 1;
+				break;
+			}
+		}
+	}
+
+	/*
+	 * If the extent spans the block that contains the i_size we need to
+	 * handle both halves separately so that we properly zero data in the
+	 * page cache for blocks that are entirely outside of i_size.
+	 */
+	if (first <= end && last > end)
+		plen -= (last - end) * block_size;
+
+	*offp = poff;
+	*lenp = plen;
+}
+
+static void
+iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+	struct inode *inode = page->mapping->host;
+	unsigned first = off >> inode->i_blkbits;
+	unsigned last = (off + len - 1) >> inode->i_blkbits;
+	unsigned int i;
+	bool uptodate = true;
+
+	if (iop) {
+		for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
+			if (i >= first && i <= last)
+				set_bit(i, iop->uptodate);
+			else if (!test_bit(i, iop->uptodate))
+				uptodate = false;
+		}
+	}
+
+	if (uptodate && !PageError(page))
+		SetPageUptodate(page);
+}
+
+static void
+iomap_read_finish(struct iomap_page *iop, struct page *page)
+{
+	if (!iop || atomic_dec_and_test(&iop->read_count))
+		unlock_page(page);
+}
+
+static void
+iomap_read_page_end_io(struct bio_vec *bvec, int error)
+{
+	struct page *page = bvec->bv_page;
+	struct iomap_page *iop = to_iomap_page(page);
+
+	if (unlikely(error)) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
+	}
+
+	iomap_read_finish(iop, page);
+}
+
 static void
 iomap_read_inline_data(struct inode *inode, struct page *page,
 		struct iomap *iomap)
@@ -132,7 +265,7 @@ iomap_read_end_io(struct bio *bio)
 	int i;
 
 	bio_for_each_segment_all(bvec, bio, i)
-		page_endio(bvec->bv_page, false, error);
+		iomap_read_page_end_io(bvec, error);
 	bio_put(bio);
 }
 
@@ -150,9 +283,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 {
 	struct iomap_readpage_ctx *ctx = data;
 	struct page *page = ctx->cur_page;
-	unsigned poff = pos & (PAGE_SIZE - 1);
-	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+	struct iomap_page *iop = iomap_page_create(inode, page);
 	bool is_contig = false;
+	loff_t orig_pos = pos;
+	unsigned poff, plen;
 	sector_t sector;
 
 	if (iomap->type == IOMAP_INLINE) {
@@ -161,13 +295,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		return PAGE_SIZE;
 	}
 
-	/* we don't support blocksize < PAGE_SIZE quite yet. */
-	WARN_ON_ONCE(pos != page_offset(page));
-	WARN_ON_ONCE(plen != PAGE_SIZE);
+	/* zero post-eof blocks as the page may be mapped */
+	iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
+	if (plen == 0)
+		goto done;
 
 	if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
 		zero_user(page, poff, plen);
-		SetPageUptodate(page);
+		iomap_set_range_uptodate(page, poff, plen);
 		goto done;
 	}
 
@@ -183,6 +318,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		is_contig = true;
 	}
 
+	/*
+	 * If we start a new segment we need to increase the read count, and we
+	 * need to do so before submitting any previous full bio to make sure
+	 * that we don't prematurely unlock the page.
+	 */
+	if (iop)
+		atomic_inc(&iop->read_count);
+
 	if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
 		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
 		int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -203,7 +346,13 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 	__bio_add_page(ctx->bio, page, plen, poff);
 done:
-	return plen;
+	/*
+	 * Move the caller beyond our range so that it keeps making progress.
+	 * For that we have to include any leading non-uptodate ranges, but
+	 * we can skip trailing ones as they will be handled in the next
+	 * iteration.
+	 */
+	return pos - orig_pos + plen;
 }
 
 int
@@ -214,8 +363,6 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
 	unsigned poff;
 	loff_t ret;
 
-	WARN_ON_ONCE(page_has_buffers(page));
-
 	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
 		ret = iomap_apply(inode, page_offset(page) + poff,
 				PAGE_SIZE - poff, 0, ops, &ctx,
@@ -341,6 +488,84 @@ done:
 }
 EXPORT_SYMBOL_GPL(iomap_readpages);
 
+int
+iomap_is_partially_uptodate(struct page *page, unsigned long from,
+		unsigned long count)
+{
+	struct iomap_page *iop = to_iomap_page(page);
+	struct inode *inode = page->mapping->host;
+	unsigned first = from >> inode->i_blkbits;
+	unsigned last = (from + count - 1) >> inode->i_blkbits;
+	unsigned i;
+
+	if (iop) {
+		for (i = first; i <= last; i++)
+			if (!test_bit(i, iop->uptodate))
+				return 0;
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
+
+int
+iomap_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	/*
+	 * mm accommodates an old ext3 case where clean pages might not have had
+	 * the dirty bit cleared. Thus, it can send actual dirty pages to
+	 * ->releasepage() via shrink_active_list(), skip those here.
+	 */
+	if (PageDirty(page) || PageWriteback(page))
+		return 0;
+	iomap_page_release(page);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(iomap_releasepage);
+
+void
+iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
+{
+	/*
+	 * If we are invalidating the entire page, clear the dirty state from it
+	 * and release it to avoid unnecessary buildup of the LRU.
+	 */
+	if (offset == 0 && len == PAGE_SIZE) {
+		WARN_ON_ONCE(PageWriteback(page));
+		cancel_dirty_page(page);
+		iomap_page_release(page);
+	}
+}
+EXPORT_SYMBOL_GPL(iomap_invalidatepage);
+
+#ifdef CONFIG_MIGRATION
+int
+iomap_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode)
+{
+	int ret;
+
+	ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+	if (ret != MIGRATEPAGE_SUCCESS)
+		return ret;
+
+	if (page_has_private(page)) {
+		ClearPagePrivate(page);
+		set_page_private(newpage, page_private(page));
+		set_page_private(page, 0);
+		SetPagePrivate(newpage);
+	}
+
+	if (mode != MIGRATE_SYNC_NO_COPY)
+		migrate_page_copy(newpage, page);
+	else
+		migrate_page_states(newpage, page);
+	return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(iomap_migrate_page);
+#endif /* CONFIG_MIGRATION */
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -364,6 +589,7 @@ iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
 
 	if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
 		zero_user_segments(page, poff, from, to, poff + plen);
+		iomap_set_range_uptodate(page, poff, plen);
 		return 0;
 	}
 
@@ -379,21 +605,33 @@ static int
 __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
 		struct page *page, struct iomap *iomap)
 {
+	struct iomap_page *iop = iomap_page_create(inode, page);
 	loff_t block_size = i_blocksize(inode);
 	loff_t block_start = pos & ~(block_size - 1);
 	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
-	unsigned poff = block_start & (PAGE_SIZE - 1);
-	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start);
-	unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
-
-	WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+	unsigned from = pos & (PAGE_SIZE - 1), to = from + len, poff, plen;
+	int status = 0;
 
 	if (PageUptodate(page))
 		return 0;
-	if (from <= poff && to >= poff + plen)
-		return 0;
-	return iomap_read_page_sync(inode, block_start, page,
-			poff, plen, from, to, iomap);
+
+	do {
+		iomap_adjust_read_range(inode, iop, &block_start,
+				block_end - block_start, &poff, &plen);
+		if (plen == 0)
+			break;
+
+		if ((from > poff && from < poff + plen) ||
+		    (to > poff && to < poff + plen)) {
+			status = iomap_read_page_sync(inode, block_start, page,
+					poff, plen, from, to, iomap);
+			if (status)
+				break;
+		}
+
+	} while ((block_start += plen) < block_end);
+
+	return status;
 }
 
 static int
@@ -476,7 +714,7 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 	if (unlikely(copied < len && !PageUptodate(page))) {
 		copied = 0;
 	} else {
-		SetPageUptodate(page);
+		iomap_set_range_uptodate(page, pos & (PAGE_SIZE - 1), len);
 		iomap_set_page_dirty(page);
 	}
 	return __generic_write_end(inode, pos, copied, page);
@@ -812,7 +1050,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 		block_commit_write(page, 0, length);
 	} else {
 		WARN_ON_ONCE(!PageUptodate(page));
-		WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+		iomap_page_create(inode, page);
 	}
 
 	return length;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 5eb9ca8d7ce5..3555d54bf79a 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -2,6 +2,9 @@
 #ifndef LINUX_IOMAP_H
 #define LINUX_IOMAP_H 1
 
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/mm.h>
 #include <linux/types.h>
 
 struct address_space;
@@ -98,12 +101,40 @@ struct iomap_ops {
 			ssize_t written, unsigned flags, struct iomap *iomap);
 };
 
+/*
+ * Structure allocate for each page when block size < PAGE_SIZE to track
+ * sub-page uptodate status and I/O completions.
+ */
+struct iomap_page {
+	atomic_t		read_count;
+	atomic_t		write_count;
+	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+};
+
+static inline struct iomap_page *to_iomap_page(struct page *page)
+{
+	if (page_has_private(page))
+		return (struct iomap_page *)page_private(page);
+	return NULL;
+}
+
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops);
 int iomap_readpage(struct page *page, const struct iomap_ops *ops);
 int iomap_readpages(struct address_space *mapping, struct list_head *pages,
 		unsigned nr_pages, const struct iomap_ops *ops);
 int iomap_set_page_dirty(struct page *page);
+int iomap_is_partially_uptodate(struct page *page, unsigned long from,
+		unsigned long count);
+int iomap_releasepage(struct page *page, gfp_t gfp_mask);
+void iomap_invalidatepage(struct page *page, unsigned int offset,
+		unsigned int len);
+#ifdef CONFIG_MIGRATION
+int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode);
+#else
+#define iomap_migrate_page NULL
+#endif
 int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
-- 
cgit v1.2.3


From 3443b00e07eed5798605ba6524de37e1d3f1a4bf Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Tue, 10 Jul 2018 10:02:57 +0300
Subject: team: Publish team_port_get_rcu()

A follow-up patch adds a new entry point, team_port_dev_txable(). Making
it an ordinary exported function would mean that any module that may
need the service in one of the supported configurations also
unconditionally needs to pull in the team module, whether or not the
user actually intends to create team interfaces.

To prevent that, team_port_dev_txable() is defined in if_team.h, and
therefore all dependencies of that function also need to be
publicly-visible.

Therefore move team_port_get_rcu() from team.c to if_team.h.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/team/team.c | 5 -----
 include/linux/if_team.h | 5 +++++
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 3a95eaae0c98..6a047d30e8c6 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -41,11 +41,6 @@
 
 #define team_port_exists(dev) (dev->priv_flags & IFF_TEAM_PORT)
 
-static struct team_port *team_port_get_rcu(const struct net_device *dev)
-{
-	return rcu_dereference(dev->rx_handler_data);
-}
-
 static struct team_port *team_port_get_rtnl(const struct net_device *dev)
 {
 	struct team_port *port = rtnl_dereference(dev->rx_handler_data);
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index d95cae09dea0..0d07c6655cce 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -74,6 +74,11 @@ struct team_port {
 	long mode_priv[0];
 };
 
+static inline struct team_port *team_port_get_rcu(const struct net_device *dev)
+{
+	return rcu_dereference(dev->rx_handler_data);
+}
+
 static inline bool team_port_enabled(struct team_port *port)
 {
 	return port->index != -1;
-- 
cgit v1.2.3


From eeed992b776c54af6108187c87ac60d028e69d37 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Tue, 10 Jul 2018 10:02:58 +0300
Subject: net: Add lag.h, net_lag_port_dev_txable()

LAG devices (team or bond) recognize for each one of their slave devices
whether LAG traffic is going to be sent through that device. Bond calls
such devices "active", team calls them "txable". When this state
changes, a NETDEV_CHANGELOWERSTATE notification is distributed, together
with a netdev_notifier_changelowerstate_info structure that for LAG
devices includes a tx_enabled flag that refers to the new state. The
notification thus makes it possible to react to the changes in txability
in drivers.

However there's no way to query txability from the outside on demand.
That is problematic namely for mlxsw, which when resolving ERSPAN packet
path, may encounter a LAG device, and needs to determine which of the
slaves it should choose.

To that end, introduce a new function, net_lag_port_dev_txable(), which
determines whether a given slave device is "active" or
"txable" (depending on the flavor of the LAG device). That function then
dispatches to per-LAG-flavor helpers, bond_is_active_slave_dev() resp.
team_port_dev_txable().

Because there currently is no good place where net_lag_port_dev_txable()
should be added, introduce a new header file, lag.h, which should from
now on hold any logic common to both team and bond. (But keep
netif_is_lag_master() together with the rest of netif_is_*_master()
functions).

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_team.h | 13 +++++++++++++
 include/net/bonding.h   | 13 +++++++++++++
 include/net/lag.h       | 17 +++++++++++++++++
 3 files changed, 43 insertions(+)
 create mode 100644 include/net/lag.h

(limited to 'include/linux')

diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index 0d07c6655cce..ac42da56f7a2 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -89,6 +89,19 @@ static inline bool team_port_txable(struct team_port *port)
 	return port->linkup && team_port_enabled(port);
 }
 
+static inline bool team_port_dev_txable(const struct net_device *port_dev)
+{
+	struct team_port *port;
+	bool txable;
+
+	rcu_read_lock();
+	port = team_port_get_rcu(port_dev);
+	txable = port ? team_port_txable(port) : false;
+	rcu_read_unlock();
+
+	return txable;
+}
+
 #ifdef CONFIG_NET_POLL_CONTROLLER
 static inline void team_netpoll_send_skb(struct team_port *port,
 					 struct sk_buff *skb)
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 808f1d167349..a2d058170ea3 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -411,6 +411,19 @@ static inline bool bond_slave_can_tx(struct slave *slave)
 	       bond_is_active_slave(slave);
 }
 
+static inline bool bond_is_active_slave_dev(const struct net_device *slave_dev)
+{
+	struct slave *slave;
+	bool active;
+
+	rcu_read_lock();
+	slave = bond_slave_get_rcu(slave_dev);
+	active = bond_is_active_slave(slave);
+	rcu_read_unlock();
+
+	return active;
+}
+
 static inline void bond_hw_addr_copy(u8 *dst, const u8 *src, unsigned int len)
 {
 	if (len == ETH_ALEN) {
diff --git a/include/net/lag.h b/include/net/lag.h
new file mode 100644
index 000000000000..95b880e6fdde
--- /dev/null
+++ b/include/net/lag.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IF_LAG_H
+#define _LINUX_IF_LAG_H
+
+#include <linux/netdevice.h>
+#include <linux/if_team.h>
+#include <net/bonding.h>
+
+static inline bool net_lag_port_dev_txable(const struct net_device *port_dev)
+{
+	if (netif_is_team_port(port_dev))
+		return team_port_dev_txable(port_dev);
+	else
+		return bond_is_active_slave_dev(port_dev);
+}
+
+#endif /* _LINUX_IF_LAG_H */
-- 
cgit v1.2.3


From bf1c77b4644f46d2986b7ca5e43e012f0fc8984b Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 11 Jul 2018 14:56:50 +0100
Subject: kernel: add ksys_personality()

Using this helper allows us to avoid the in-kernel call to the
sys_personality() syscall. The ksys_ prefix denotes that this function
is meant as a drop-in replacement for the syscall. In particular, it
uses the same calling convention as sys_personality().

Since ksys_personality is trivial, it is implemented directly in
<linux/syscalls.h>, as we do for ksys_close() and friends.

This helper is necessary to enable conversion of arm64's syscall
handling to use pt_regs wrappers.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Martin <dave.martin@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/linux/syscalls.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a368a68cb667..abfe12d8a9c5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -80,6 +80,7 @@ union bpf_attr;
 #include <linux/unistd.h>
 #include <linux/quota.h>
 #include <linux/key.h>
+#include <linux/personality.h>
 #include <trace/syscall.h>
 
 #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
@@ -1281,4 +1282,14 @@ static inline long ksys_truncate(const char __user *pathname, loff_t length)
 	return do_sys_truncate(pathname, length);
 }
 
+static inline unsigned int ksys_personality(unsigned int personality)
+{
+	unsigned int old = current->personality;
+
+	if (personality != 0xffffffff)
+		set_personality(personality);
+
+	return old;
+}
+
 #endif
-- 
cgit v1.2.3


From 9b54bf9d6a5b30e2cc22b18793e9a4158c5b4882 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 11 Jul 2018 14:56:51 +0100
Subject: kernel: add kcompat_sys_{f,}statfs64()

Using this helper allows us to avoid the in-kernel calls to the
compat_sys_{f,}statfs64() sycalls, as are necessary for parameter
mangling in arm64's compat handling.

Following the example of ksys_* functions, kcompat_sys_* functions are
intended to be a drop-in replacement for their compat_sys_*
counterparts, with the same calling convention.

This is necessary to enable conversion of arm64's syscall handling to
use pt_regs wrappers.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 fs/statfs.c            | 14 ++++++++++++--
 include/linux/compat.h | 11 +++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/statfs.c b/fs/statfs.c
index 5b2a24f0f263..f0216629621d 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -335,7 +335,7 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
 	return 0;
 }
 
-COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
+int kcompat_sys_statfs64(const char __user * pathname, compat_size_t sz, struct compat_statfs64 __user * buf)
 {
 	struct kstatfs tmp;
 	int error;
@@ -349,7 +349,12 @@ COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, s
 	return error;
 }
 
-COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
+COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
+{
+	return kcompat_sys_statfs64(pathname, sz, buf);
+}
+
+int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user * buf)
 {
 	struct kstatfs tmp;
 	int error;
@@ -363,6 +368,11 @@ COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct co
 	return error;
 }
 
+COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
+{
+	return kcompat_sys_fstatfs64(fd, sz, buf);
+}
+
 /*
  * This is a copy of sys_ustat, just dealing with a structure layout.
  * Given how simple this syscall is that apporach is more maintainable
diff --git a/include/linux/compat.h b/include/linux/compat.h
index c68acc47da57..43f4ed44c5d5 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -1028,6 +1028,17 @@ static inline struct compat_timeval ns_to_compat_timeval(s64 nsec)
 	return ctv;
 }
 
+/*
+ * Kernel code should not call compat syscalls (i.e., compat_sys_xyzyyz())
+ * directly.  Instead, use one of the functions which work equivalently, such
+ * as the kcompat_sys_xyzyyz() functions prototyped below.
+ */
+
+int kcompat_sys_statfs64(const char __user * pathname, compat_size_t sz,
+		     struct compat_statfs64 __user * buf);
+int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz,
+			  struct compat_statfs64 __user * buf);
+
 #else /* !CONFIG_COMPAT */
 
 #define is_compat_task() (0)
-- 
cgit v1.2.3


From c9c554f21490bbc96cc554f80024d27d09670480 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 11 Jul 2018 14:19:04 -0400
Subject: alloc_file(): switch to passing O_... flags instead of FMODE_... mode

... so that it could set both ->f_flags and ->f_mode, without callers
having to set ->f_flags manually.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/misc/cxl/api.c          |  3 +--
 drivers/scsi/cxlflash/ocxl_hw.c |  3 +--
 fs/aio.c                        |  8 ++------
 fs/anon_inodes.c                |  3 +--
 fs/file_table.c                 | 17 +++++++++--------
 fs/hugetlbfs/inode.c            |  3 +--
 fs/pipe.c                       |  8 ++++----
 include/linux/file.h            |  2 +-
 ipc/shm.c                       |  8 ++++----
 mm/memfd.c                      |  2 +-
 mm/shmem.c                      |  3 +--
 net/socket.c                    |  3 +--
 12 files changed, 27 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index 6b16946f9b05..0b5cb6cf91a0 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -102,12 +102,11 @@ static struct file *cxl_getfile(const char *name,
 	path.mnt = mntget(cxl_vfs_mount);
 	d_instantiate(path.dentry, inode);
 
-	file = alloc_file(&path, OPEN_FMODE(flags), fops);
+	file = alloc_file(&path, flags & (O_ACCMODE | O_NONBLOCK), fops);
 	if (IS_ERR(file)) {
 		path_put(&path);
 		goto err_fs;
 	}
-	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
 	file->private_data = priv;
 
 	return file;
diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c
index 497a68389461..99bb393a8a34 100644
--- a/drivers/scsi/cxlflash/ocxl_hw.c
+++ b/drivers/scsi/cxlflash/ocxl_hw.c
@@ -129,7 +129,7 @@ static struct file *ocxlflash_getfile(struct device *dev, const char *name,
 	path.mnt = mntget(ocxlflash_vfs_mount);
 	d_instantiate(path.dentry, inode);
 
-	file = alloc_file(&path, OPEN_FMODE(flags), fops);
+	file = alloc_file(&path, flags & (O_ACCMODE | O_NONBLOCK), fops);
 	if (IS_ERR(file)) {
 		rc = PTR_ERR(file);
 		dev_err(dev, "%s: alloc_file failed rc=%d\n",
@@ -138,7 +138,6 @@ static struct file *ocxlflash_getfile(struct device *dev, const char *name,
 		goto err3;
 	}
 
-	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
 	file->private_data = priv;
 out:
 	return file;
diff --git a/fs/aio.c b/fs/aio.c
index e1d20124ec0e..9eea53887d6c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -234,13 +234,9 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 	path.mnt = mntget(aio_mnt);
 
 	d_instantiate(path.dentry, inode);
-	file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops);
-	if (IS_ERR(file)) {
+	file = alloc_file(&path, O_RDWR, &aio_ring_fops);
+	if (IS_ERR(file))
 		path_put(&path);
-		return file;
-	}
-
-	file->f_flags = O_RDWR;
 	return file;
 }
 
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 3168ee4e77f4..6b235ab1df6c 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -102,12 +102,11 @@ struct file *anon_inode_getfile(const char *name,
 
 	d_instantiate(path.dentry, anon_inode_inode);
 
-	file = alloc_file(&path, OPEN_FMODE(flags), fops);
+	file = alloc_file(&path, flags & (O_ACCMODE | O_NONBLOCK), fops);
 	if (IS_ERR(file))
 		goto err_dput;
 	file->f_mapping = anon_inode_inode->i_mapping;
 
-	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
 	file->private_data = priv;
 
 	return file;
diff --git a/fs/file_table.c b/fs/file_table.c
index eee7cf629e52..086c3f5ec31a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -153,10 +153,10 @@ over:
  * alloc_file - allocate and initialize a 'struct file'
  *
  * @path: the (dentry, vfsmount) pair for the new file
- * @mode: the mode with which the new file will be opened
+ * @flags: O_... flags with which the new file will be opened
  * @fop: the 'struct file_operations' for the new file
  */
-struct file *alloc_file(const struct path *path, fmode_t mode,
+struct file *alloc_file(const struct path *path, int flags,
 		const struct file_operations *fop)
 {
 	struct file *file;
@@ -165,19 +165,20 @@ struct file *alloc_file(const struct path *path, fmode_t mode,
 	if (IS_ERR(file))
 		return file;
 
+	file->f_mode = OPEN_FMODE(flags);
+	file->f_flags = flags;
 	file->f_path = *path;
 	file->f_inode = path->dentry->d_inode;
 	file->f_mapping = path->dentry->d_inode->i_mapping;
 	file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
-	if ((mode & FMODE_READ) &&
+	if ((file->f_mode & FMODE_READ) &&
 	     likely(fop->read || fop->read_iter))
-		mode |= FMODE_CAN_READ;
-	if ((mode & FMODE_WRITE) &&
+		file->f_mode |= FMODE_CAN_READ;
+	if ((file->f_mode & FMODE_WRITE) &&
 	     likely(fop->write || fop->write_iter))
-		mode |= FMODE_CAN_WRITE;
-	file->f_mode = mode;
+		file->f_mode |= FMODE_CAN_WRITE;
 	file->f_op = fop;
-	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 		i_readcount_inc(path->dentry->d_inode);
 	return file;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d508c7844681..71aed47422e2 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1375,8 +1375,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
 	inode->i_size = size;
 	clear_nlink(inode);
 
-	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
-			&hugetlbfs_file_operations);
+	file = alloc_file(&path, O_RDWR, &hugetlbfs_file_operations);
 	if (IS_ERR(file))
 		goto out_dentry; /* inode is already attached */
 
diff --git a/fs/pipe.c b/fs/pipe.c
index 9405e455f5b1..1909422e5a78 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -760,16 +760,17 @@ int create_pipe_files(struct file **res, int flags)
 
 	d_instantiate(path.dentry, inode);
 
-	f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
+	f = alloc_file(&path, O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
+			&pipefifo_fops);
 	if (IS_ERR(f)) {
 		err = PTR_ERR(f);
 		goto err_dentry;
 	}
 
-	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
 	f->private_data = inode->i_pipe;
 
-	res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
+	res[0] = alloc_file(&path, O_RDONLY | (flags & O_NONBLOCK),
+			&pipefifo_fops);
 	if (IS_ERR(res[0])) {
 		put_pipe_info(inode, inode->i_pipe);
 		fput(f);
@@ -778,7 +779,6 @@ int create_pipe_files(struct file **res, int flags)
 
 	path_get(&path);
 	res[0]->private_data = inode->i_pipe;
-	res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
 	res[1] = f;
 	return 0;
 
diff --git a/include/linux/file.h b/include/linux/file.h
index 279720db984a..6d34a1262b31 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -18,7 +18,7 @@ struct file_operations;
 struct vfsmount;
 struct dentry;
 struct path;
-extern struct file *alloc_file(const struct path *, fmode_t mode,
+extern struct file *alloc_file(const struct path *, int flags,
 	const struct file_operations *fop);
 
 static inline void fput_light(struct file *file, int fput_needed)
diff --git a/ipc/shm.c b/ipc/shm.c
index 051a3e1fb8df..c702abd578a7 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1362,7 +1362,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
 	struct ipc_namespace *ns;
 	struct shm_file_data *sfd;
 	struct path path;
-	fmode_t f_mode;
+	int f_flags;
 	unsigned long populate = 0;
 
 	err = -EINVAL;
@@ -1395,11 +1395,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
 	if (shmflg & SHM_RDONLY) {
 		prot = PROT_READ;
 		acc_mode = S_IRUGO;
-		f_mode = FMODE_READ;
+		f_flags = O_RDONLY;
 	} else {
 		prot = PROT_READ | PROT_WRITE;
 		acc_mode = S_IRUGO | S_IWUGO;
-		f_mode = FMODE_READ | FMODE_WRITE;
+		f_flags = O_RDWR;
 	}
 	if (shmflg & SHM_EXEC) {
 		prot |= PROT_EXEC;
@@ -1449,7 +1449,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
 		goto out_nattch;
 	}
 
-	file = alloc_file(&path, f_mode,
+	file = alloc_file(&path, f_flags,
 			  is_file_hugepages(shp->shm_file) ?
 				&shm_file_operations_huge :
 				&shm_file_operations);
diff --git a/mm/memfd.c b/mm/memfd.c
index 27069518e3c5..2bb5e257080e 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -326,7 +326,7 @@ SYSCALL_DEFINE2(memfd_create,
 		goto err_fd;
 	}
 	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
-	file->f_flags |= O_RDWR | O_LARGEFILE;
+	file->f_flags |= O_LARGEFILE;
 
 	if (flags & MFD_ALLOW_SEALING) {
 		file_seals = memfd_file_seals_ptr(file);
diff --git a/mm/shmem.c b/mm/shmem.c
index 2cab84403055..84844e52bf24 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3942,8 +3942,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l
 	if (IS_ERR(res))
 		goto put_path;
 
-	res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
-		  &shmem_file_operations);
+	res = alloc_file(&path, O_RDWR, &shmem_file_operations);
 	if (IS_ERR(res))
 		goto put_path;
 
diff --git a/net/socket.c b/net/socket.c
index 8a109012608a..2cdbe8f71b7f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -411,7 +411,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
 
 	d_instantiate(path.dentry, SOCK_INODE(sock));
 
-	file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
+	file = alloc_file(&path, O_RDWR | (flags & O_NONBLOCK),
 		  &socket_file_ops);
 	if (IS_ERR(file)) {
 		/* drop dentry, keep inode for a bit */
@@ -423,7 +423,6 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
 	}
 
 	sock->file = file;
-	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
 	file->private_data = sock;
 	return file;
 }
-- 
cgit v1.2.3


From e3f20ae21079ecac282df65d83865c5771f4bca0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Jul 2018 13:25:29 -0400
Subject: security_file_open(): lose cred argument

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c                | 2 +-
 include/linux/security.h | 5 ++---
 security/security.c      | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index 0a9f00b7f3d5..4c65edefa487 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -776,7 +776,7 @@ static int do_dentry_open(struct file *f,
 		goto cleanup_all;
 	}
 
-	error = security_file_open(f, f->f_cred);
+	error = security_file_open(f);
 	if (error)
 		goto cleanup_all;
 
diff --git a/include/linux/security.h b/include/linux/security.h
index 63030c85ee19..88d30fc975e7 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -309,7 +309,7 @@ void security_file_set_fowner(struct file *file);
 int security_file_send_sigiotask(struct task_struct *tsk,
 				 struct fown_struct *fown, int sig);
 int security_file_receive(struct file *file);
-int security_file_open(struct file *file, const struct cred *cred);
+int security_file_open(struct file *file);
 int security_task_alloc(struct task_struct *task, unsigned long clone_flags);
 void security_task_free(struct task_struct *task);
 int security_cred_alloc_blank(struct cred *cred, gfp_t gfp);
@@ -858,8 +858,7 @@ static inline int security_file_receive(struct file *file)
 	return 0;
 }
 
-static inline int security_file_open(struct file *file,
-				     const struct cred *cred)
+static inline int security_file_open(struct file *file)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index 68f46d849abe..235b35f58a65 100644
--- a/security/security.c
+++ b/security/security.c
@@ -970,11 +970,11 @@ int security_file_receive(struct file *file)
 	return call_int_hook(file_receive, 0, file);
 }
 
-int security_file_open(struct file *file, const struct cred *cred)
+int security_file_open(struct file *file)
 {
 	int ret;
 
-	ret = call_int_hook(file_open, 0, file, cred);
+	ret = call_int_hook(file_open, 0, file, file->f_cred);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From 9481769208b5e39b871ae4e89f5328c776ec38dc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Jul 2018 14:13:18 -0400
Subject: ->file_open(): lose cred argument

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/lsm_hooks.h  | 2 +-
 security/apparmor/lsm.c    | 4 ++--
 security/security.c        | 2 +-
 security/selinux/hooks.c   | 4 ++--
 security/smack/smack_lsm.c | 6 +++---
 security/tomoyo/tomoyo.c   | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 8f1131c8dd54..a8ee106b865d 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1569,7 +1569,7 @@ union security_list_options {
 	int (*file_send_sigiotask)(struct task_struct *tsk,
 					struct fown_struct *fown, int sig);
 	int (*file_receive)(struct file *file);
-	int (*file_open)(struct file *file, const struct cred *cred);
+	int (*file_open)(struct file *file);
 
 	int (*task_alloc)(struct task_struct *task, unsigned long clone_flags);
 	void (*task_free)(struct task_struct *task);
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 74f17376202b..8b8b70620bbe 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -395,7 +395,7 @@ static int apparmor_inode_getattr(const struct path *path)
 	return common_perm_cond(OP_GETATTR, path, AA_MAY_GETATTR);
 }
 
-static int apparmor_file_open(struct file *file, const struct cred *cred)
+static int apparmor_file_open(struct file *file)
 {
 	struct aa_file_ctx *fctx = file_ctx(file);
 	struct aa_label *label;
@@ -414,7 +414,7 @@ static int apparmor_file_open(struct file *file, const struct cred *cred)
 		return 0;
 	}
 
-	label = aa_get_newest_cred_label(cred);
+	label = aa_get_newest_cred_label(file->f_cred);
 	if (!unconfined(label)) {
 		struct inode *inode = file_inode(file);
 		struct path_cond cond = { inode->i_uid, inode->i_mode };
diff --git a/security/security.c b/security/security.c
index 235b35f58a65..5dce67070cdf 100644
--- a/security/security.c
+++ b/security/security.c
@@ -974,7 +974,7 @@ int security_file_open(struct file *file)
 {
 	int ret;
 
-	ret = call_int_hook(file_open, 0, file, file->f_cred);
+	ret = call_int_hook(file_open, 0, file);
 	if (ret)
 		return ret;
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 2b5ee5fbd652..18006be15713 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3862,7 +3862,7 @@ static int selinux_file_receive(struct file *file)
 	return file_has_perm(cred, file, file_to_av(file));
 }
 
-static int selinux_file_open(struct file *file, const struct cred *cred)
+static int selinux_file_open(struct file *file)
 {
 	struct file_security_struct *fsec;
 	struct inode_security_struct *isec;
@@ -3886,7 +3886,7 @@ static int selinux_file_open(struct file *file, const struct cred *cred)
 	 * new inode label or new policy.
 	 * This check is not redundant - do not remove.
 	 */
-	return file_path_has_perm(cred, file, open_file_to_av(file));
+	return file_path_has_perm(file->f_cred, file, open_file_to_av(file));
 }
 
 /* task security operations */
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 7ad226018f51..e7b6c012431d 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1927,9 +1927,9 @@ static int smack_file_receive(struct file *file)
  *
  * Returns 0
  */
-static int smack_file_open(struct file *file, const struct cred *cred)
+static int smack_file_open(struct file *file)
 {
-	struct task_smack *tsp = cred->security;
+	struct task_smack *tsp = file->f_cred->security;
 	struct inode *inode = file_inode(file);
 	struct smk_audit_info ad;
 	int rc;
@@ -1937,7 +1937,7 @@ static int smack_file_open(struct file *file, const struct cred *cred)
 	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
 	smk_ad_setfield_u_fs_path(&ad, file->f_path);
 	rc = smk_tskacc(tsp, smk_of_inode(inode), MAY_READ, &ad);
-	rc = smk_bu_credfile(cred, file, MAY_READ, rc);
+	rc = smk_bu_credfile(file->f_cred, file, MAY_READ, rc);
 
 	return rc;
 }
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 213b8c593668..9f932e2d6852 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -320,7 +320,7 @@ static int tomoyo_file_fcntl(struct file *file, unsigned int cmd,
  *
  * Returns 0 on success, negative value otherwise.
  */
-static int tomoyo_file_open(struct file *f, const struct cred *cred)
+static int tomoyo_file_open(struct file *f)
 {
 	int flags = f->f_flags;
 	/* Don't check read permission here if called from do_execve(). */
-- 
cgit v1.2.3


From f5d11409e61dadf1f9af91b22bbedc28a60a2e2c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 9 Jul 2018 02:35:08 -0400
Subject: introduce FMODE_OPENED

basically, "is that instance set up enough for regular fput(), or
do we want put_filp() for that one".

NOTE: the only alloc_file() caller that could be followed by put_filp()
is in arch/ia64/kernel/perfmon.c, which is (Kconfig-level) broken.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c    | 1 +
 fs/open.c          | 3 ++-
 include/linux/fs.h | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/file_table.c b/fs/file_table.c
index 705f486f7007..d664d10acfeb 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -176,6 +176,7 @@ struct file *alloc_file(const struct path *path, int flags,
 	if ((file->f_mode & FMODE_WRITE) &&
 	     likely(fop->write || fop->write_iter))
 		file->f_mode |= FMODE_CAN_WRITE;
+	file->f_mode |= FMODE_OPENED;
 	file->f_op = fop;
 	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 		i_readcount_inc(path->dentry->d_inode);
diff --git a/fs/open.c b/fs/open.c
index 4c65edefa487..f3c6cb6a57b9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -749,7 +749,7 @@ static int do_dentry_open(struct file *f,
 	f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
 
 	if (unlikely(f->f_flags & O_PATH)) {
-		f->f_mode = FMODE_PATH;
+		f->f_mode = FMODE_PATH | FMODE_OPENED;
 		f->f_op = &empty_fops;
 		return 0;
 	}
@@ -793,6 +793,7 @@ static int do_dentry_open(struct file *f,
 		if (error)
 			goto cleanup_all;
 	}
+	f->f_mode |= FMODE_OPENED;
 	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 		i_readcount_inc(inode);
 	if ((f->f_mode & FMODE_READ) &&
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c4ca4c9c1130..05f34726e29c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -148,6 +148,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* Has write method(s) */
 #define FMODE_CAN_WRITE         ((__force fmode_t)0x40000)
 
+#define FMODE_OPENED		((__force fmode_t)0x80000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x4000000)
 
-- 
cgit v1.2.3


From 4d27f3266f14e4d1d13125ce32cb49a40f3122c3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 9 Jul 2018 11:14:39 -0400
Subject: fold put_filp() into fput()

Just check FMODE_OPENED in __fput() and be done with that...

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c      | 15 +++++----------
 fs/namei.c           |  4 ++--
 fs/open.c            | 11 +++--------
 include/linux/file.h |  1 -
 4 files changed, 10 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file_table.c b/fs/file_table.c
index d664d10acfeb..9b70ed2bbc4e 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -192,6 +192,9 @@ static void __fput(struct file *file)
 	struct vfsmount *mnt = file->f_path.mnt;
 	struct inode *inode = file->f_inode;
 
+	if (unlikely(!(file->f_mode & FMODE_OPENED)))
+		goto out;
+
 	might_sleep();
 
 	fsnotify_close(file);
@@ -221,12 +224,10 @@ static void __fput(struct file *file)
 		put_write_access(inode);
 		__mnt_drop_write(mnt);
 	}
-	file->f_path.dentry = NULL;
-	file->f_path.mnt = NULL;
-	file->f_inode = NULL;
-	file_free(file);
 	dput(dentry);
 	mntput(mnt);
+out:
+	file_free(file);
 }
 
 static LLIST_HEAD(delayed_fput_list);
@@ -301,12 +302,6 @@ void __fput_sync(struct file *file)
 
 EXPORT_SYMBOL(fput);
 
-void put_filp(struct file *file)
-{
-	if (atomic_long_dec_and_test(&file->f_count))
-		file_free(file);
-}
-
 void __init files_init(void)
 {
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
diff --git a/fs/namei.c b/fs/namei.c
index 3cf02804d5ff..503c4b968415 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3531,7 +3531,7 @@ static struct file *path_openat(struct nameidata *nd,
 
 	s = path_init(nd, flags);
 	if (IS_ERR(s)) {
-		put_filp(file);
+		fput(file);
 		return ERR_CAST(s);
 	}
 	while (!(error = link_path_walk(s, nd)) &&
@@ -3547,7 +3547,7 @@ static struct file *path_openat(struct nameidata *nd,
 out2:
 	if (!(opened & FILE_OPENED)) {
 		BUG_ON(!error);
-		put_filp(file);
+		fput(file);
 	}
 	if (unlikely(error)) {
 		if (error == -EOPENSTALE) {
diff --git a/fs/open.c b/fs/open.c
index f3c6cb6a57b9..3d09b823f12b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -921,15 +921,10 @@ struct file *dentry_open(const struct path *path, int flags,
 	f = alloc_empty_file(flags, cred);
 	if (!IS_ERR(f)) {
 		error = vfs_open(path, f);
-		if (!error) {
-			/* from now on we need fput() to dispose of f */
+		if (!error)
 			error = open_check_o_direct(f);
-			if (error) {
-				fput(f);
-				f = ERR_PTR(error);
-			}
-		} else { 
-			put_filp(f);
+		if (error) {
+			fput(f);
 			f = ERR_PTR(error);
 		}
 	}
diff --git a/include/linux/file.h b/include/linux/file.h
index 6d34a1262b31..aed45d69811e 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -78,7 +78,6 @@ extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
 extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
 extern void set_close_on_exec(unsigned int fd, int flag);
 extern bool get_close_on_exec(unsigned int fd);
-extern void put_filp(struct file *);
 extern int get_unused_fd_flags(unsigned flags);
 extern void put_unused_fd(unsigned int fd);
 
-- 
cgit v1.2.3


From 73a09dd94377e4b186b300bd5461920710c7c3d5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 Jun 2018 13:22:02 -0400
Subject: introduce FMODE_CREATED and switch to it

Parallel to FILE_CREATED, goes into ->f_mode instead of *opened.
NFS is a bit of a wart here - it doesn't have file at the point
where FILE_CREATED used to be set, so we need to propagate it
there (for now).  IMA is another one (here and everywhere)...

Note that this needs do_dentry_open() to leave old bits in ->f_mode
alone - we want it to preserve FMODE_CREATED if it had been already
set (no other bit can be there).

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/vfs_inode.c      |  2 +-
 fs/9p/vfs_inode_dotl.c |  2 +-
 fs/ceph/file.c         |  2 +-
 fs/cifs/dir.c          |  2 +-
 fs/fuse/dir.c          |  2 +-
 fs/gfs2/inode.c        |  2 +-
 fs/namei.c             | 15 ++++++++-------
 fs/nfs/dir.c           |  5 ++++-
 fs/nfs/nfs4proc.c      |  2 +-
 include/linux/fs.h     |  1 +
 10 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 42e102e2e74a..566929792480 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -925,7 +925,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		v9fs_cache_inode_set_cookie(d_inode(dentry), file);
 
-	*opened |= FILE_CREATED;
+	file->f_mode |= FMODE_CREATED;
 out:
 	dput(res);
 	return err;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 7f6ae21a27b3..ee65db5c7eb0 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -358,7 +358,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	file->private_data = ofid;
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		v9fs_cache_inode_set_cookie(inode, file);
-	*opened |= FILE_CREATED;
+	file->f_mode |= FMODE_CREATED;
 out:
 	v9fs_put_acl(dacl, pacl);
 	dput(res);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ad0bed99b1d5..38a63fff7903 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -507,7 +507,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 		dout("atomic_open finish_open on dn %p\n", dn);
 		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
 			ceph_init_inode_acls(d_inode(dentry), &acls);
-			*opened |= FILE_CREATED;
+			file->f_mode |= FMODE_CREATED;
 		}
 		err = finish_open(file, dentry, ceph_open, opened);
 	}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index ddae52bd1993..21d7e393900e 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -539,7 +539,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
 	}
 
 	if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
-		*opened |= FILE_CREATED;
+		file->f_mode |= FMODE_CREATED;
 
 	rc = finish_open(file, direntry, generic_file_open, opened);
 	if (rc) {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 56231b31f806..d4bdcf51e6cb 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -508,7 +508,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
 		goto no_open;
 
 	/* Only creates */
-	*opened |= FILE_CREATED;
+	file->f_mode |= FMODE_CREATED;
 
 	if (fc->no_create)
 		goto mknod;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 67c588edf8d8..4aba00a6004b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -767,7 +767,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	mark_inode_dirty(inode);
 	d_instantiate(dentry, inode);
 	if (file) {
-		*opened |= FILE_CREATED;
+		file->f_mode |= FMODE_CREATED;
 		error = finish_open(file, dentry, gfs2_open_common, opened);
 	}
 	gfs2_glock_dq_uninit(ghs);
diff --git a/fs/namei.c b/fs/namei.c
index 8a1ae074c1c1..4bd7cc0d7522 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3061,7 +3061,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 		 * permission here.
 		 */
 		int acc_mode = op->acc_mode;
-		if (*opened & FILE_CREATED) {
+		if (file->f_mode & FMODE_CREATED) {
 			WARN_ON(!(open_flag & O_CREAT));
 			fsnotify_create(dir, dentry);
 			acc_mode = 0;
@@ -3077,7 +3077,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 				dput(dentry);
 				dentry = file->f_path.dentry;
 			}
-			if (*opened & FILE_CREATED)
+			if (file->f_mode & FMODE_CREATED)
 				fsnotify_create(dir, dentry);
 			if (unlikely(d_is_negative(dentry))) {
 				error = -ENOENT;
@@ -3126,7 +3126,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
 	if (unlikely(IS_DEADDIR(dir_inode)))
 		return -ENOENT;
 
-	*opened &= ~FILE_CREATED;
+	file->f_mode &= ~FMODE_CREATED;
 	dentry = d_lookup(dir, &nd->last);
 	for (;;) {
 		if (!dentry) {
@@ -3211,7 +3211,7 @@ no_open:
 
 	/* Negative dentry, just create the file */
 	if (!dentry->d_inode && (open_flag & O_CREAT)) {
-		*opened |= FILE_CREATED;
+		file->f_mode |= FMODE_CREATED;
 		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
 		if (!dir_inode->i_op->create) {
 			error = -EACCES;
@@ -3318,7 +3318,7 @@ static int do_last(struct nameidata *nd,
 		if (error)
 			goto out;
 
-		if ((*opened & FILE_CREATED) ||
+		if ((file->f_mode & FMODE_CREATED) ||
 		    !S_ISREG(file_inode(file)->i_mode))
 			will_truncate = false;
 
@@ -3326,7 +3326,7 @@ static int do_last(struct nameidata *nd,
 		goto opened;
 	}
 
-	if (*opened & FILE_CREATED) {
+	if (file->f_mode & FMODE_CREATED) {
 		/* Don't check for write permission, don't truncate */
 		open_flag &= ~O_TRUNC;
 		will_truncate = false;
@@ -3400,7 +3400,8 @@ finish_open_created:
 	if (error)
 		goto out;
 opened:
-	error = ima_file_check(file, op->acc_mode, *opened);
+	error = ima_file_check(file, op->acc_mode,
+				file->f_mode & FMODE_CREATED ? FILE_CREATED : 0);
 	if (!error && will_truncate)
 		error = handle_truncate(file);
 out:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7a9c14426855..0ac50983fc4e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1461,6 +1461,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	struct inode *inode;
 	unsigned int lookup_flags = 0;
 	bool switched = false;
+	int created = 0;
 	int err;
 
 	/* Expect a negative dentry */
@@ -1521,7 +1522,9 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 		goto out;
 
 	trace_nfs_atomic_open_enter(dir, ctx, open_flags);
-	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened);
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, &created);
+	if (created)
+		file->f_mode |= FMODE_CREATED;
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ed45090e4df6..2c4df0ffbca1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2951,7 +2951,7 @@ static int _nfs4_do_open(struct inode *dir,
 		}
 	}
 	if (opened && opendata->file_created)
-		*opened |= FILE_CREATED;
+		*opened = 1;
 
 	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
 		*ctx_th = opendata->f_attr.mdsthreshold;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 05f34726e29c..ca668c7e48a7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -149,6 +149,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define FMODE_CAN_WRITE         ((__force fmode_t)0x40000)
 
 #define FMODE_OPENED		((__force fmode_t)0x80000)
+#define FMODE_CREATED		((__force fmode_t)0x100000)
 
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x4000000)
-- 
cgit v1.2.3


From 6035a27b25ab9dadc8c3d5c5df5eae3fca62fc95 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 Jun 2018 13:40:10 -0400
Subject: IMA: don't propagate opened through the entire thing

just check ->f_mode in ima_appraise_measurement()

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c                            |  3 +--
 fs/nfsd/vfs.c                         |  2 +-
 include/linux/ima.h                   |  4 ++--
 security/integrity/ima/ima.h          |  4 ++--
 security/integrity/ima/ima_appraise.c |  4 ++--
 security/integrity/ima/ima_main.c     | 16 ++++++++--------
 6 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 4bd7cc0d7522..d2aeb282ed05 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3400,8 +3400,7 @@ finish_open_created:
 	if (error)
 		goto out;
 opened:
-	error = ima_file_check(file, op->acc_mode,
-				file->f_mode & FMODE_CREATED ? FILE_CREATED : 0);
+	error = ima_file_check(file, op->acc_mode);
 	if (!error && will_truncate)
 		error = handle_truncate(file);
 out:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b0555d7d8200..55a099e47ba2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -763,7 +763,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 		goto out_nfserr;
 	}
 
-	host_err = ima_file_check(file, may_flags, 0);
+	host_err = ima_file_check(file, may_flags);
 	if (host_err) {
 		fput(file);
 		goto out_nfserr;
diff --git a/include/linux/ima.h b/include/linux/ima.h
index 0e4647e0eb60..d9ba3fc363b7 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -16,7 +16,7 @@ struct linux_binprm;
 
 #ifdef CONFIG_IMA
 extern int ima_bprm_check(struct linux_binprm *bprm);
-extern int ima_file_check(struct file *file, int mask, int opened);
+extern int ima_file_check(struct file *file, int mask);
 extern void ima_file_free(struct file *file);
 extern int ima_file_mmap(struct file *file, unsigned long prot);
 extern int ima_read_file(struct file *file, enum kernel_read_file_id id);
@@ -34,7 +34,7 @@ static inline int ima_bprm_check(struct linux_binprm *bprm)
 	return 0;
 }
 
-static inline int ima_file_check(struct file *file, int mask, int opened)
+static inline int ima_file_check(struct file *file, int mask)
 {
 	return 0;
 }
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 354bb5716ce3..e4c1a236976c 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -238,7 +238,7 @@ int ima_appraise_measurement(enum ima_hooks func,
 			     struct integrity_iint_cache *iint,
 			     struct file *file, const unsigned char *filename,
 			     struct evm_ima_xattr_data *xattr_value,
-			     int xattr_len, int opened);
+			     int xattr_len);
 int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func);
 void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file);
 enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint,
@@ -254,7 +254,7 @@ static inline int ima_appraise_measurement(enum ima_hooks func,
 					   struct file *file,
 					   const unsigned char *filename,
 					   struct evm_ima_xattr_data *xattr_value,
-					   int xattr_len, int opened)
+					   int xattr_len)
 {
 	return INTEGRITY_UNKNOWN;
 }
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 8bd7a0733e51..deec1804a00a 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -212,7 +212,7 @@ int ima_appraise_measurement(enum ima_hooks func,
 			     struct integrity_iint_cache *iint,
 			     struct file *file, const unsigned char *filename,
 			     struct evm_ima_xattr_data *xattr_value,
-			     int xattr_len, int opened)
+			     int xattr_len)
 {
 	static const char op[] = "appraise_data";
 	const char *cause = "unknown";
@@ -231,7 +231,7 @@ int ima_appraise_measurement(enum ima_hooks func,
 		cause = iint->flags & IMA_DIGSIG_REQUIRED ?
 				"IMA-signature-required" : "missing-hash";
 		status = INTEGRITY_NOLABEL;
-		if (opened & FILE_CREATED)
+		if (file->f_mode & FMODE_CREATED)
 			iint->flags |= IMA_NEW_FILE;
 		if ((iint->flags & IMA_NEW_FILE) &&
 		    (!(iint->flags & IMA_DIGSIG_REQUIRED) ||
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index dca44cf7838e..b286f37712d5 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -168,7 +168,7 @@ void ima_file_free(struct file *file)
 
 static int process_measurement(struct file *file, const struct cred *cred,
 			       u32 secid, char *buf, loff_t size, int mask,
-			       enum ima_hooks func, int opened)
+			       enum ima_hooks func)
 {
 	struct inode *inode = file_inode(file);
 	struct integrity_iint_cache *iint = NULL;
@@ -294,7 +294,7 @@ static int process_measurement(struct file *file, const struct cred *cred,
 	if (rc == 0 && (action & IMA_APPRAISE_SUBMASK)) {
 		inode_lock(inode);
 		rc = ima_appraise_measurement(func, iint, file, pathname,
-					      xattr_value, xattr_len, opened);
+					      xattr_value, xattr_len);
 		inode_unlock(inode);
 	}
 	if (action & IMA_AUDIT)
@@ -338,7 +338,7 @@ int ima_file_mmap(struct file *file, unsigned long prot)
 	if (file && (prot & PROT_EXEC)) {
 		security_task_getsecid(current, &secid);
 		return process_measurement(file, current_cred(), secid, NULL,
-					   0, MAY_EXEC, MMAP_CHECK, 0);
+					   0, MAY_EXEC, MMAP_CHECK);
 	}
 
 	return 0;
@@ -364,13 +364,13 @@ int ima_bprm_check(struct linux_binprm *bprm)
 
 	security_task_getsecid(current, &secid);
 	ret = process_measurement(bprm->file, current_cred(), secid, NULL, 0,
-				  MAY_EXEC, BPRM_CHECK, 0);
+				  MAY_EXEC, BPRM_CHECK);
 	if (ret)
 		return ret;
 
 	security_cred_getsecid(bprm->cred, &secid);
 	return process_measurement(bprm->file, bprm->cred, secid, NULL, 0,
-				   MAY_EXEC, CREDS_CHECK, 0);
+				   MAY_EXEC, CREDS_CHECK);
 }
 
 /**
@@ -383,14 +383,14 @@ int ima_bprm_check(struct linux_binprm *bprm)
  * On success return 0.  On integrity appraisal error, assuming the file
  * is in policy and IMA-appraisal is in enforcing mode, return -EACCES.
  */
-int ima_file_check(struct file *file, int mask, int opened)
+int ima_file_check(struct file *file, int mask)
 {
 	u32 secid;
 
 	security_task_getsecid(current, &secid);
 	return process_measurement(file, current_cred(), secid, NULL, 0,
 				   mask & (MAY_READ | MAY_WRITE | MAY_EXEC |
-					   MAY_APPEND), FILE_CHECK, opened);
+					   MAY_APPEND), FILE_CHECK);
 }
 EXPORT_SYMBOL_GPL(ima_file_check);
 
@@ -493,7 +493,7 @@ int ima_post_read_file(struct file *file, void *buf, loff_t size,
 	func = read_idmap[read_id] ?: FILE_CHECK;
 	security_task_getsecid(current, &secid);
 	return process_measurement(file, current_cred(), secid, buf, size,
-				   MAY_READ, func, 0);
+				   MAY_READ, func);
 }
 
 static int __init init_ima(void)
-- 
cgit v1.2.3


From be12af3ef5e61ebc44d065e121424ac605d7bb8e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 Jun 2018 11:44:56 -0400
Subject: getting rid of 'opened' argument of ->atomic_open() - part 1

'opened' argument of finish_open() is unused.  Kill it.

Signed-off-by Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/vfs_inode.c      | 2 +-
 fs/9p/vfs_inode_dotl.c | 2 +-
 fs/ceph/file.c         | 2 +-
 fs/cifs/dir.c          | 2 +-
 fs/fuse/dir.c          | 2 +-
 fs/gfs2/inode.c        | 6 +++---
 fs/namei.c             | 2 +-
 fs/nfs/dir.c           | 2 +-
 fs/open.c              | 3 +--
 include/linux/fs.h     | 3 +--
 10 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 566929792480..7b6ff3275d9c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -917,7 +917,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 		v9inode->writeback_fid = (void *) inode_fid;
 	}
 	mutex_unlock(&v9inode->v_mutex);
-	err = finish_open(file, dentry, generic_file_open, opened);
+	err = finish_open(file, dentry, generic_file_open);
 	if (err)
 		goto error;
 
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index ee65db5c7eb0..c6939b7cb18c 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -352,7 +352,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	}
 	mutex_unlock(&v9inode->v_mutex);
 	/* Since we are opening a file, assign the open fid to the file */
-	err = finish_open(file, dentry, generic_file_open, opened);
+	err = finish_open(file, dentry, generic_file_open);
 	if (err)
 		goto err_clunk_old_fid;
 	file->private_data = ofid;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 38a63fff7903..38b28cb2fac1 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -509,7 +509,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 			ceph_init_inode_acls(d_inode(dentry), &acls);
 			file->f_mode |= FMODE_CREATED;
 		}
-		err = finish_open(file, dentry, ceph_open, opened);
+		err = finish_open(file, dentry, ceph_open);
 	}
 out_req:
 	if (!req->r_err && req->r_target_inode)
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 21d7e393900e..891bfd62e67a 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -541,7 +541,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
 	if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
 		file->f_mode |= FMODE_CREATED;
 
-	rc = finish_open(file, direntry, generic_file_open, opened);
+	rc = finish_open(file, direntry, generic_file_open);
 	if (rc) {
 		if (server->ops->close)
 			server->ops->close(xid, tcon, &fid);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d4bdcf51e6cb..a5b1f5ff8cb7 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -469,7 +469,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	d_instantiate(entry, inode);
 	fuse_change_entry_timeout(entry, &outentry);
 	fuse_invalidate_attr(dir);
-	err = finish_open(file, entry, generic_file_open, opened);
+	err = finish_open(file, entry, generic_file_open);
 	if (err) {
 		fuse_sync_release(ff, flags);
 	} else {
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 4aba00a6004b..59f695e96d63 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -626,7 +626,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 		error = 0;
 		if (file) {
 			if (S_ISREG(inode->i_mode))
-				error = finish_open(file, dentry, gfs2_open_common, opened);
+				error = finish_open(file, dentry, gfs2_open_common);
 			else
 				error = finish_no_open(file, NULL);
 		}
@@ -768,7 +768,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	d_instantiate(dentry, inode);
 	if (file) {
 		file->f_mode |= FMODE_CREATED;
-		error = finish_open(file, dentry, gfs2_open_common, opened);
+		error = finish_open(file, dentry, gfs2_open_common);
 	}
 	gfs2_glock_dq_uninit(ghs);
 	gfs2_glock_dq_uninit(ghs + 1);
@@ -866,7 +866,7 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
 		return d;
 	}
 	if (file && S_ISREG(inode->i_mode))
-		error = finish_open(file, dentry, gfs2_open_common, opened);
+		error = finish_open(file, dentry, gfs2_open_common);
 
 	gfs2_glock_dq_uninit(&gh);
 	if (error) {
diff --git a/fs/namei.c b/fs/namei.c
index d2aeb282ed05..117b118853f2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3475,7 +3475,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	if (error)
 		goto out2;
 	file->f_path.mnt = path.mnt;
-	error = finish_open(file, child, NULL, opened);
+	error = finish_open(file, child, NULL);
 out2:
 	mnt_drop_write(path.mnt);
 out:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0ac50983fc4e..22176a3818d5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1439,7 +1439,7 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
 {
 	int err;
 
-	err = finish_open(file, dentry, do_open, opened);
+	err = finish_open(file, dentry, do_open);
 	if (err)
 		goto out;
 	if (S_ISREG(file->f_path.dentry->d_inode->i_mode))
diff --git a/fs/open.c b/fs/open.c
index d2030a3c5c52..dbaac9efc7fc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -843,8 +843,7 @@ cleanup_file:
  * Returns zero on success or -errno if the open failed.
  */
 int finish_open(struct file *file, struct dentry *dentry,
-		int (*open)(struct inode *, struct file *),
-		int *opened)
+		int (*open)(struct inode *, struct file *))
 {
 	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ca668c7e48a7..70be3e4c26ac 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2441,8 +2441,7 @@ enum {
 	FILE_OPENED = 2
 };
 extern int finish_open(struct file *file, struct dentry *dentry,
-			int (*open)(struct inode *, struct file *),
-			int *opened);
+			int (*open)(struct inode *, struct file *));
 extern int finish_no_open(struct file *file, struct dentry *dentry);
 
 /* fs/ioctl.c */
-- 
cgit v1.2.3


From 44907d79002466049fdbb8ef15730d185e0808b4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 Jun 2018 13:32:02 -0400
Subject: get rid of 'opened' argument of ->atomic_open() - part 3

now it can be done...

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/vfs_inode.c      | 3 +--
 fs/9p/vfs_inode_dotl.c | 3 +--
 fs/bad_inode.c         | 2 +-
 fs/ceph/file.c         | 3 +--
 fs/ceph/super.h        | 3 +--
 fs/cifs/cifsfs.h       | 3 +--
 fs/cifs/dir.c          | 3 +--
 fs/fuse/dir.c          | 2 +-
 fs/gfs2/inode.c        | 3 +--
 fs/namei.c             | 3 +--
 fs/nfs/dir.c           | 2 +-
 fs/nfs/nfs4_fs.h       | 2 +-
 include/linux/fs.h     | 2 +-
 13 files changed, 13 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7b6ff3275d9c..85ff859d3af5 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -859,8 +859,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 static int
 v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
-		     struct file *file, unsigned flags, umode_t mode,
-		     int *opened)
+		     struct file *file, unsigned flags, umode_t mode)
 {
 	int err;
 	u32 perm;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index c6939b7cb18c..4823e1c46999 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -241,8 +241,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
 
 static int
 v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
-			  struct file *file, unsigned flags, umode_t omode,
-			  int *opened)
+			  struct file *file, unsigned flags, umode_t omode)
 {
 	int err = 0;
 	kgid_t gid;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 125e8bbd22a2..8035d2a44561 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -134,7 +134,7 @@ static int bad_inode_update_time(struct inode *inode, struct timespec64 *time,
 
 static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry,
 				 struct file *file, unsigned int open_flag,
-				 umode_t create_mode, int *opened)
+				 umode_t create_mode)
 {
 	return -EIO;
 }
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 38b28cb2fac1..e2679e8a2535 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -429,8 +429,7 @@ out:
  * file or symlink, return 1 so the VFS can retry.
  */
 int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
-		     struct file *file, unsigned flags, umode_t mode,
-		     int *opened)
+		     struct file *file, unsigned flags, umode_t mode)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a7077a0c989f..971328b99ede 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1025,8 +1025,7 @@ extern const struct file_operations ceph_file_fops;
 extern int ceph_renew_caps(struct inode *inode);
 extern int ceph_open(struct inode *inode, struct file *file);
 extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
-			    struct file *file, unsigned flags, umode_t mode,
-			    int *opened);
+			    struct file *file, unsigned flags, umode_t mode);
 extern int ceph_release(struct inode *inode, struct file *filp);
 extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
 				  char *data, size_t len);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 5f0231803431..f3a78efc3109 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -65,8 +65,7 @@ extern struct inode *cifs_root_iget(struct super_block *);
 extern int cifs_create(struct inode *, struct dentry *, umode_t,
 		       bool excl);
 extern int cifs_atomic_open(struct inode *, struct dentry *,
-			    struct file *, unsigned, umode_t,
-			    int *);
+			    struct file *, unsigned, umode_t);
 extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
 				  unsigned int);
 extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 891bfd62e67a..3713d22b95a7 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -465,8 +465,7 @@ out_err:
 
 int
 cifs_atomic_open(struct inode *inode, struct dentry *direntry,
-		 struct file *file, unsigned oflags, umode_t mode,
-		 int *opened)
+		 struct file *file, unsigned oflags, umode_t mode)
 {
 	int rc;
 	unsigned int xid;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b8d7e9d423c8..c979329311c8 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -489,7 +489,7 @@ out_err:
 static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t);
 static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
 			    struct file *file, unsigned flags,
-			    umode_t mode, int *opened)
+			    umode_t mode)
 {
 	int err;
 	struct fuse_conn *fc = get_fuse_conn(dir);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 15e2a8a3b917..648f0ca1ad57 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1228,14 +1228,13 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
  * @file: The proposed new struct file
  * @flags: open flags
  * @mode: File mode
- * @opened: Flag to say whether the file has been opened or not
  *
  * Returns: error code or 0 for success
  */
 
 static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
 			    struct file *file, unsigned flags,
-			    umode_t mode, int *opened)
+			    umode_t mode)
 {
 	struct dentry *d;
 	bool excl = !!(flags & O_EXCL);
diff --git a/fs/namei.c b/fs/namei.c
index 117b118853f2..1da272bf8ed3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3052,8 +3052,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 	file->f_path.dentry = DENTRY_NOT_SET;
 	file->f_path.mnt = nd->path.mnt;
 	error = dir->i_op->atomic_open(dir, dentry, file,
-				       open_to_namei_flags(open_flag),
-				       mode, opened);
+				       open_to_namei_flags(open_flag), mode);
 	d_lookup_done(dentry);
 	if (!error) {
 		/*
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 71ae3cc3e53a..f447b1a24350 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1451,7 +1451,7 @@ out:
 
 int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 		    struct file *file, unsigned open_flags,
-		    umode_t mode, int *opened)
+		    umode_t mode)
 {
 	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 	struct nfs_open_context *ctx;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 137e18abb7e7..51beb6e38c90 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -258,7 +258,7 @@ extern const struct dentry_operations nfs4_dentry_operations;
 
 /* dir.c */
 int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
-		    unsigned, umode_t, int *);
+		    unsigned, umode_t);
 
 /* super.c */
 extern struct file_system_type nfs4_fs_type;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 70be3e4c26ac..c25896b30e9f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1781,7 +1781,7 @@ struct inode_operations {
 	int (*update_time)(struct inode *, struct timespec64 *, int);
 	int (*atomic_open)(struct inode *, struct dentry *,
 			   struct file *, unsigned open_flag,
-			   umode_t create_mode, int *opened);
+			   umode_t create_mode);
 	int (*tmpfile) (struct inode *, struct dentry *, umode_t);
 	int (*set_acl)(struct inode *, struct posix_acl *, int);
 } ____cacheline_aligned;
-- 
cgit v1.2.3


From dbae8f2ca2f0586f4b80201c78ff0aed2a012ab5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 Jun 2018 17:51:47 -0400
Subject: kill FILE_{CREATED,OPENED}

no users left

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c25896b30e9f..bd904c496878 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2436,10 +2436,6 @@ extern struct filename *getname(const char __user *);
 extern struct filename *getname_kernel(const char *);
 extern void putname(struct filename *name);
 
-enum {
-	FILE_CREATED = 1,
-	FILE_OPENED = 2
-};
 extern int finish_open(struct file *file, struct dentry *dentry,
 			int (*open)(struct inode *, struct file *));
 extern int finish_no_open(struct file *file, struct dentry *dentry);
-- 
cgit v1.2.3


From d93aa9d82aea80b80f225dbf9c7986df444d8106 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 9 Jun 2018 09:40:05 -0400
Subject: new wrapper: alloc_file_pseudo()

takes inode, vfsmount, name, O_... flags and file_operations and
either returns a new struct file (in which case inode reference we
held is consumed) or returns ERR_PTR(), in which case no refcounts
are altered.

converted aio_private_file() and sock_alloc_file() to it

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/aio.c             | 20 ++++----------------
 fs/file_table.c      | 27 +++++++++++++++++++++++++++
 include/linux/file.h |  3 +++
 net/socket.c         | 28 +++++-----------------------
 4 files changed, 39 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/fs/aio.c b/fs/aio.c
index 9eea53887d6c..c3a8bac16374 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -215,9 +215,7 @@ static const struct address_space_operations aio_ctx_aops;
 
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
-	struct qstr this = QSTR_INIT("[aio]", 5);
 	struct file *file;
-	struct path path;
 	struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
@@ -226,27 +224,17 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 	inode->i_mapping->private_data = ctx;
 	inode->i_size = PAGE_SIZE * nr_pages;
 
-	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
-	if (!path.dentry) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	path.mnt = mntget(aio_mnt);
-
-	d_instantiate(path.dentry, inode);
-	file = alloc_file(&path, O_RDWR, &aio_ring_fops);
+	file = alloc_file_pseudo(inode, aio_mnt, "[aio]",
+				O_RDWR, &aio_ring_fops);
 	if (IS_ERR(file))
-		path_put(&path);
+		iput(inode);
 	return file;
 }
 
 static struct dentry *aio_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
-	static const struct dentry_operations ops = {
-		.d_dname	= simple_dname,
-	};
-	struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops,
+	struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, NULL,
 					   AIO_RING_MAGIC);
 
 	if (!IS_ERR(root))
diff --git a/fs/file_table.c b/fs/file_table.c
index 9b70ed2bbc4e..6b3723909342 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -184,6 +184,33 @@ struct file *alloc_file(const struct path *path, int flags,
 }
 EXPORT_SYMBOL(alloc_file);
 
+struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
+				const char *name, int flags,
+				const struct file_operations *fops)
+{
+	static const struct dentry_operations anon_ops = {
+		.d_dname = simple_dname
+	};
+	struct qstr this = QSTR_INIT(name, strlen(name));
+	struct path path;
+	struct file *file;
+
+	path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
+	if (!path.dentry)
+		return ERR_PTR(-ENOMEM);
+	if (!mnt->mnt_sb->s_d_op)
+		d_set_d_op(path.dentry, &anon_ops);
+	path.mnt = mntget(mnt);
+	d_instantiate(path.dentry, inode);
+	file = alloc_file(&path, flags, fops);
+	if (IS_ERR(file)) {
+		ihold(inode);
+		path_put(&path);
+	}
+	return file;
+}
+EXPORT_SYMBOL(alloc_file_pseudo);
+
 /* the real guts of fput() - releasing the last reference to file
  */
 static void __fput(struct file *file)
diff --git a/include/linux/file.h b/include/linux/file.h
index aed45d69811e..5b25388f2f79 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -17,9 +17,12 @@ extern void fput(struct file *);
 struct file_operations;
 struct vfsmount;
 struct dentry;
+struct inode;
 struct path;
 extern struct file *alloc_file(const struct path *, int flags,
 	const struct file_operations *fop);
+extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *,
+	const char *, int flags, const struct file_operations *);
 
 static inline void fput_light(struct file *file, int fput_needed)
 {
diff --git a/net/socket.c b/net/socket.c
index 2cdbe8f71b7f..4cf3568caf9f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -391,33 +391,15 @@ static struct file_system_type sock_fs_type = {
 
 struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
 {
-	struct qstr name = { .name = "" };
-	struct path path;
 	struct file *file;
 
-	if (dname) {
-		name.name = dname;
-		name.len = strlen(name.name);
-	} else if (sock->sk) {
-		name.name = sock->sk->sk_prot_creator->name;
-		name.len = strlen(name.name);
-	}
-	path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
-	if (unlikely(!path.dentry)) {
-		sock_release(sock);
-		return ERR_PTR(-ENOMEM);
-	}
-	path.mnt = mntget(sock_mnt);
-
-	d_instantiate(path.dentry, SOCK_INODE(sock));
+	if (!dname)
+		dname = sock->sk ? sock->sk->sk_prot_creator->name : "";
 
-	file = alloc_file(&path, O_RDWR | (flags & O_NONBLOCK),
-		  &socket_file_ops);
+	file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname,
+				O_RDWR | (flags & O_NONBLOCK),
+				&socket_file_ops);
 	if (IS_ERR(file)) {
-		/* drop dentry, keep inode for a bit */
-		ihold(d_inode(path.dentry));
-		path_put(&path);
-		/* ... and now kill it properly */
 		sock_release(sock);
 		return file;
 	}
-- 
cgit v1.2.3


From 183266f26f45a47958afb5c9aa1b3d4651e2eb8c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 17 Jun 2018 14:15:10 -0400
Subject: new helper: alloc_file_clone()

alloc_file_clone(old_file, mode, ops): create a new struct file with
->f_path equal to that of old_file.  pipe converted.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c      | 11 +++++++++++
 fs/pipe.c            |  6 ++----
 include/linux/file.h |  2 ++
 3 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file_table.c b/fs/file_table.c
index 6b3723909342..78b067ddb386 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -211,6 +211,17 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
 }
 EXPORT_SYMBOL(alloc_file_pseudo);
 
+struct file *alloc_file_clone(struct file *base, int flags,
+				const struct file_operations *fops)
+{
+	struct file *f = alloc_file(&base->f_path, flags, fops);
+	if (!IS_ERR(f)) {
+		path_get(&f->f_path);
+		f->f_mapping = base->f_mapping;
+	}
+	return f;
+}
+
 /* the real guts of fput() - releasing the last reference to file
  */
 static void __fput(struct file *file)
diff --git a/fs/pipe.c b/fs/pipe.c
index 9701bd3458d1..5f50070774bc 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -761,15 +761,13 @@ int create_pipe_files(struct file **res, int flags)
 
 	f->private_data = inode->i_pipe;
 
-	res[0] = alloc_file(&f->f_path, O_RDONLY | (flags & O_NONBLOCK),
-			&pipefifo_fops);
+	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
+				  &pipefifo_fops);
 	if (IS_ERR(res[0])) {
 		put_pipe_info(inode, inode->i_pipe);
 		fput(f);
 		return PTR_ERR(res[0]);
 	}
-
-	path_get(&f->f_path);
 	res[0]->private_data = inode->i_pipe;
 	res[1] = f;
 	return 0;
diff --git a/include/linux/file.h b/include/linux/file.h
index 5b25388f2f79..60914843c737 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -23,6 +23,8 @@ extern struct file *alloc_file(const struct path *, int flags,
 	const struct file_operations *fop);
 extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *,
 	const char *, int flags, const struct file_operations *);
+extern struct file *alloc_file_clone(struct file *, int flags,
+	const struct file_operations *);
 
 static inline void fput_light(struct file *file, int fput_needed)
 {
-- 
cgit v1.2.3


From ee1904ba44bd4a242b453e8fe179b374906da173 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 17 Jun 2018 14:21:27 -0400
Subject: make alloc_file() static

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c      | 3 +--
 include/linux/file.h | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file_table.c b/fs/file_table.c
index 78b067ddb386..d6eccd04d703 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -157,7 +157,7 @@ over:
  * @flags: O_... flags with which the new file will be opened
  * @fop: the 'struct file_operations' for the new file
  */
-struct file *alloc_file(const struct path *path, int flags,
+static struct file *alloc_file(const struct path *path, int flags,
 		const struct file_operations *fop)
 {
 	struct file *file;
@@ -182,7 +182,6 @@ struct file *alloc_file(const struct path *path, int flags,
 		i_readcount_inc(path->dentry->d_inode);
 	return file;
 }
-EXPORT_SYMBOL(alloc_file);
 
 struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
 				const char *name, int flags,
diff --git a/include/linux/file.h b/include/linux/file.h
index 60914843c737..6b2fb032416c 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -19,8 +19,6 @@ struct vfsmount;
 struct dentry;
 struct inode;
 struct path;
-extern struct file *alloc_file(const struct path *, int flags,
-	const struct file_operations *fop);
 extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *,
 	const char *, int flags, const struct file_operations *);
 extern struct file *alloc_file_clone(struct file *, int flags,
-- 
cgit v1.2.3


From a8802d97e73346bc81609df9dfba7d3306f40d87 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 11 Jul 2018 11:16:41 -0700
Subject: ktime: Provide typesafe ktime_to_ns()

Using ktime_to_ns() is nice to help backports to stable kernels.

Having a typesafe function instead of a macro avoid stupid typos
and waste of time tracking these typos.

Reported-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lkml.kernel.org/r/20180711181641.10369-1-edumazet@google.com
---
 include/linux/ktime.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 5b9fddbaac41..b2bb44f87f5a 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -93,8 +93,11 @@ static inline ktime_t timeval_to_ktime(struct timeval tv)
 /* Map the ktime_t to timeval conversion to ns_to_timeval function */
 #define ktime_to_timeval(kt)		ns_to_timeval((kt))
 
-/* Convert ktime_t to nanoseconds - NOP in the scalar storage format: */
-#define ktime_to_ns(kt)			(kt)
+/* Convert ktime_t to nanoseconds */
+static inline s64 ktime_to_ns(const ktime_t kt)
+{
+	return kt;
+}
 
 /**
  * ktime_compare - Compares two ktime_t variables for less, greater or equal
-- 
cgit v1.2.3


From cca9bab1b72cd2296097c75f59ef11ef80461279 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 11 Jul 2018 12:16:12 +0200
Subject: tcp: use monotonic timestamps for PAWS

Using get_seconds() for timestamps is deprecated since it can lead
to overflows on 32-bit systems. While the interface generally doesn't
overflow until year 2106, the specific implementation of the TCP PAWS
algorithm breaks in 2038 when the intermediate signed 32-bit timestamps
overflow.

A related problem is that the local timestamps in CLOCK_REALTIME form
lead to unexpected behavior when settimeofday is called to set the system
clock backwards or forwards by more than 24 days.

While the first problem could be solved by using an overflow-safe method
of comparing the timestamps, a nicer solution is to use a monotonic
clocksource with ktime_get_seconds() that simply doesn't overflow (at
least not until 136 years after boot) and that doesn't change during
settimeofday().

To make 32-bit and 64-bit architectures behave the same way here, and
also save a few bytes in the tcp_options_received structure, I'm changing
the type to a 32-bit integer, which is now safe on all architectures.

Finally, the ts_recent_stamp field also (confusingly) gets used to store
a jiffies value in tcp_synq_overflow()/tcp_synq_no_recent_overflow().
This is currently safe, but changing the type to 32-bit requires
some small changes there to keep it working.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls_cm.c |  2 +-
 include/linux/tcp.h                     |  4 ++--
 include/net/tcp.h                       | 17 ++++++++++-------
 net/ipv4/tcp_input.c                    |  2 +-
 net/ipv4/tcp_ipv4.c                     |  3 ++-
 net/ipv4/tcp_minisocks.c                |  8 ++++----
 6 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c
index 2bb6f0380758..0997e166ea57 100644
--- a/drivers/crypto/chelsio/chtls/chtls_cm.c
+++ b/drivers/crypto/chelsio/chtls/chtls_cm.c
@@ -1673,7 +1673,7 @@ static void chtls_timewait(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->rcv_nxt++;
-	tp->rx_opt.ts_recent_stamp = get_seconds();
+	tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
 	tp->srtt_us = 0;
 	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
 }
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3dbea6610304..58a8d7d71354 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -89,7 +89,7 @@ struct tcp_sack_block {
 
 struct tcp_options_received {
 /*	PAWS/RTTM data	*/
-	long	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
+	int	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
 	u32	ts_recent;	/* Time stamp to echo next		*/
 	u32	rcv_tsval;	/* Time stamp value             	*/
 	u32	rcv_tsecr;	/* Time stamp echo reply        	*/
@@ -426,7 +426,7 @@ struct tcp_timewait_sock {
 	/* The time we sent the last out-of-window ACK: */
 	u32			  tw_last_oow_ack_time;
 
-	long			  tw_ts_recent_stamp;
+	int			  tw_ts_recent_stamp;
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	  *tw_md5_key;
 #endif
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f6cb20e6e524..582304955087 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -472,19 +472,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
  */
 static inline void tcp_synq_overflow(const struct sock *sk)
 {
-	unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
-	unsigned long now = jiffies;
+	unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
+	unsigned int now = jiffies;
 
-	if (time_after(now, last_overflow + HZ))
+	if (time_after32(now, last_overflow + HZ))
 		tcp_sk(sk)->rx_opt.ts_recent_stamp = now;
 }
 
 /* syncookies: no recent synqueue overflow on this listening socket? */
 static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
 {
-	unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
+	unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
+	unsigned int now = jiffies;
 
-	return time_after(jiffies, last_overflow + TCP_SYNCOOKIE_VALID);
+	return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID);
 }
 
 static inline u32 tcp_cookie_time(void)
@@ -1375,7 +1376,8 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
 {
 	if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
 		return true;
-	if (unlikely(get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))
+	if (unlikely(!time_before32(ktime_get_seconds(),
+				    rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)))
 		return true;
 	/*
 	 * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0,
@@ -1405,7 +1407,8 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
 
 	   However, we can relax time bounds for RST segments to MSL.
 	 */
-	if (rst && get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_MSL)
+	if (rst && !time_before32(ktime_get_seconds(),
+				  rx_opt->ts_recent_stamp + TCP_PAWS_MSL))
 		return false;
 	return true;
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 814ea43dd12f..d3b6390ecf23 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3462,7 +3462,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
 static void tcp_store_ts_recent(struct tcp_sock *tp)
 {
 	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
-	tp->rx_opt.ts_recent_stamp = get_seconds();
+	tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
 }
 
 static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bea17f1e8302..dc415c66a33a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -155,7 +155,8 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 	   and use initial timestamp retrieved from peer table.
 	 */
 	if (tcptw->tw_ts_recent_stamp &&
-	    (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
+	    (!twp || (reuse && time_after32(ktime_get_seconds(),
+					    tcptw->tw_ts_recent_stamp)))) {
 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 		if (tp->write_seq == 0)
 			tp->write_seq = 1;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index dac5893a52b4..75ef332a7caf 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -144,7 +144,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 		tw->tw_substate	  = TCP_TIME_WAIT;
 		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if (tmp_opt.saw_tstamp) {
-			tcptw->tw_ts_recent_stamp = get_seconds();
+			tcptw->tw_ts_recent_stamp = ktime_get_seconds();
 			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
 		}
 
@@ -189,7 +189,7 @@ kill:
 
 		if (tmp_opt.saw_tstamp) {
 			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
-			tcptw->tw_ts_recent_stamp = get_seconds();
+			tcptw->tw_ts_recent_stamp = ktime_get_seconds();
 		}
 
 		inet_twsk_put(tw);
@@ -537,7 +537,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 
 	if (newtp->rx_opt.tstamp_ok) {
 		newtp->rx_opt.ts_recent = req->ts_recent;
-		newtp->rx_opt.ts_recent_stamp = get_seconds();
+		newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
 		newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
 	} else {
 		newtp->rx_opt.ts_recent_stamp = 0;
@@ -603,7 +603,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			 * it can be estimated (approximately)
 			 * from another data.
 			 */
-			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
+			tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
 			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
 		}
 	}
-- 
cgit v1.2.3


From 3949fa9bac090ad217534c30bc3b6572289abf21 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 8 May 2018 15:29:10 -0700
Subject: rcu: Make rcu_read_unlock_special() static

Because rcu_read_unlock_special() is no longer used outside of
kernel/rcu/tree_plugin.h, this commit makes it static.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 1 -
 kernel/rcu/tree_plugin.h | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 65163aa0bb04..67ec077c7ee5 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -64,7 +64,6 @@ void rcu_barrier_tasks(void);
 
 void __rcu_read_lock(void);
 void __rcu_read_unlock(void);
-void rcu_read_unlock_special(struct task_struct *t);
 void synchronize_rcu(void);
 
 /*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 613372246a07..54a251640f53 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -127,6 +127,7 @@ static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
 
 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 			       bool wake);
+static void rcu_read_unlock_special(struct task_struct *t);
 
 /*
  * Tell them what RCU they are running.
@@ -461,7 +462,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
  * notify RCU core processing or task having blocked during the RCU
  * read-side critical section.
  */
-void rcu_read_unlock_special(struct task_struct *t)
+static void rcu_read_unlock_special(struct task_struct *t)
 {
 	bool empty_exp;
 	bool empty_norm;
-- 
cgit v1.2.3


From 07f27570dcd148a5f4de7dc3513c1d1cd069b362 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.park@lge.com>
Date: Fri, 11 May 2018 17:30:34 +0900
Subject: rcu: Improve rcu_note_voluntary_context_switch() reporting

We expect a quiescent state of TASKS_RCU when cond_resched_tasks_rcu_qs()
is called, no matter whether it actually be scheduled or not. However,
it currently doesn't report the quiescent state when the task enters
into __schedule() as it's called with preempt = true. So make it report
the quiescent state unconditionally when cond_resched_tasks_rcu_qs() is
called.

And in TINY_RCU, even though the quiescent state of rcu_bh also should
be reported when the tick interrupt comes from user, it doesn't. So make
it reported.

Lastly in TREE_RCU, rcu_note_voluntary_context_switch() should be
reported when the tick interrupt comes from not only user but also idle,
as an extended quiescent state.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Simplify rcutiny portion given no RCU-tasks for !PREEMPT. ]
---
 include/linux/rcupdate.h | 4 ++--
 kernel/rcu/tiny.c        | 4 +---
 kernel/rcu/tree.c        | 4 ++--
 3 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 67ec077c7ee5..5cab15e7ec83 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -194,8 +194,8 @@ static inline void exit_tasks_rcu_finish(void) { }
  */
 #define cond_resched_tasks_rcu_qs() \
 do { \
-	if (!cond_resched()) \
-		rcu_note_voluntary_context_switch_lite(current); \
+	rcu_note_voluntary_context_switch_lite(current); \
+	cond_resched(); \
 } while (0)
 
 /*
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index a64eee0db39e..befc9321a89c 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -122,10 +122,8 @@ void rcu_check_callbacks(int user)
 {
 	if (user)
 		rcu_sched_qs();
-	else if (!in_softirq())
+	if (user || !in_softirq())
 		rcu_bh_qs();
-	if (user)
-		rcu_note_voluntary_context_switch(current);
 }
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d3333ee2c6f5..19beabe73629 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2645,6 +2645,7 @@ void rcu_check_callbacks(int user)
 
 		rcu_sched_qs();
 		rcu_bh_qs();
+		rcu_note_voluntary_context_switch(current);
 
 	} else if (!in_softirq()) {
 
@@ -2660,8 +2661,7 @@ void rcu_check_callbacks(int user)
 	rcu_preempt_check_callbacks();
 	if (rcu_pending())
 		invoke_rcu_core();
-	if (user)
-		rcu_note_voluntary_context_switch(current);
+
 	trace_rcu_utilization(TPS("End scheduler-tick"));
 }
 
-- 
cgit v1.2.3


From 1445e9175bead409bb9930f3c745246a09f22cf6 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Mon, 7 May 2018 06:35:46 -0300
Subject: rcu: rcupdate.h: Get rid of Sphinx warnings at rcu_pointer_handoff()

The code example at rcupdate.h currently produce lots of warnings:

	./include/linux/rcupdate.h:572: WARNING: Unexpected indentation.
	./include/linux/rcupdate.h:576: WARNING: Unexpected indentation.
	./include/linux/rcupdate.h:580: WARNING: Block quote ends without a blank line; unexpected unindent.
	./include/linux/rcupdate.h:582: WARNING: Block quote ends without a blank line; unexpected unindent.
	./include/linux/rcupdate.h:582: WARNING: Inline literal start-string without end-string.

This commit therefore changes it to a code-block.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 5cab15e7ec83..dacc90358b33 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -566,8 +566,8 @@ static inline void rcu_preempt_sleep_check(void) { }
  * This is simply an identity function, but it documents where a pointer
  * is handed off from RCU to some other synchronization mechanism, for
  * example, reference counting or locking.  In C11, it would map to
- * kill_dependency().  It could be used as follows:
- * ``
+ * kill_dependency().  It could be used as follows::
+ *
  *	rcu_read_lock();
  *	p = rcu_dereference(gp);
  *	long_lived = is_long_lived(p);
@@ -578,7 +578,6 @@ static inline void rcu_preempt_sleep_check(void) { }
  *			p = rcu_pointer_handoff(p);
  *	}
  *	rcu_read_unlock();
- *``
  */
 #define rcu_pointer_handoff(p) (p)
 
-- 
cgit v1.2.3


From 6f56f714db067056c80f5d71510118f82872e34c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 14 May 2018 13:52:27 -0700
Subject: rcu: Improve RCU-tasks naming and comments

The naming and comments associated with some RCU-tasks code make
the faulty assumption that context switches due to cond_resched()
are voluntary.  As several people pointed out, this is not the case.
This commit therefore updates function names and comments to better
reflect current reality.

Reported-by: Byungchul Park <byungchul.park@lge.com>
Reported-by: Joel Fernandes <joel@joelfernandes.org>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 12 ++++++------
 include/linux/rcutiny.h  |  2 +-
 kernel/rcu/tree.c        |  2 +-
 kernel/rcu/update.c      | 27 ++++++++++++++-------------
 4 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index dacc90358b33..75e5b393cf44 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -158,11 +158,11 @@ static inline void rcu_init_nohz(void) { }
 	} while (0)
 
 /*
- * Note a voluntary context switch for RCU-tasks benefit.  This is a
- * macro rather than an inline function to avoid #include hell.
+ * Note a quasi-voluntary context switch for RCU-tasks's benefit.
+ * This is a macro rather than an inline function to avoid #include hell.
  */
 #ifdef CONFIG_TASKS_RCU
-#define rcu_note_voluntary_context_switch_lite(t) \
+#define rcu_tasks_qs(t) \
 	do { \
 		if (READ_ONCE((t)->rcu_tasks_holdout)) \
 			WRITE_ONCE((t)->rcu_tasks_holdout, false); \
@@ -170,14 +170,14 @@ static inline void rcu_init_nohz(void) { }
 #define rcu_note_voluntary_context_switch(t) \
 	do { \
 		rcu_all_qs(); \
-		rcu_note_voluntary_context_switch_lite(t); \
+		rcu_tasks_qs(t); \
 	} while (0)
 void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
 void synchronize_rcu_tasks(void);
 void exit_tasks_rcu_start(void);
 void exit_tasks_rcu_finish(void);
 #else /* #ifdef CONFIG_TASKS_RCU */
-#define rcu_note_voluntary_context_switch_lite(t)	do { } while (0)
+#define rcu_tasks_qs(t)	do { } while (0)
 #define rcu_note_voluntary_context_switch(t)		rcu_all_qs()
 #define call_rcu_tasks call_rcu_sched
 #define synchronize_rcu_tasks synchronize_sched
@@ -194,7 +194,7 @@ static inline void exit_tasks_rcu_finish(void) { }
  */
 #define cond_resched_tasks_rcu_qs() \
 do { \
-	rcu_note_voluntary_context_switch_lite(current); \
+	rcu_tasks_qs(current); \
 	cond_resched(); \
 } while (0)
 
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 7b3c82e8a625..8d9a0ea8f0b5 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -93,7 +93,7 @@ static inline void kfree_call_rcu(struct rcu_head *head,
 #define rcu_note_context_switch(preempt) \
 	do { \
 		rcu_sched_qs(); \
-		rcu_note_voluntary_context_switch_lite(current); \
+		rcu_tasks_qs(current); \
 	} while (0)
 
 static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6f2922168216..ccc061acf887 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -457,7 +457,7 @@ void rcu_note_context_switch(bool preempt)
 		rcu_momentary_dyntick_idle();
 	this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
 	if (!preempt)
-		rcu_note_voluntary_context_switch_lite(current);
+		rcu_tasks_qs(current);
 out:
 	trace_rcu_utilization(TPS("End context switch"));
 	barrier(); /* Avoid RCU read-side critical sections leaking up. */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4c230a60ece4..5783bdf86e5a 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -507,14 +507,15 @@ early_initcall(check_cpu_stall_init);
 #ifdef CONFIG_TASKS_RCU
 
 /*
- * Simple variant of RCU whose quiescent states are voluntary context switch,
- * user-space execution, and idle.  As such, grace periods can take one good
- * long time.  There are no read-side primitives similar to rcu_read_lock()
- * and rcu_read_unlock() because this implementation is intended to get
- * the system into a safe state for some of the manipulations involved in
- * tracing and the like.  Finally, this implementation does not support
- * high call_rcu_tasks() rates from multiple CPUs.  If this is required,
- * per-CPU callback lists will be needed.
+ * Simple variant of RCU whose quiescent states are voluntary context
+ * switch, cond_resched_rcu_qs(), user-space execution, and idle.
+ * As such, grace periods can take one good long time.  There are no
+ * read-side primitives similar to rcu_read_lock() and rcu_read_unlock()
+ * because this implementation is intended to get the system into a safe
+ * state for some of the manipulations involved in tracing and the like.
+ * Finally, this implementation does not support high call_rcu_tasks()
+ * rates from multiple CPUs.  If this is required, per-CPU callback lists
+ * will be needed.
  */
 
 /* Global list of callbacks and associated lock. */
@@ -542,11 +543,11 @@ static struct task_struct *rcu_tasks_kthread_ptr;
  * period elapses, in other words after all currently executing RCU
  * read-side critical sections have completed. call_rcu_tasks() assumes
  * that the read-side critical sections end at a voluntary context
- * switch (not a preemption!), entry into idle, or transition to usermode
- * execution.  As such, there are no read-side primitives analogous to
- * rcu_read_lock() and rcu_read_unlock() because this primitive is intended
- * to determine that all tasks have passed through a safe state, not so
- * much for data-strcuture synchronization.
+ * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle,
+ * or transition to usermode execution.  As such, there are no read-side
+ * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
+ * this primitive is intended to determine that all tasks have passed
+ * through a safe state, not so much for data-strcuture synchronization.
  *
  * See the description of call_rcu() for more detailed information on
  * memory ordering guarantees.
-- 
cgit v1.2.3


From b7b6f94cf6e6d961f78064315b6f5de5d9c6414b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 18 Jun 2018 14:22:40 +1000
Subject: rculist: Improve documentation for list_for_each_entry_from_rcu()

Unfortunately the patch for adding list_for_each_entry_from_rcu()
wasn't the final patch after all review.  It is functionally
correct but the documentation was incomplete.

This patch adds this missing documentation which includes an update to
the documentation for list_for_each_entry_continue_rcu() to match the
documentation for the new list_for_each_entry_from_rcu(), and adds
list_for_each_entry_from_rcu() and the already existing
hlist_for_each_entry_from_rcu() to section 7 of whatisRCU.txt.

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/RCU/whatisRCU.txt |  2 ++
 include/linux/rculist.h         | 19 ++++++++++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 94288f1b8759..c2a7facf7ff9 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -820,11 +820,13 @@ RCU list traversal:
 	list_next_rcu
 	list_for_each_entry_rcu
 	list_for_each_entry_continue_rcu
+	list_for_each_entry_from_rcu
 	hlist_first_rcu
 	hlist_next_rcu
 	hlist_pprev_rcu
 	hlist_for_each_entry_rcu
 	hlist_for_each_entry_rcu_bh
+	hlist_for_each_entry_from_rcu
 	hlist_for_each_entry_continue_rcu
 	hlist_for_each_entry_continue_rcu_bh
 	hlist_nulls_first_rcu
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 36df6ccbc874..4786c2235b98 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -396,7 +396,16 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
  * @member:	the name of the list_head within the struct.
  *
  * Continue to iterate over list of given type, continuing after
- * the current position.
+ * the current position which must have been in the list when the RCU read
+ * lock was taken.
+ * This would typically require either that you obtained the node from a
+ * previous walk of the list in the same RCU read-side critical section, or
+ * that you held some sort of non-RCU reference (such as a reference count)
+ * to keep the node alive *and* in the list.
+ *
+ * This iterator is similar to list_for_each_entry_from_rcu() except
+ * this starts after the given position and that one starts at the given
+ * position.
  */
 #define list_for_each_entry_continue_rcu(pos, head, member) 		\
 	for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \
@@ -411,6 +420,14 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
  *
  * Iterate over the tail of a list starting from a given position,
  * which must have been in the list when the RCU read lock was taken.
+ * This would typically require either that you obtained the node from a
+ * previous walk of the list in the same RCU read-side critical section, or
+ * that you held some sort of non-RCU reference (such as a reference count)
+ * to keep the node alive *and* in the list.
+ *
+ * This iterator is similar to list_for_each_entry_continue_rcu() except
+ * this starts from the given position and that one starts from the position
+ * after the given position.
  */
 #define list_for_each_entry_from_rcu(pos, head, member)			\
 	for (; &(pos)->member != (head);					\
-- 
cgit v1.2.3


From 3025520ec424df8b0fd5cdc319ad6b83406d9954 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 22 May 2018 11:38:47 -0700
Subject: rcutorture: Use per-CPU random state for rcu_torture_timer()

Currently, the rcu_torture_timer() function uses a single global
torture_random_state structure protected by a single global lock.
This conflicts to some extent with performance and scalability,
but even more with the goal of consolidating read-side testing
with rcu_torture_reader().  This commit therefore creates a per-CPU
torture_random_state structure for use by rcu_torture_timer() and
eliminates the lock.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Make rcu_torture_timer_rand static, per 0day Test Robot report. ]
---
 include/linux/torture.h |  2 ++
 kernel/rcu/rcutorture.c | 10 +++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index a55e80817dae..61dfd93b6ee4 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -64,6 +64,8 @@ struct torture_random_state {
 	long trs_count;
 };
 #define DEFINE_TORTURE_RANDOM(name) struct torture_random_state name = { 0, 0 }
+#define DEFINE_TORTURE_RANDOM_PERCPU(name) \
+	DEFINE_PER_CPU(struct torture_random_state, name)
 unsigned long torture_random(struct torture_random_state *trsp);
 
 /* Task shuffler, which causes CPUs to occasionally go idle. */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 2452e4a29923..d5a5465d2507 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1143,6 +1143,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
 	return true;
 }
 
+static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand);
+
 /*
  * RCU torture reader from timer handler.  Dereferences rcu_torture_current,
  * incrementing the corresponding element of the pipeline array.  The
@@ -1154,12 +1156,12 @@ static void rcu_torture_timer(struct timer_list *unused)
 	int idx;
 	unsigned long started;
 	unsigned long completed;
-	static DEFINE_TORTURE_RANDOM(rand);
-	static DEFINE_SPINLOCK(rand_lock);
 	struct rcu_torture *p;
 	int pipe_count;
+	struct torture_random_state *trsp;
 	unsigned long long ts;
 
+	trsp = this_cpu_ptr(&rcu_torture_timer_rand);
 	atomic_long_inc(&n_rcu_torture_timers);
 	idx = cur_ops->readlock();
 	started = cur_ops->get_gp_seq();
@@ -1176,9 +1178,7 @@ static void rcu_torture_timer(struct timer_list *unused)
 	}
 	if (p->rtort_mbtest == 0)
 		atomic_inc(&n_rcu_torture_mberror);
-	spin_lock(&rand_lock);
-	cur_ops->read_delay(&rand);
-	spin_unlock(&rand_lock);
+	cur_ops->read_delay(trsp);
 	preempt_disable();
 	pipe_count = p->rtort_pipe_count;
 	if (pipe_count > RCU_TORTURE_PIPE_LEN) {
-- 
cgit v1.2.3


From 523f9eb1ef25aab4aaf9aeb5356160e8039411ef Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Thu, 12 Jul 2018 15:13:15 +0300
Subject: net/mlx4_core: Add health buffer address capability

Health buffer address is a 32 bit PCI address offset provided by
the FW. This offset is used for reading FW health debug data
located on the shared CR space. Cr space is accessible in both
driver and FW and allows for different queries and configurations.
Health buffer size is always 64B of readable data followed by a
lock which is used to block volatile CR space access.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/fw.c   | 5 ++++-
 drivers/net/ethernet/mellanox/mlx4/fw.h   | 1 +
 drivers/net/ethernet/mellanox/mlx4/main.c | 1 +
 include/linux/mlx4/device.h               | 1 +
 4 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 46dcbfbe4c5e..babcfd9c0571 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -825,7 +825,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET	0xcc
 #define QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET	0xd0
 #define QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET	0xd2
-
+#define QUERY_DEV_CAP_HEALTH_BUFFER_ADDRESS_OFFSET	0xe4
 
 	dev_cap->flags2 = 0;
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
@@ -1082,6 +1082,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		dev_cap->rl_caps.min_unit = size >> 14;
 	}
 
+	MLX4_GET(dev_cap->health_buffer_addrs, outbox,
+		 QUERY_DEV_CAP_HEALTH_BUFFER_ADDRESS_OFFSET);
+
 	MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
 	if (field32 & (1 << 16))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP;
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index cd6399c76bfd..650ae08c71de 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -128,6 +128,7 @@ struct mlx4_dev_cap {
 	u32 dmfs_high_rate_qpn_base;
 	u32 dmfs_high_rate_qpn_range;
 	struct mlx4_rate_limit_caps rl_caps;
+	u32 health_buffer_addrs;
 	struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1];
 	bool wol_port[MLX4_MAX_PORTS + 1];
 };
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index c42eddfcd2b5..806d441b3701 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -523,6 +523,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.max_rss_tbl_sz     = dev_cap->max_rss_tbl_sz;
 	dev->caps.wol_port[1]          = dev_cap->wol_port[1];
 	dev->caps.wol_port[2]          = dev_cap->wol_port[2];
+	dev->caps.health_buffer_addrs  = dev_cap->health_buffer_addrs;
 
 	/* Save uar page shift */
 	if (!mlx4_is_slave(dev)) {
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 122e7e9d3091..e3bfe76aea98 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -630,6 +630,7 @@ struct mlx4_caps {
 	u32			vf_caps;
 	bool			wol_port[MLX4_MAX_PORTS + 1];
 	struct mlx4_rate_limit_caps rl_caps;
+	u32			health_buffer_addrs;
 };
 
 struct mlx4_buf_list {
-- 
cgit v1.2.3


From bedc989b0c98285b277ff8a08ff9514e580913f4 Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Thu, 12 Jul 2018 15:13:16 +0300
Subject: net/mlx4_core: Add Crdump FW snapshot support

Crdump allows the driver to create a snapshot of the FW PCI
crspace and health buffer during a critical FW issue.
In case of a FW command timeout, FW getting stuck or a non zero
value on the catastrophic buffer, a snapshot will be taken.

The snapshot is exposed using devlink, cr-space, fw-health
address regions are registered on init and snapshots are attached
once a new snapshot is collected by the driver.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/Makefile |   2 +-
 drivers/net/ethernet/mellanox/mlx4/catas.c  |   6 +-
 drivers/net/ethernet/mellanox/mlx4/crdump.c | 231 ++++++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/main.c   |  10 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h   |   4 +
 include/linux/mlx4/device.h                 |   6 +
 6 files changed, 255 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx4/crdump.c

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/Makefile b/drivers/net/ethernet/mellanox/mlx4/Makefile
index 16b10d01fcf4..3f400770fcd8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx4/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_MLX4_CORE)		+= mlx4_core.o
 
 mlx4_core-y :=	alloc.o catas.o cmd.o cq.o eq.o fw.o fw_qos.o icm.o intf.o \
 		main.o mcg.o mr.o pd.o port.o profile.o qp.o reset.o sense.o \
-		srq.o resource_tracker.o
+		srq.o resource_tracker.o crdump.o
 
 obj-$(CONFIG_MLX4_EN)               += mlx4_en.o
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c
index 8afe4b5fb09b..c81d15bf259c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/catas.c
+++ b/drivers/net/ethernet/mellanox/mlx4/catas.c
@@ -178,10 +178,12 @@ void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
 
 	dev = persist->dev;
 	mlx4_err(dev, "device is going to be reset\n");
-	if (mlx4_is_slave(dev))
+	if (mlx4_is_slave(dev)) {
 		err = mlx4_reset_slave(dev);
-	else
+	} else {
+		mlx4_crdump_collect(dev);
 		err = mlx4_reset_master(dev);
+	}
 
 	if (!err) {
 		mlx4_err(dev, "device was reset successfully\n");
diff --git a/drivers/net/ethernet/mellanox/mlx4/crdump.c b/drivers/net/ethernet/mellanox/mlx4/crdump.c
new file mode 100644
index 000000000000..4d5524dffec4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/crdump.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4.h"
+
+#define BAD_ACCESS			0xBADACCE5
+#define HEALTH_BUFFER_SIZE		0x40
+#define CR_ENABLE_BIT			swab32(BIT(6))
+#define CR_ENABLE_BIT_OFFSET		0xF3F04
+#define MAX_NUM_OF_DUMPS_TO_STORE	(8)
+
+static const char *region_cr_space_str = "cr-space";
+static const char *region_fw_health_str = "fw-health";
+
+/* Set to true in case cr enable bit was set to true before crdump */
+static bool crdump_enbale_bit_set;
+
+static void crdump_enable_crspace_access(struct mlx4_dev *dev,
+					 u8 __iomem *cr_space)
+{
+	/* Get current enable bit value */
+	crdump_enbale_bit_set =
+		readl(cr_space + CR_ENABLE_BIT_OFFSET) & CR_ENABLE_BIT;
+
+	/* Enable FW CR filter (set bit6 to 0) */
+	if (crdump_enbale_bit_set)
+		writel(readl(cr_space + CR_ENABLE_BIT_OFFSET) & ~CR_ENABLE_BIT,
+		       cr_space + CR_ENABLE_BIT_OFFSET);
+
+	/* Enable block volatile crspace accesses */
+	writel(swab32(1), cr_space + dev->caps.health_buffer_addrs +
+	       HEALTH_BUFFER_SIZE);
+}
+
+static void crdump_disable_crspace_access(struct mlx4_dev *dev,
+					  u8 __iomem *cr_space)
+{
+	/* Disable block volatile crspace accesses */
+	writel(0, cr_space + dev->caps.health_buffer_addrs +
+	       HEALTH_BUFFER_SIZE);
+
+	/* Restore FW CR filter value (set bit6 to original value) */
+	if (crdump_enbale_bit_set)
+		writel(readl(cr_space + CR_ENABLE_BIT_OFFSET) | CR_ENABLE_BIT,
+		       cr_space + CR_ENABLE_BIT_OFFSET);
+}
+
+static void mlx4_crdump_collect_crspace(struct mlx4_dev *dev,
+					u8 __iomem *cr_space,
+					u32 id)
+{
+	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
+	struct pci_dev *pdev = dev->persist->pdev;
+	unsigned long cr_res_size;
+	u8 *crspace_data;
+	int offset;
+	int err;
+
+	if (!crdump->region_crspace) {
+		mlx4_err(dev, "crdump: cr-space region is NULL\n");
+		return;
+	}
+
+	/* Try to collect CR space */
+	cr_res_size = pci_resource_len(pdev, 0);
+	crspace_data = kvmalloc(cr_res_size, GFP_KERNEL);
+	if (crspace_data) {
+		for (offset = 0; offset < cr_res_size; offset += 4)
+			*(u32 *)(crspace_data + offset) =
+					readl(cr_space + offset);
+
+		err = devlink_region_snapshot_create(crdump->region_crspace,
+						     cr_res_size, crspace_data,
+						     id, &kvfree);
+		if (err) {
+			kvfree(crspace_data);
+			mlx4_warn(dev, "crdump: devlink create %s snapshot id %d err %d\n",
+				  region_cr_space_str, id, err);
+		} else {
+			mlx4_info(dev, "crdump: added snapshot %d to devlink region %s\n",
+				  id, region_cr_space_str);
+		}
+	} else {
+		mlx4_err(dev, "crdump: Failed to allocate crspace buffer\n");
+	}
+}
+
+static void mlx4_crdump_collect_fw_health(struct mlx4_dev *dev,
+					  u8 __iomem *cr_space,
+					  u32 id)
+{
+	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
+	u8 *health_data;
+	int offset;
+	int err;
+
+	if (!crdump->region_fw_health) {
+		mlx4_err(dev, "crdump: fw-health region is NULL\n");
+		return;
+	}
+
+	/* Try to collect health buffer */
+	health_data = kvmalloc(HEALTH_BUFFER_SIZE, GFP_KERNEL);
+	if (health_data) {
+		u8 __iomem *health_buf_start =
+				cr_space + dev->caps.health_buffer_addrs;
+
+		for (offset = 0; offset < HEALTH_BUFFER_SIZE; offset += 4)
+			*(u32 *)(health_data + offset) =
+					readl(health_buf_start + offset);
+
+		err = devlink_region_snapshot_create(crdump->region_fw_health,
+						     HEALTH_BUFFER_SIZE,
+						     health_data,
+						     id, &kvfree);
+		if (err) {
+			kvfree(health_data);
+			mlx4_warn(dev, "crdump: devlink create %s snapshot id %d err %d\n",
+				  region_fw_health_str, id, err);
+		} else {
+			mlx4_info(dev, "crdump: added snapshot %d to devlink region %s\n",
+				  id, region_fw_health_str);
+		}
+	} else {
+		mlx4_err(dev, "crdump: Failed to allocate health buffer\n");
+	}
+}
+
+int mlx4_crdump_collect(struct mlx4_dev *dev)
+{
+	struct devlink *devlink = priv_to_devlink(mlx4_priv(dev));
+	struct pci_dev *pdev = dev->persist->pdev;
+	unsigned long cr_res_size;
+	u8 __iomem *cr_space;
+	u32 id;
+
+	if (!dev->caps.health_buffer_addrs) {
+		mlx4_info(dev, "crdump: FW doesn't support health buffer access, skipping\n");
+		return 0;
+	}
+
+	cr_res_size = pci_resource_len(pdev, 0);
+
+	cr_space = ioremap(pci_resource_start(pdev, 0), cr_res_size);
+	if (!cr_space) {
+		mlx4_err(dev, "crdump: Failed to map pci cr region\n");
+		return -ENODEV;
+	}
+
+	crdump_enable_crspace_access(dev, cr_space);
+
+	/* Get the available snapshot ID for the dumps */
+	id = devlink_region_shapshot_id_get(devlink);
+
+	/* Try to capture dumps */
+	mlx4_crdump_collect_crspace(dev, cr_space, id);
+	mlx4_crdump_collect_fw_health(dev, cr_space, id);
+
+	crdump_disable_crspace_access(dev, cr_space);
+
+	iounmap(cr_space);
+	return 0;
+}
+
+int mlx4_crdump_init(struct mlx4_dev *dev)
+{
+	struct devlink *devlink = priv_to_devlink(mlx4_priv(dev));
+	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
+	struct pci_dev *pdev = dev->persist->pdev;
+
+	/* Create cr-space region */
+	crdump->region_crspace =
+		devlink_region_create(devlink,
+				      region_cr_space_str,
+				      MAX_NUM_OF_DUMPS_TO_STORE,
+				      pci_resource_len(pdev, 0));
+	if (IS_ERR(crdump->region_crspace))
+		mlx4_warn(dev, "crdump: create devlink region %s err %ld\n",
+			  region_cr_space_str,
+			  PTR_ERR(crdump->region_crspace));
+
+	/* Create fw-health region */
+	crdump->region_fw_health =
+		devlink_region_create(devlink,
+				      region_fw_health_str,
+				      MAX_NUM_OF_DUMPS_TO_STORE,
+				      HEALTH_BUFFER_SIZE);
+	if (IS_ERR(crdump->region_fw_health))
+		mlx4_warn(dev, "crdump: create devlink region %s err %ld\n",
+			  region_fw_health_str,
+			  PTR_ERR(crdump->region_fw_health));
+
+	return 0;
+}
+
+void mlx4_crdump_end(struct mlx4_dev *dev)
+{
+	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
+
+	devlink_region_destroy(crdump->region_fw_health);
+	devlink_region_destroy(crdump->region_crspace);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 806d441b3701..46b021409b8b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -3807,10 +3807,14 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data,
 		}
 	}
 
-	err = mlx4_catas_init(&priv->dev);
+	err = mlx4_crdump_init(&priv->dev);
 	if (err)
 		goto err_release_regions;
 
+	err = mlx4_catas_init(&priv->dev);
+	if (err)
+		goto err_crdump;
+
 	err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 0);
 	if (err)
 		goto err_catas;
@@ -3820,6 +3824,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data,
 err_catas:
 	mlx4_catas_end(&priv->dev);
 
+err_crdump:
+	mlx4_crdump_end(&priv->dev);
+
 err_release_regions:
 	pci_release_regions(pdev);
 
@@ -4081,6 +4088,7 @@ static void mlx4_remove_one(struct pci_dev *pdev)
 	else
 		mlx4_info(dev, "%s: interface is down\n", __func__);
 	mlx4_catas_end(dev);
+	mlx4_crdump_end(dev);
 	if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) {
 		mlx4_warn(dev, "Disabling SR-IOV\n");
 		pci_disable_sriov(pdev);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 2ebaa3bee42f..6e016092a6f1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -1042,6 +1042,8 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev);
 void mlx4_stop_catas_poll(struct mlx4_dev *dev);
 int mlx4_catas_init(struct mlx4_dev *dev);
 void mlx4_catas_end(struct mlx4_dev *dev);
+int mlx4_crdump_init(struct mlx4_dev *dev);
+void mlx4_crdump_end(struct mlx4_dev *dev);
 int mlx4_restart_one(struct pci_dev *pdev, bool reload,
 		     struct devlink *devlink);
 int mlx4_register_device(struct mlx4_dev *dev);
@@ -1228,6 +1230,8 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);
 void mlx4_enter_error_state(struct mlx4_dev_persistent *persist);
 int mlx4_comm_internal_err(u32 slave_read);
 
+int mlx4_crdump_collect(struct mlx4_dev *dev);
+
 int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
 		    enum mlx4_port_type *type);
 void mlx4_do_sense_ports(struct mlx4_dev *dev,
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index e3bfe76aea98..300b944e6e1e 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -852,6 +852,11 @@ struct mlx4_vf_dev {
 	u8			n_ports;
 };
 
+struct mlx4_fw_crdump {
+	struct devlink_region *region_crspace;
+	struct devlink_region *region_fw_health;
+};
+
 enum mlx4_pci_status {
 	MLX4_PCI_STATUS_DISABLED,
 	MLX4_PCI_STATUS_ENABLED,
@@ -872,6 +877,7 @@ struct mlx4_dev_persistent {
 	u8	interface_state;
 	struct mutex		pci_status_mutex; /* sync pci state */
 	enum mlx4_pci_status	pci_status;
+	struct mlx4_fw_crdump	crdump;
 };
 
 struct mlx4_dev {
-- 
cgit v1.2.3


From 3c641ba4a852cf4e90e3d7f29c5df08e24213c5d Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Thu, 12 Jul 2018 15:13:18 +0300
Subject: net/mlx4_core: Use devlink region_snapshot parameter

This parameter enables capturing region snapshot of the crspace
during critical errors. The default value of this parameter is
disabled, it can be enabled using devlink param commands.
It is possible to configure during runtime and also driver init.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/crdump.c |  8 ++++++
 drivers/net/ethernet/mellanox/mlx4/main.c   | 41 +++++++++++++++++++++++++++++
 include/linux/mlx4/device.h                 |  1 +
 3 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/crdump.c b/drivers/net/ethernet/mellanox/mlx4/crdump.c
index 4d5524dffec4..88316c743820 100644
--- a/drivers/net/ethernet/mellanox/mlx4/crdump.c
+++ b/drivers/net/ethernet/mellanox/mlx4/crdump.c
@@ -158,6 +158,7 @@ static void mlx4_crdump_collect_fw_health(struct mlx4_dev *dev,
 int mlx4_crdump_collect(struct mlx4_dev *dev)
 {
 	struct devlink *devlink = priv_to_devlink(mlx4_priv(dev));
+	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
 	struct pci_dev *pdev = dev->persist->pdev;
 	unsigned long cr_res_size;
 	u8 __iomem *cr_space;
@@ -168,6 +169,11 @@ int mlx4_crdump_collect(struct mlx4_dev *dev)
 		return 0;
 	}
 
+	if (!crdump->snapshot_enable) {
+		mlx4_info(dev, "crdump: devlink snapshot disabled, skipping\n");
+		return 0;
+	}
+
 	cr_res_size = pci_resource_len(pdev, 0);
 
 	cr_space = ioremap(pci_resource_start(pdev, 0), cr_res_size);
@@ -197,6 +203,8 @@ int mlx4_crdump_init(struct mlx4_dev *dev)
 	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
 	struct pci_dev *pdev = dev->persist->pdev;
 
+	crdump->snapshot_enable = false;
+
 	/* Create cr-space region */
 	crdump->region_crspace =
 		devlink_region_create(devlink,
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 46b021409b8b..2d979a652b7b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -191,6 +191,26 @@ static int mlx4_devlink_ierr_reset_set(struct devlink *devlink, u32 id,
 	return 0;
 }
 
+static int mlx4_devlink_crdump_snapshot_get(struct devlink *devlink, u32 id,
+					    struct devlink_param_gset_ctx *ctx)
+{
+	struct mlx4_priv *priv = devlink_priv(devlink);
+	struct mlx4_dev *dev = &priv->dev;
+
+	ctx->val.vbool = dev->persist->crdump.snapshot_enable;
+	return 0;
+}
+
+static int mlx4_devlink_crdump_snapshot_set(struct devlink *devlink, u32 id,
+					    struct devlink_param_gset_ctx *ctx)
+{
+	struct mlx4_priv *priv = devlink_priv(devlink);
+	struct mlx4_dev *dev = &priv->dev;
+
+	dev->persist->crdump.snapshot_enable = ctx->val.vbool;
+	return 0;
+}
+
 static int
 mlx4_devlink_max_macs_validate(struct devlink *devlink, u32 id,
 			       union devlink_param_value val,
@@ -224,6 +244,11 @@ static const struct devlink_param mlx4_devlink_params[] = {
 	DEVLINK_PARAM_GENERIC(MAX_MACS,
 			      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
 			      NULL, NULL, mlx4_devlink_max_macs_validate),
+	DEVLINK_PARAM_GENERIC(REGION_SNAPSHOT,
+			      BIT(DEVLINK_PARAM_CMODE_RUNTIME) |
+			      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+			      mlx4_devlink_crdump_snapshot_get,
+			      mlx4_devlink_crdump_snapshot_set, NULL),
 	DEVLINK_PARAM_DRIVER(MLX4_DEVLINK_PARAM_ID_ENABLE_64B_CQE_EQE,
 			     "enable_64b_cqe_eqe", DEVLINK_PARAM_TYPE_BOOL,
 			     BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
@@ -270,6 +295,11 @@ static void mlx4_devlink_set_params_init_values(struct devlink *devlink)
 	mlx4_devlink_set_init_value(devlink,
 				    MLX4_DEVLINK_PARAM_ID_ENABLE_4K_UAR,
 				    value);
+
+	value.vbool = false;
+	mlx4_devlink_set_init_value(devlink,
+				    DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
+				    value);
 }
 
 static inline void mlx4_set_num_reserved_uars(struct mlx4_dev *dev,
@@ -3862,6 +3892,9 @@ static int mlx4_devlink_port_type_set(struct devlink_port *devlink_port,
 
 static void mlx4_devlink_param_load_driverinit_values(struct devlink *devlink)
 {
+	struct mlx4_priv *priv = devlink_priv(devlink);
+	struct mlx4_dev *dev = &priv->dev;
+	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
 	union devlink_param_value saved_value;
 	int err;
 
@@ -3889,6 +3922,14 @@ static void mlx4_devlink_param_load_driverinit_values(struct devlink *devlink)
 						 &saved_value);
 	if (!err)
 		enable_4k_uar = saved_value.vbool;
+	err = devlink_param_driverinit_value_get(devlink,
+						 DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
+						 &saved_value);
+	if (!err && crdump->snapshot_enable != saved_value.vbool) {
+		crdump->snapshot_enable = saved_value.vbool;
+		devlink_param_value_changed(devlink,
+					    DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT);
+	}
 }
 
 static int mlx4_devlink_reload(struct devlink *devlink,
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 300b944e6e1e..dca6ab4eaa99 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -853,6 +853,7 @@ struct mlx4_vf_dev {
 };
 
 struct mlx4_fw_crdump {
+	bool snapshot_enable;
 	struct devlink_region *region_crspace;
 	struct devlink_region *region_fw_health;
 };
-- 
cgit v1.2.3


From afed7bcf9487bb28e2e2b016a195085c07416c0b Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 11 Jul 2018 10:36:07 +0100
Subject: locking/refcount: Always allow checked forms

In many cases, it would be useful to be able to use the full
sanity-checked refcount helpers regardless of CONFIG_REFCOUNT_FULL,
as this would help to avoid duplicate warnings where callers try to
sanity-check refcount manipulation.

This patch refactors things such that the full refcount helpers were
always built, as refcount_${op}_checked(), such that they can be used
regardless of CONFIG_REFCOUNT_FULL. This will allow code which *always*
wants a checked refcount to opt-in, avoiding the need to duplicate the
logic for warnings.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Will Deacon <will.deacon@arm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180711093607.1644-1-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/refcount.h | 27 +++++++++++++++++-------
 lib/refcount.c           | 53 +++++++++++++++++++++++-------------------------
 2 files changed, 45 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index c36addd27dd5..53c5eca24d83 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -43,17 +43,30 @@ static inline unsigned int refcount_read(const refcount_t *r)
 	return atomic_read(&r->refs);
 }
 
+extern __must_check bool refcount_add_not_zero_checked(unsigned int i, refcount_t *r);
+extern void refcount_add_checked(unsigned int i, refcount_t *r);
+
+extern __must_check bool refcount_inc_not_zero_checked(refcount_t *r);
+extern void refcount_inc_checked(refcount_t *r);
+
+extern __must_check bool refcount_sub_and_test_checked(unsigned int i, refcount_t *r);
+
+extern __must_check bool refcount_dec_and_test_checked(refcount_t *r);
+extern void refcount_dec_checked(refcount_t *r);
+
 #ifdef CONFIG_REFCOUNT_FULL
-extern __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r);
-extern void refcount_add(unsigned int i, refcount_t *r);
 
-extern __must_check bool refcount_inc_not_zero(refcount_t *r);
-extern void refcount_inc(refcount_t *r);
+#define refcount_add_not_zero	refcount_add_not_zero_checked
+#define refcount_add		refcount_add_checked
+
+#define refcount_inc_not_zero	refcount_inc_not_zero_checked
+#define refcount_inc		refcount_inc_checked
+
+#define refcount_sub_and_test	refcount_sub_and_test_checked
 
-extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r);
+#define refcount_dec_and_test	refcount_dec_and_test_checked
+#define refcount_dec		refcount_dec_checked
 
-extern __must_check bool refcount_dec_and_test(refcount_t *r);
-extern void refcount_dec(refcount_t *r);
 #else
 # ifdef CONFIG_ARCH_HAS_REFCOUNT
 #  include <asm/refcount.h>
diff --git a/lib/refcount.c b/lib/refcount.c
index 4bd842f20749..5c4aaefc0682 100644
--- a/lib/refcount.c
+++ b/lib/refcount.c
@@ -40,10 +40,8 @@
 #include <linux/spinlock.h>
 #include <linux/bug.h>
 
-#ifdef CONFIG_REFCOUNT_FULL
-
 /**
- * refcount_add_not_zero - add a value to a refcount unless it is 0
+ * refcount_add_not_zero_checked - add a value to a refcount unless it is 0
  * @i: the value to add to the refcount
  * @r: the refcount
  *
@@ -60,7 +58,7 @@
  *
  * Return: false if the passed refcount is 0, true otherwise
  */
-bool refcount_add_not_zero(unsigned int i, refcount_t *r)
+bool refcount_add_not_zero_checked(unsigned int i, refcount_t *r)
 {
 	unsigned int new, val = atomic_read(&r->refs);
 
@@ -81,10 +79,10 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r)
 
 	return true;
 }
-EXPORT_SYMBOL(refcount_add_not_zero);
+EXPORT_SYMBOL(refcount_add_not_zero_checked);
 
 /**
- * refcount_add - add a value to a refcount
+ * refcount_add_checked - add a value to a refcount
  * @i: the value to add to the refcount
  * @r: the refcount
  *
@@ -99,14 +97,14 @@ EXPORT_SYMBOL(refcount_add_not_zero);
  * cases, refcount_inc(), or one of its variants, should instead be used to
  * increment a reference count.
  */
-void refcount_add(unsigned int i, refcount_t *r)
+void refcount_add_checked(unsigned int i, refcount_t *r)
 {
-	WARN_ONCE(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
+	WARN_ONCE(!refcount_add_not_zero_checked(i, r), "refcount_t: addition on 0; use-after-free.\n");
 }
-EXPORT_SYMBOL(refcount_add);
+EXPORT_SYMBOL(refcount_add_checked);
 
 /**
- * refcount_inc_not_zero - increment a refcount unless it is 0
+ * refcount_inc_not_zero_checked - increment a refcount unless it is 0
  * @r: the refcount to increment
  *
  * Similar to atomic_inc_not_zero(), but will saturate at UINT_MAX and WARN.
@@ -117,7 +115,7 @@ EXPORT_SYMBOL(refcount_add);
  *
  * Return: true if the increment was successful, false otherwise
  */
-bool refcount_inc_not_zero(refcount_t *r)
+bool refcount_inc_not_zero_checked(refcount_t *r)
 {
 	unsigned int new, val = atomic_read(&r->refs);
 
@@ -136,10 +134,10 @@ bool refcount_inc_not_zero(refcount_t *r)
 
 	return true;
 }
-EXPORT_SYMBOL(refcount_inc_not_zero);
+EXPORT_SYMBOL(refcount_inc_not_zero_checked);
 
 /**
- * refcount_inc - increment a refcount
+ * refcount_inc_checked - increment a refcount
  * @r: the refcount to increment
  *
  * Similar to atomic_inc(), but will saturate at UINT_MAX and WARN.
@@ -150,14 +148,14 @@ EXPORT_SYMBOL(refcount_inc_not_zero);
  * Will WARN if the refcount is 0, as this represents a possible use-after-free
  * condition.
  */
-void refcount_inc(refcount_t *r)
+void refcount_inc_checked(refcount_t *r)
 {
-	WARN_ONCE(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
+	WARN_ONCE(!refcount_inc_not_zero_checked(r), "refcount_t: increment on 0; use-after-free.\n");
 }
-EXPORT_SYMBOL(refcount_inc);
+EXPORT_SYMBOL(refcount_inc_checked);
 
 /**
- * refcount_sub_and_test - subtract from a refcount and test if it is 0
+ * refcount_sub_and_test_checked - subtract from a refcount and test if it is 0
  * @i: amount to subtract from the refcount
  * @r: the refcount
  *
@@ -176,7 +174,7 @@ EXPORT_SYMBOL(refcount_inc);
  *
  * Return: true if the resulting refcount is 0, false otherwise
  */
-bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+bool refcount_sub_and_test_checked(unsigned int i, refcount_t *r)
 {
 	unsigned int new, val = atomic_read(&r->refs);
 
@@ -194,10 +192,10 @@ bool refcount_sub_and_test(unsigned int i, refcount_t *r)
 
 	return !new;
 }
-EXPORT_SYMBOL(refcount_sub_and_test);
+EXPORT_SYMBOL(refcount_sub_and_test_checked);
 
 /**
- * refcount_dec_and_test - decrement a refcount and test if it is 0
+ * refcount_dec_and_test_checked - decrement a refcount and test if it is 0
  * @r: the refcount
  *
  * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
@@ -209,14 +207,14 @@ EXPORT_SYMBOL(refcount_sub_and_test);
  *
  * Return: true if the resulting refcount is 0, false otherwise
  */
-bool refcount_dec_and_test(refcount_t *r)
+bool refcount_dec_and_test_checked(refcount_t *r)
 {
-	return refcount_sub_and_test(1, r);
+	return refcount_sub_and_test_checked(1, r);
 }
-EXPORT_SYMBOL(refcount_dec_and_test);
+EXPORT_SYMBOL(refcount_dec_and_test_checked);
 
 /**
- * refcount_dec - decrement a refcount
+ * refcount_dec_checked - decrement a refcount
  * @r: the refcount
  *
  * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
@@ -225,12 +223,11 @@ EXPORT_SYMBOL(refcount_dec_and_test);
  * Provides release memory ordering, such that prior loads and stores are done
  * before.
  */
-void refcount_dec(refcount_t *r)
+void refcount_dec_checked(refcount_t *r)
 {
-	WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
+	WARN_ONCE(refcount_dec_and_test_checked(r), "refcount_t: decrement hit 0; leaking memory.\n");
 }
-EXPORT_SYMBOL(refcount_dec);
-#endif /* CONFIG_REFCOUNT_FULL */
+EXPORT_SYMBOL(refcount_dec_checked);
 
 /**
  * refcount_dec_if_one - decrement a refcount if it is 1
-- 
cgit v1.2.3


From 05814a10370b3252fe2b0898b6adac3cdd531096 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vz@mleia.com>
Date: Fri, 13 Jul 2018 17:07:26 +0300
Subject: block: remove blkdev_entry_to_request() macro

Remove blkdev_entry_to_request() macro, which remained unused through
the observable history, also note that it repeats list_entry_rq() macro
verbatim.

Signed-off-by: Vladimir Zapolskiy <vz@mleia.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 137759862f07..1939ed95f936 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1436,8 +1436,6 @@ enum blk_default_limits {
 	BLK_SEG_BOUNDARY_MASK	= 0xFFFFFFFFUL,
 };
 
-#define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
-
 static inline unsigned long queue_segment_boundary(struct request_queue *q)
 {
 	return q->limits.seg_boundary_mask;
-- 
cgit v1.2.3


From 8e1b706b6e819bed215c0db16345568864660393 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Fri, 13 Jul 2018 16:23:23 +0200
Subject: cpu/hotplug: Expose SMT control init function

The L1TF mitigation will gain a commend line parameter which allows to set
a combination of hypervisor mitigation and SMT control.

Expose cpu_smt_disable() so the command line parser can tweak SMT settings.

[ tglx: Split out of larger patch and made it preserve an already existing
  	force off state ]

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Jiri Kosina <jkosina@suse.cz>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lkml.kernel.org/r/20180713142323.039715135@linutronix.de
---
 include/linux/cpu.h |  2 ++
 kernel/cpu.c        | 16 +++++++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 7532cbf27b1d..c9b23ad27b38 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -177,8 +177,10 @@ enum cpuhp_smt_control {
 
 #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
 extern enum cpuhp_smt_control cpu_smt_control;
+extern void cpu_smt_disable(bool force);
 #else
 # define cpu_smt_control		(CPU_SMT_ENABLED)
+static inline void cpu_smt_disable(bool force) { }
 #endif
 
 #endif /* _LINUX_CPU_H_ */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d79e24df2420..8453e31f2d1a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -347,13 +347,23 @@ EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
 enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
 EXPORT_SYMBOL_GPL(cpu_smt_control);
 
-static int __init smt_cmdline_disable(char *str)
+void __init cpu_smt_disable(bool force)
 {
-	cpu_smt_control = CPU_SMT_DISABLED;
-	if (str && !strcmp(str, "force")) {
+	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
+		cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+		return;
+
+	if (force) {
 		pr_info("SMT: Force disabled\n");
 		cpu_smt_control = CPU_SMT_FORCE_DISABLED;
+	} else {
+		cpu_smt_control = CPU_SMT_DISABLED;
 	}
+}
+
+static int __init smt_cmdline_disable(char *str)
+{
+	cpu_smt_disable(str && !strcmp(str, "force"));
 	return 0;
 }
 early_param("nosmt", smt_cmdline_disable);
-- 
cgit v1.2.3


From fee0aede6f4739c87179eca76136f83210953b86 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 13 Jul 2018 16:23:24 +0200
Subject: cpu/hotplug: Set CPU_SMT_NOT_SUPPORTED early

The CPU_SMT_NOT_SUPPORTED state is set (if the processor does not support
SMT) when the sysfs SMT control file is initialized.

That was fine so far as this was only required to make the output of the
control file correct and to prevent writes in that case.

With the upcoming l1tf command line parameter, this needs to be set up
before the L1TF mitigation selection and command line parsing happens.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Jiri Kosina <jkosina@suse.cz>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lkml.kernel.org/r/20180713142323.121795971@linutronix.de
---
 arch/x86/kernel/cpu/bugs.c |  6 ++++++
 include/linux/cpu.h        |  2 ++
 kernel/cpu.c               | 13 ++++++++++---
 3 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 7df7df9a2753..4be69a7bef0b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -58,6 +58,12 @@ void __init check_bugs(void)
 {
 	identify_boot_cpu();
 
+	/*
+	 * identify_boot_cpu() initialized SMT support information, let the
+	 * core code know.
+	 */
+	cpu_smt_check_topology();
+
 	if (!IS_ENABLED(CONFIG_SMP)) {
 		pr_info("CPU: ");
 		print_cpu_info(&boot_cpu_data);
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index c9b23ad27b38..ccdf3a67ce56 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -178,9 +178,11 @@ enum cpuhp_smt_control {
 #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
 extern enum cpuhp_smt_control cpu_smt_control;
 extern void cpu_smt_disable(bool force);
+extern void cpu_smt_check_topology(void);
 #else
 # define cpu_smt_control		(CPU_SMT_ENABLED)
 static inline void cpu_smt_disable(bool force) { }
+static inline void cpu_smt_check_topology(void) { }
 #endif
 
 #endif /* _LINUX_CPU_H_ */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8453e31f2d1a..37eec872042b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -361,6 +361,16 @@ void __init cpu_smt_disable(bool force)
 	}
 }
 
+/*
+ * The decision whether SMT is supported can only be done after the full
+ * CPU identification. Called from architecture code.
+ */
+void __init cpu_smt_check_topology(void)
+{
+	if (!topology_smt_supported())
+		cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+}
+
 static int __init smt_cmdline_disable(char *str)
 {
 	cpu_smt_disable(str && !strcmp(str, "force"));
@@ -2115,9 +2125,6 @@ static const struct attribute_group cpuhp_smt_attr_group = {
 
 static int __init cpu_smt_state_init(void)
 {
-	if (!topology_smt_supported())
-		cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
-
 	return sysfs_create_group(&cpu_subsys.dev_root->kobj,
 				  &cpuhp_smt_attr_group);
 }
-- 
cgit v1.2.3


From 6b8675897338f874c41612655a85d8e10cdb23d8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 11 Jul 2018 20:36:39 -0700
Subject: xdp: don't make drivers report attachment mode

prog_attached of struct netdev_bpf should have been superseded
by simply setting prog_id long time ago, but we kept it around
to allow offloading drivers to communicate attachment mode (drv
vs hw).  Subsequently drivers were also allowed to report back
attachment flags (prog_flags), and since nowadays only programs
attached will XDP_FLAGS_HW_MODE can get offloaded, we can tell
the attachment mode from the flags driver reports.  Remove
prog_attached member.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c       | 1 -
 drivers/net/ethernet/cavium/thunder/nicvf_main.c    | 1 -
 drivers/net/ethernet/intel/i40e/i40e_main.c         | 1 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c       | 1 -
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c   | 1 -
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c      | 1 -
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c   | 1 -
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 3 ---
 drivers/net/ethernet/qlogic/qede/qede_filter.c      | 1 -
 drivers/net/netdevsim/bpf.c                         | 1 -
 drivers/net/tun.c                                   | 1 -
 drivers/net/virtio_net.c                            | 1 -
 include/linux/netdevice.h                           | 5 -----
 net/core/dev.c                                      | 7 +++----
 net/core/rtnetlink.c                                | 8 ++++++--
 15 files changed, 9 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index 1f0e872d0667..0584d07c8c33 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -219,7 +219,6 @@ int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 		rc = bnxt_xdp_set(bp, xdp->prog);
 		break;
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!bp->xdp_prog;
 		xdp->prog_id = bp->xdp_prog ? bp->xdp_prog->aux->id : 0;
 		rc = 0;
 		break;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 135766c4296b..768f584f8392 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -1848,7 +1848,6 @@ static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
 	case XDP_SETUP_PROG:
 		return nicvf_xdp_setup(nic, xdp->prog);
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!nic->xdp_prog;
 		xdp->prog_id = nic->xdp_prog ? nic->xdp_prog->aux->id : 0;
 		return 0;
 	default:
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 426b0ccb1fc6..51762428b40e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11841,7 +11841,6 @@ static int i40e_xdp(struct net_device *dev,
 	case XDP_SETUP_PROG:
 		return i40e_xdp_setup(vsi, xdp->prog);
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = i40e_enabled_xdp_vsi(vsi);
 		xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0;
 		return 0;
 	default:
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a8e21becb619..3862fea1c923 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9966,7 +9966,6 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	case XDP_SETUP_PROG:
 		return ixgbe_xdp_setup(dev, xdp->prog);
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!(adapter->xdp_prog);
 		xdp->prog_id = adapter->xdp_prog ?
 			adapter->xdp_prog->aux->id : 0;
 		return 0;
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 59416eddd840..d86446d202d5 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -4462,7 +4462,6 @@ static int ixgbevf_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	case XDP_SETUP_PROG:
 		return ixgbevf_xdp_setup(dev, xdp->prog);
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!(adapter->xdp_prog);
 		xdp->prog_id = adapter->xdp_prog ?
 			       adapter->xdp_prog->aux->id : 0;
 		return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 65eb06e017e4..6785661d1a72 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2926,7 +2926,6 @@ static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 		return mlx4_xdp_set(dev, xdp->prog);
 	case XDP_QUERY_PROG:
 		xdp->prog_id = mlx4_xdp_query(dev);
-		xdp->prog_attached = !!xdp->prog_id;
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bbd2fd0b2e06..e4a9a0768a81 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4192,7 +4192,6 @@ static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 		return mlx5e_xdp_set(dev, xdp->prog);
 	case XDP_QUERY_PROG:
 		xdp->prog_id = mlx5e_xdp_query(dev);
-		xdp->prog_attached = !!xdp->prog_id;
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 7df5ca37bfb8..d20714598613 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3459,9 +3459,6 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
 		return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags,
 					 xdp->extack);
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!nn->xdp_prog;
-		if (nn->dp.bpf_offload_xdp)
-			xdp->prog_attached = XDP_ATTACHED_HW;
 		xdp->prog_id = nn->xdp_prog ? nn->xdp_prog->aux->id : 0;
 		xdp->prog_flags = nn->xdp_prog ? nn->xdp_flags : 0;
 		return 0;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index b823bfe2ea4d..f9a327c821eb 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -1116,7 +1116,6 @@ int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	case XDP_SETUP_PROG:
 		return qede_xdp_set(edev, xdp->prog);
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!edev->xdp_prog;
 		xdp->prog_id = edev->xdp_prog ? edev->xdp_prog->aux->id : 0;
 		return 0;
 	default:
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 75c25306d234..712e6f918065 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -567,7 +567,6 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 		nsim_bpf_destroy_prog(bpf->offload.prog);
 		return 0;
 	case XDP_QUERY_PROG:
-		bpf->prog_attached = ns->xdp_prog_mode;
 		bpf->prog_id = ns->xdp_prog ? ns->xdp_prog->aux->id : 0;
 		bpf->prog_flags = ns->xdp_prog ? ns->xdp_flags : 0;
 		return 0;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a192a017cc68..49a50219d0da 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1268,7 +1268,6 @@ static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 		return tun_xdp_set(dev, xdp->prog, xdp->extack);
 	case XDP_QUERY_PROG:
 		xdp->prog_id = tun_xdp_query(dev);
-		xdp->prog_attached = !!xdp->prog_id;
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 53085c63277b..2ff08bc103a9 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2343,7 +2343,6 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
 	case XDP_QUERY_PROG:
 		xdp->prog_id = virtnet_xdp_query(dev);
-		xdp->prog_attached = !!xdp->prog_id;
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b683971e500d..69a664789b33 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -819,10 +819,6 @@ enum bpf_netdev_command {
 	 */
 	XDP_SETUP_PROG,
 	XDP_SETUP_PROG_HW,
-	/* Check if a bpf program is set on the device.  The callee should
-	 * set @prog_attached to one of XDP_ATTACHED_* values, note that "true"
-	 * is equivalent to XDP_ATTACHED_DRV.
-	 */
 	XDP_QUERY_PROG,
 	/* BPF program for offload callbacks, invoked at program load time. */
 	BPF_OFFLOAD_VERIFIER_PREP,
@@ -849,7 +845,6 @@ struct netdev_bpf {
 		};
 		/* XDP_QUERY_PROG */
 		struct {
-			u8 prog_attached;
 			u32 prog_id;
 			/* flags with which program was installed */
 			u32 prog_flags;
diff --git a/net/core/dev.c b/net/core/dev.c
index 89825c1eccdc..9fa3b3705a8e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4926,7 +4926,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 		break;
 
 	case XDP_QUERY_PROG:
-		xdp->prog_attached = !!old;
 		xdp->prog_id = old ? old->aux->id : 0;
 		break;
 
@@ -7593,13 +7592,13 @@ void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
 	WARN_ON(bpf_op(dev, xdp) < 0);
 }
 
-static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
+static bool __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
 {
 	struct netdev_bpf xdp;
 
 	__dev_xdp_query(dev, bpf_op, &xdp);
 
-	return xdp.prog_attached;
+	return xdp.prog_id;
 }
 
 static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
@@ -7634,7 +7633,7 @@ static void dev_xdp_uninstall(struct net_device *dev)
 		return;
 
 	__dev_xdp_query(dev, ndo_bpf, &xdp);
-	if (xdp.prog_attached == XDP_ATTACHED_NONE)
+	if (!xdp.prog_id)
 		return;
 
 	/* Program removal should always succeed */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b40242459907..02ebc056a688 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1372,9 +1372,13 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
 		return XDP_ATTACHED_NONE;
 
 	__dev_xdp_query(dev, ops->ndo_bpf, &xdp);
-	*prog_id = xdp.prog_id;
+	if (!xdp.prog_id)
+		return XDP_ATTACHED_NONE;
 
-	return xdp.prog_attached;
+	*prog_id = xdp.prog_id;
+	if (xdp.prog_flags & XDP_FLAGS_HW_MODE)
+		return XDP_ATTACHED_HW;
+	return XDP_ATTACHED_DRV;
 }
 
 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
-- 
cgit v1.2.3


From a25717d2b604347d9af8da81deea7b08e8c94220 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 11 Jul 2018 20:36:41 -0700
Subject: xdp: support simultaneous driver and hw XDP attachment

Split the query of HW-attached program from the software one.
Introduce new .ndo_bpf command to query HW-attached program.
This will allow drivers to install different programs in HW
and SW at the same time.  Netlink can now also carry multiple
programs on dump (in which case mode will be set to
XDP_ATTACHED_MULTI and user has to check per-attachment point
attributes, IFLA_XDP_PROG_ID will not be present).  We reuse
IFLA_XDP_PROG_ID skb space for second mode, so rtnl_xdp_size()
doesn't need to be updated.

Note that the installation side is still not there, since all
drivers currently reject installing more than one program at
the time.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |  6 ++
 drivers/net/netdevsim/bpf.c                        |  6 ++
 include/linux/netdevice.h                          |  7 +-
 include/uapi/linux/if_link.h                       |  1 +
 net/core/dev.c                                     | 45 ++++++-----
 net/core/rtnetlink.c                               | 93 ++++++++++++----------
 6 files changed, 96 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 4bb589dbffbc..bb1e72e8dbc2 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3453,6 +3453,12 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
 	case XDP_SETUP_PROG_HW:
 		return nfp_net_xdp_setup(nn, xdp);
 	case XDP_QUERY_PROG:
+		if (nn->dp.bpf_offload_xdp)
+			return 0;
+		return xdp_attachment_query(&nn->xdp, xdp);
+	case XDP_QUERY_PROG_HW:
+		if (!nn->dp.bpf_offload_xdp)
+			return 0;
 		return xdp_attachment_query(&nn->xdp, xdp);
 	default:
 		return nfp_app_bpf(nn->app, nn, xdp);
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index c485d97b5df4..5544c9b51173 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -561,6 +561,12 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 		nsim_bpf_destroy_prog(bpf->offload.prog);
 		return 0;
 	case XDP_QUERY_PROG:
+		if (ns->xdp_prog_mode != XDP_ATTACHED_DRV)
+			return 0;
+		return xdp_attachment_query(&ns->xdp, bpf);
+	case XDP_QUERY_PROG_HW:
+		if (ns->xdp_prog_mode != XDP_ATTACHED_HW)
+			return 0;
 		return xdp_attachment_query(&ns->xdp, bpf);
 	case XDP_SETUP_PROG:
 		err = nsim_setup_prog_checks(ns, bpf);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 69a664789b33..2422c0e88f5c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -820,6 +820,7 @@ enum bpf_netdev_command {
 	XDP_SETUP_PROG,
 	XDP_SETUP_PROG_HW,
 	XDP_QUERY_PROG,
+	XDP_QUERY_PROG_HW,
 	/* BPF program for offload callbacks, invoked at program load time. */
 	BPF_OFFLOAD_VERIFIER_PREP,
 	BPF_OFFLOAD_TRANSLATE,
@@ -843,7 +844,7 @@ struct netdev_bpf {
 			struct bpf_prog *prog;
 			struct netlink_ext_ack *extack;
 		};
-		/* XDP_QUERY_PROG */
+		/* XDP_QUERY_PROG, XDP_QUERY_PROG_HW */
 		struct {
 			u32 prog_id;
 			/* flags with which program was installed */
@@ -3533,8 +3534,8 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, u32 flags);
-void __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op,
-		     struct netdev_bpf *xdp);
+u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op,
+		    enum bpf_netdev_command cmd);
 
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index bc86c2b105ec..8759cfb8aa2e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -920,6 +920,7 @@ enum {
 	XDP_ATTACHED_DRV,
 	XDP_ATTACHED_SKB,
 	XDP_ATTACHED_HW,
+	XDP_ATTACHED_MULTI,
 };
 
 enum {
diff --git a/net/core/dev.c b/net/core/dev.c
index 9fa3b3705a8e..993cdc3cd086 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7582,21 +7582,19 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down);
 
-void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
-		     struct netdev_bpf *xdp)
+u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
+		    enum bpf_netdev_command cmd)
 {
-	memset(xdp, 0, sizeof(*xdp));
-	xdp->command = XDP_QUERY_PROG;
+	struct netdev_bpf xdp;
 
-	/* Query must always succeed. */
-	WARN_ON(bpf_op(dev, xdp) < 0);
-}
+	if (!bpf_op)
+		return 0;
 
-static bool __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
-{
-	struct netdev_bpf xdp;
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = cmd;
 
-	__dev_xdp_query(dev, bpf_op, &xdp);
+	/* Query must always succeed. */
+	WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
 
 	return xdp.prog_id;
 }
@@ -7632,12 +7630,19 @@ static void dev_xdp_uninstall(struct net_device *dev)
 	if (!ndo_bpf)
 		return;
 
-	__dev_xdp_query(dev, ndo_bpf, &xdp);
-	if (!xdp.prog_id)
-		return;
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = XDP_QUERY_PROG;
+	WARN_ON(ndo_bpf(dev, &xdp));
+	if (xdp.prog_id)
+		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+					NULL));
 
-	/* Program removal should always succeed */
-	WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
+	/* Remove HW offload */
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = XDP_QUERY_PROG_HW;
+	if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
+		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+					NULL));
 }
 
 /**
@@ -7653,12 +7658,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, u32 flags)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
+	enum bpf_netdev_command query;
 	struct bpf_prog *prog = NULL;
 	bpf_op_t bpf_op, bpf_chk;
 	int err;
 
 	ASSERT_RTNL();
 
+	query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
+
 	bpf_op = bpf_chk = ops->ndo_bpf;
 	if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
 		return -EOPNOTSUPP;
@@ -7668,10 +7676,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		bpf_chk = generic_xdp_install;
 
 	if (fd >= 0) {
-		if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
+		if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
+		    __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
 			return -EEXIST;
 		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
-		    __dev_xdp_attached(dev, bpf_op))
+		    __dev_xdp_query(dev, bpf_op, query))
 			return -EBUSY;
 
 		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 02ebc056a688..c9929ef17539 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -964,7 +964,7 @@ static size_t rtnl_xdp_size(void)
 {
 	size_t xdp_size = nla_total_size(0) +	/* nest IFLA_XDP */
 			  nla_total_size(1) +	/* XDP_ATTACHED */
-			  nla_total_size(4) +	/* XDP_PROG_ID */
+			  nla_total_size(4) +	/* XDP_PROG_ID (or 1st mode) */
 			  nla_total_size(4);	/* XDP_<mode>_PROG_ID */
 
 	return xdp_size;
@@ -1354,37 +1354,57 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }
 
-static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
+static u32 rtnl_xdp_prog_skb(struct net_device *dev)
 {
-	const struct net_device_ops *ops = dev->netdev_ops;
 	const struct bpf_prog *generic_xdp_prog;
-	struct netdev_bpf xdp;
 
 	ASSERT_RTNL();
 
-	*prog_id = 0;
 	generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
-	if (generic_xdp_prog) {
-		*prog_id = generic_xdp_prog->aux->id;
-		return XDP_ATTACHED_SKB;
-	}
-	if (!ops->ndo_bpf)
-		return XDP_ATTACHED_NONE;
+	if (!generic_xdp_prog)
+		return 0;
+	return generic_xdp_prog->aux->id;
+}
+
+static u32 rtnl_xdp_prog_drv(struct net_device *dev)
+{
+	return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
+}
+
+static u32 rtnl_xdp_prog_hw(struct net_device *dev)
+{
+	return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf,
+			       XDP_QUERY_PROG_HW);
+}
+
+static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
+			       u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr,
+			       u32 (*get_prog_id)(struct net_device *dev))
+{
+	u32 curr_id;
+	int err;
+
+	curr_id = get_prog_id(dev);
+	if (!curr_id)
+		return 0;
+
+	*prog_id = curr_id;
+	err = nla_put_u32(skb, attr, curr_id);
+	if (err)
+		return err;
 
-	__dev_xdp_query(dev, ops->ndo_bpf, &xdp);
-	if (!xdp.prog_id)
-		return XDP_ATTACHED_NONE;
+	if (*mode != XDP_ATTACHED_NONE)
+		*mode = XDP_ATTACHED_MULTI;
+	else
+		*mode = tgt_mode;
 
-	*prog_id = xdp.prog_id;
-	if (xdp.prog_flags & XDP_FLAGS_HW_MODE)
-		return XDP_ATTACHED_HW;
-	return XDP_ATTACHED_DRV;
+	return 0;
 }
 
 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 {
-	u32 prog_attr, prog_id;
 	struct nlattr *xdp;
+	u32 prog_id;
 	int err;
 	u8 mode;
 
@@ -1392,35 +1412,26 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 	if (!xdp)
 		return -EMSGSIZE;
 
-	mode = rtnl_xdp_attached_mode(dev, &prog_id);
+	prog_id = 0;
+	mode = XDP_ATTACHED_NONE;
+	if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB,
+				IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb))
+		goto err_cancel;
+	if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV,
+				IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv))
+		goto err_cancel;
+	if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW,
+				IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw))
+		goto err_cancel;
+
 	err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode);
 	if (err)
 		goto err_cancel;
 
-	if (prog_id) {
+	if (prog_id && mode != XDP_ATTACHED_MULTI) {
 		err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);
 		if (err)
 			goto err_cancel;
-
-		switch (mode) {
-		case XDP_ATTACHED_DRV:
-			prog_attr = IFLA_XDP_DRV_PROG_ID;
-			break;
-		case XDP_ATTACHED_SKB:
-			prog_attr = IFLA_XDP_SKB_PROG_ID;
-			break;
-		case XDP_ATTACHED_HW:
-			prog_attr = IFLA_XDP_HW_PROG_ID;
-			break;
-		case XDP_ATTACHED_NONE:
-		default:
-			err = -EINVAL;
-			goto err_cancel;
-		}
-
-		err = nla_put_u32(skb, prog_attr, prog_id);
-		if (err)
-			goto err_cancel;
 	}
 
 	nla_nest_end(skb, xdp);
-- 
cgit v1.2.3


From c921c2077b32081617789a645120148bc8b60c98 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 13 Jul 2018 12:16:43 +0300
Subject: net: ipmr: add support for passing full packet on wrong vif

This patch adds support for IGMPMSG_WRVIFWHOLE which is used to pass
full packet and real vif id when the incoming interface is wrong.
While the RP and FHR are setting up state we need to be sending the
registers encapsulated with all the data inside otherwise we lose it.
The RP then decapsulates it and forwards it to the interested parties.
Currently with WRONGVIF we can only be sending empty register packets
and will lose that data.
This behaviour can be enabled by using MRT_PIM with
val == IGMPMSG_WRVIFWHOLE. This doesn't prevent IGMPMSG_WRONGVIF from
happening, it happens in addition to it, also it is controlled by the same
throttling parameters as WRONGVIF (i.e. 1 packet per 3 seconds currently).
Both messages are generated to keep backwards compatibily and avoid
breaking someone who was enabling MRT_PIM with val == 4, since any
positive val is accepted and treated the same.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute_base.h |  1 +
 include/uapi/linux/mroute.h |  2 ++
 net/ipv4/ipmr.c             | 21 ++++++++++++++++-----
 3 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index fd436cdd4725..6675b9f81979 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -254,6 +254,7 @@ struct mr_table {
 	atomic_t		cache_resolve_queue_len;
 	bool			mroute_do_assert;
 	bool			mroute_do_pim;
+	bool			mroute_do_wrvifwhole;
 	int			mroute_reg_vif_num;
 };
 
diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h
index 10f9ff9426a2..5d37a9ccce63 100644
--- a/include/uapi/linux/mroute.h
+++ b/include/uapi/linux/mroute.h
@@ -120,6 +120,7 @@ enum {
 	IPMRA_TABLE_MROUTE_DO_ASSERT,
 	IPMRA_TABLE_MROUTE_DO_PIM,
 	IPMRA_TABLE_VIFS,
+	IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE,
 	__IPMRA_TABLE_MAX
 };
 #define IPMRA_TABLE_MAX (__IPMRA_TABLE_MAX - 1)
@@ -173,5 +174,6 @@ enum {
 #define IGMPMSG_NOCACHE		1		/* Kern cache fill request to mrouted */
 #define IGMPMSG_WRONGVIF	2		/* For PIM assert processing (unused) */
 #define IGMPMSG_WHOLEPKT	3		/* For PIM Register processing */
+#define IGMPMSG_WRVIFWHOLE	4		/* For PIM Register and assert processing */
 
 #endif /* _UAPI__LINUX_MROUTE_H */
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 82f914122f1b..5660adcf7a04 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1052,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
 	struct sk_buff *skb;
 	int ret;
 
-	if (assert == IGMPMSG_WHOLEPKT)
+	if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
 		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
 	else
 		skb = alloc_skb(128, GFP_ATOMIC);
@@ -1060,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
 	if (!skb)
 		return -ENOBUFS;
 
-	if (assert == IGMPMSG_WHOLEPKT) {
+	if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) {
 		/* Ugly, but we have no choice with this interface.
 		 * Duplicate old header, fix ihl, length etc.
 		 * And all this only to mangle msg->im_msgtype and
@@ -1071,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt,
 		skb_reset_transport_header(skb);
 		msg = (struct igmpmsg *)skb_network_header(skb);
 		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
-		msg->im_msgtype = IGMPMSG_WHOLEPKT;
+		msg->im_msgtype = assert;
 		msg->im_mbz = 0;
-		msg->im_vif = mrt->mroute_reg_vif_num;
+		if (assert == IGMPMSG_WRVIFWHOLE)
+			msg->im_vif = vifi;
+		else
+			msg->im_vif = mrt->mroute_reg_vif_num;
 		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
 		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
 					     sizeof(struct iphdr));
@@ -1372,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
 	struct mr_table *mrt;
 	struct vifctl vif;
 	struct mfcctl mfc;
+	bool do_wrvifwhole;
 	u32 uval;
 
 	/* There's one exception to the lock - MRT_DONE which needs to unlock */
@@ -1502,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
 			break;
 		}
 
+		do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE);
 		val = !!val;
 		if (val != mrt->mroute_do_pim) {
 			mrt->mroute_do_pim = val;
 			mrt->mroute_do_assert = val;
+			mrt->mroute_do_wrvifwhole = do_wrvifwhole;
 		}
 		break;
 	case MRT_TABLE:
@@ -1983,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 			       MFC_ASSERT_THRESH)) {
 			c->_c.mfc_un.res.last_assert = jiffies;
 			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
+			if (mrt->mroute_do_wrvifwhole)
+				ipmr_cache_report(mrt, skb, true_vifi,
+						  IGMPMSG_WRVIFWHOLE);
 		}
 		goto dont_forward;
 	}
@@ -2659,7 +2668,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
 			mrt->mroute_reg_vif_num) ||
 	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
 		       mrt->mroute_do_assert) ||
-	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim))
+	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) ||
+	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE,
+		       mrt->mroute_do_wrvifwhole))
 		return false;
 
 	return true;
-- 
cgit v1.2.3


From 5fd778915ad29184a5ff8eb82d1118f6916b79e4 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Thu, 28 Jun 2018 17:45:14 +0200
Subject: sched/sysctl: Remove unused sched_time_avg_ms sysctl

/proc/sys/kernel/sched_time_avg_ms entry is not used anywhere,
remove it.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Luis R. Rodriguez <mcgrof@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten.Rasmussen@arm.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: claudio@evidence.eu.com
Cc: daniel.lezcano@linaro.org
Cc: dietmar.eggemann@arm.com
Cc: joel@joelfernandes.org
Cc: juri.lelli@redhat.com
Cc: luca.abeni@santannapisa.it
Cc: patrick.bellasi@arm.com
Cc: quentin.perret@arm.com
Cc: rjw@rjwysocki.net
Cc: valentin.schneider@arm.com
Cc: viresh.kumar@linaro.org
Link: http://lkml.kernel.org/r/1530200714-4504-12-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/sysctl.h | 1 -
 kernel/sched/core.c          | 8 --------
 kernel/sched/sched.h         | 1 -
 kernel/sysctl.c              | 8 --------
 4 files changed, 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 1c1a1512ec55..913488d828cb 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -40,7 +40,6 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 #ifdef CONFIG_SCHED_DEBUG
 extern __read_mostly unsigned int sysctl_sched_migration_cost;
 extern __read_mostly unsigned int sysctl_sched_nr_migrate;
-extern __read_mostly unsigned int sysctl_sched_time_avg;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a691b07390ab..ba6bb805693a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -46,14 +46,6 @@ const_debug unsigned int sysctl_sched_features =
  */
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
-/*
- * period over which we average the RT time consumption, measured
- * in ms.
- *
- * default: 1s
- */
-const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
-
 /*
  * period over which we measure -rt task CPU usage in us.
  * default: 1s
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14aac2d2de80..ebb4b3c3ece7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1713,7 +1713,6 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
 
 extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
 
-extern const_debug unsigned int sysctl_sched_time_avg;
 extern const_debug unsigned int sysctl_sched_nr_migrate;
 extern const_debug unsigned int sysctl_sched_migration_cost;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2d9837c0aff4..f22f76b7a138 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -368,14 +368,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "sched_time_avg_ms",
-		.data		= &sysctl_sched_time_avg,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
-	},
 #ifdef CONFIG_SCHEDSTATS
 	{
 		.procname	= "sched_schedstats",
-- 
cgit v1.2.3


From 788faab70d5a882693286b8d5022779559c79904 Mon Sep 17 00:00:00 2001
From: Tobias Tefke <tobias.tefke@gmail.com>
Date: Mon, 9 Jul 2018 12:57:15 +0200
Subject: perf, tools: Use correct articles in comments

Some of the comments in the perf events code use articles incorrectly,
using 'a' for words beginning with a vowel sound, where 'an' should be
used.

Signed-off-by: Tobias Tefke <tobias.tefke@tutanota.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@kernel.org
Cc: alexander.shishkin@linux.intel.com
Cc: jolsa@redhat.com
Cc: namhyung@kernel.org
Link: http://lkml.kernel.org/r/20180709105715.22938-1-tobias.tefke@tutanota.com
[ Fix a few more perf related 'a event' typo fixes from all around the kernel and tooling tree. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/perf/core-book3s.c          |  6 +++---
 include/linux/perf_event.h               |  2 +-
 kernel/events/core.c                     | 16 ++++++++--------
 kernel/events/uprobes.c                  |  6 +++---
 tools/perf/Documentation/perf-list.txt   |  2 +-
 tools/perf/Documentation/perf-record.txt |  4 ++--
 6 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 3f66fcf8ad99..19d8ab49d1bd 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -1469,7 +1469,7 @@ static int collect_events(struct perf_event *group, int max_count,
 }
 
 /*
- * Add a event to the PMU.
+ * Add an event to the PMU.
  * If all events are not already frozen, then we disable and
  * re-enable the PMU in order to get hw_perf_enable to do the
  * actual work of reconfiguring the PMU.
@@ -1548,7 +1548,7 @@ nocheck:
 }
 
 /*
- * Remove a event from the PMU.
+ * Remove an event from the PMU.
  */
 static void power_pmu_del(struct perf_event *event, int ef_flags)
 {
@@ -1742,7 +1742,7 @@ static int power_pmu_commit_txn(struct pmu *pmu)
 /*
  * Return 1 if we might be able to put event on a limited PMC,
  * or 0 if not.
- * A event can only go on a limited PMC if it counts something
+ * An event can only go on a limited PMC if it counts something
  * that a limited PMC can count, doesn't require interrupts, and
  * doesn't exclude any processor mode.
  */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1fa12887ec02..e6dd3a2f8ec4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -490,7 +490,7 @@ struct perf_addr_filters_head {
 };
 
 /**
- * enum perf_event_state - the states of a event
+ * enum perf_event_state - the states of an event:
  */
 enum perf_event_state {
 	PERF_EVENT_STATE_DEAD		= -4,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a4d5ac6d74af..86b31e5a5a9a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1656,7 +1656,7 @@ perf_event_groups_next(struct perf_event *event)
 				typeof(*event), group_node))
 
 /*
- * Add a event from the lists for its context.
+ * Add an event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
  */
 static void
@@ -1844,7 +1844,7 @@ static void perf_group_attach(struct perf_event *event)
 }
 
 /*
- * Remove a event from the lists for its context.
+ * Remove an event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
  */
 static void
@@ -2148,7 +2148,7 @@ static void __perf_event_disable(struct perf_event *event,
 }
 
 /*
- * Disable a event.
+ * Disable an event.
  *
  * If event->ctx is a cloned context, callers must make sure that
  * every task struct that event->ctx->task could possibly point to
@@ -2677,7 +2677,7 @@ static void __perf_event_enable(struct perf_event *event,
 }
 
 /*
- * Enable a event.
+ * Enable an event.
  *
  * If event->ctx is a cloned context, callers must make sure that
  * every task struct that event->ctx->task could possibly point to
@@ -2755,7 +2755,7 @@ static int __perf_event_stop(void *info)
 	 * events will refuse to restart because of rb::aux_mmap_count==0,
 	 * see comments in perf_aux_output_begin().
 	 *
-	 * Since this is happening on a event-local CPU, no trace is lost
+	 * Since this is happening on an event-local CPU, no trace is lost
 	 * while restarting.
 	 */
 	if (sd->restart)
@@ -4827,7 +4827,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count)
 	int ret;
 
 	/*
-	 * Return end-of-file for a read on a event that is in
+	 * Return end-of-file for a read on an event that is in
 	 * error state (i.e. because it was pinned but it couldn't be
 	 * scheduled on to the CPU at some point).
 	 */
@@ -9898,7 +9898,7 @@ enabled:
 }
 
 /*
- * Allocate and initialize a event structure
+ * Allocate and initialize an event structure
  */
 static struct perf_event *
 perf_event_alloc(struct perf_event_attr *attr, int cpu,
@@ -11229,7 +11229,7 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 }
 
 /*
- * Inherit a event from parent task to child task.
+ * Inherit an event from parent task to child task.
  *
  * Returns:
  *  - valid pointer on success
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ccc579a7d32e..aed1ba569954 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -918,7 +918,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
 EXPORT_SYMBOL_GPL(uprobe_register);
 
 /*
- * uprobe_apply - unregister a already registered probe.
+ * uprobe_apply - unregister an already registered probe.
  * @inode: the file in which the probe has to be removed.
  * @offset: offset from the start of the file.
  * @uc: consumer which wants to add more or remove some breakpoints
@@ -947,7 +947,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
 }
 
 /*
- * uprobe_unregister - unregister a already registered probe.
+ * uprobe_unregister - unregister an already registered probe.
  * @inode: the file in which the probe has to be removed.
  * @offset: offset from the start of the file.
  * @uc: identify which probe if multiple probes are colocated.
@@ -1403,7 +1403,7 @@ static struct return_instance *free_ret_instance(struct return_instance *ri)
 
 /*
  * Called with no locks held.
- * Called in context of a exiting or a exec-ing thread.
+ * Called in context of an exiting or an exec-ing thread.
  */
 void uprobe_free_utask(struct task_struct *t)
 {
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 11300dbe35c5..14e13512c05f 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -234,7 +234,7 @@ perf also supports group leader sampling using the :S specifier.
   perf record -e '{cycles,instructions}:S' ...
   perf report --group
 
-Normally all events in a event group sample, but with :S only
+Normally all events in an event group sample, but with :S only
 the first event (the leader) samples, and it only reads the values of the
 other events in the group.
 
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 04168da4268e..246dee081efd 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -94,7 +94,7 @@ OPTIONS
 	  "perf report" to view group events together.
 
 --filter=<filter>::
-        Event filter. This option should follow a event selector (-e) which
+        Event filter. This option should follow an event selector (-e) which
 	selects either tracepoint event(s) or a hardware trace PMU
 	(e.g. Intel PT or CoreSight).
 
@@ -153,7 +153,7 @@ OPTIONS
 
 --exclude-perf::
 	Don't record events issued by perf itself. This option should follow
-	a event selector (-e) which selects tracepoint event(s). It adds a
+	an event selector (-e) which selects tracepoint event(s). It adds a
 	filter expression 'common_pid != $PERFPID' to filters. If other
 	'--filter' exists, the new filter expression will be combined with
 	them by '&&'.
-- 
cgit v1.2.3


From 3eb420e70d879ce0e6bf752accf5cdedb0a59de8 Mon Sep 17 00:00:00 2001
From: Sai Praneeth <sai.praneeth.prakhya@intel.com>
Date: Wed, 11 Jul 2018 11:40:35 +0200
Subject: efi: Use a work queue to invoke EFI Runtime Services

Presently, when a user process requests the kernel to execute any
UEFI runtime service, the kernel temporarily switches to a separate
set of page tables that describe the virtual mapping of the UEFI
runtime services regions in memory. Since UEFI runtime services are
typically invoked with interrupts enabled, any code that may be called
during this time, will have an incorrect view of the process's address
space. Although it is unusual for code running in interrupt context to
make assumptions about the process context it runs in, there are cases
(such as the perf subsystem taking samples) where this causes problems.

So let's set up a work queue for calling UEFI runtime services, so that
the actual calls are made when the work queue items are dispatched by a
work queue worker running in a separate kernel thread. Such threads are
not expected to have userland mappings in the first place, and so the
additional mappings created for the UEFI runtime services can never
clash with any.

The ResetSystem() runtime service is not covered by the work queue
handling, since it is not expected to return, and may be called at a
time when the kernel is torn down to the point where we cannot expect
work queues to still be operational.

The non-blocking variants of SetVariable() and QueryVariableInfo()
are also excluded: these are intended to be used from atomic context,
which obviously rules out waiting for a completion to be signalled by
another thread. Note that these variants are currently only used for
UEFI runtime services calls that occur very early in the boot, and
for ones that occur in critical conditions, e.g., to flush kernel logs
to UEFI variables via efi-pstore.

Suggested-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
[ardb: exclude ResetSystem() from the workqueue treatment
       merge from 2 separate patches and rewrite commit log]
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20180711094040.12506-4-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/firmware/efi/efi.c              |  14 +++
 drivers/firmware/efi/runtime-wrappers.c | 202 +++++++++++++++++++++++++++++---
 include/linux/efi.h                     |   3 +
 3 files changed, 204 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 232f4915223b..1379a375dfa8 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -84,6 +84,8 @@ struct mm_struct efi_mm = {
 	.mmlist			= LIST_HEAD_INIT(efi_mm.mmlist),
 };
 
+struct workqueue_struct *efi_rts_wq;
+
 static bool disable_runtime;
 static int __init setup_noefi(char *arg)
 {
@@ -337,6 +339,18 @@ static int __init efisubsys_init(void)
 	if (!efi_enabled(EFI_BOOT))
 		return 0;
 
+	/*
+	 * Since we process only one efi_runtime_service() at a time, an
+	 * ordered workqueue (which creates only one execution context)
+	 * should suffice all our needs.
+	 */
+	efi_rts_wq = alloc_ordered_workqueue("efi_rts_wq", 0);
+	if (!efi_rts_wq) {
+		pr_err("Creating efi_rts_wq failed, EFI runtime services disabled.\n");
+		clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+		return 0;
+	}
+
 	/* We register the efi directory at /sys/firmware/efi */
 	efi_kobj = kobject_create_and_add("efi", firmware_kobj);
 	if (!efi_kobj) {
diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c
index ae54870b2788..aa66cbf23512 100644
--- a/drivers/firmware/efi/runtime-wrappers.c
+++ b/drivers/firmware/efi/runtime-wrappers.c
@@ -1,6 +1,15 @@
 /*
  * runtime-wrappers.c - Runtime Services function call wrappers
  *
+ * Implementation summary:
+ * -----------------------
+ * 1. When user/kernel thread requests to execute efi_runtime_service(),
+ * enqueue work to efi_rts_wq.
+ * 2. Caller thread waits for completion until the work is finished
+ * because it's dependent on the return status and execution of
+ * efi_runtime_service().
+ * For instance, get_variable() and get_next_variable().
+ *
  * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  * Split off from arch/x86/platform/efi/efi.c
@@ -22,6 +31,9 @@
 #include <linux/mutex.h>
 #include <linux/semaphore.h>
 #include <linux/stringify.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+
 #include <asm/efi.h>
 
 /*
@@ -33,6 +45,76 @@
 #define __efi_call_virt(f, args...) \
 	__efi_call_virt_pointer(efi.systab->runtime, f, args)
 
+/* efi_runtime_service() function identifiers */
+enum efi_rts_ids {
+	GET_TIME,
+	SET_TIME,
+	GET_WAKEUP_TIME,
+	SET_WAKEUP_TIME,
+	GET_VARIABLE,
+	GET_NEXT_VARIABLE,
+	SET_VARIABLE,
+	QUERY_VARIABLE_INFO,
+	GET_NEXT_HIGH_MONO_COUNT,
+	UPDATE_CAPSULE,
+	QUERY_CAPSULE_CAPS,
+};
+
+/*
+ * efi_runtime_work:	Details of EFI Runtime Service work
+ * @arg<1-5>:		EFI Runtime Service function arguments
+ * @status:		Status of executing EFI Runtime Service
+ * @efi_rts_id:		EFI Runtime Service function identifier
+ * @efi_rts_comp:	Struct used for handling completions
+ */
+struct efi_runtime_work {
+	void *arg1;
+	void *arg2;
+	void *arg3;
+	void *arg4;
+	void *arg5;
+	efi_status_t status;
+	struct work_struct work;
+	enum efi_rts_ids efi_rts_id;
+	struct completion efi_rts_comp;
+};
+
+/*
+ * efi_queue_work:	Queue efi_runtime_service() and wait until it's done
+ * @rts:		efi_runtime_service() function identifier
+ * @rts_arg<1-5>:	efi_runtime_service() function arguments
+ *
+ * Accesses to efi_runtime_services() are serialized by a binary
+ * semaphore (efi_runtime_lock) and caller waits until the work is
+ * finished, hence _only_ one work is queued at a time and the caller
+ * thread waits for completion.
+ */
+#define efi_queue_work(_rts, _arg1, _arg2, _arg3, _arg4, _arg5)		\
+({									\
+	struct efi_runtime_work efi_rts_work;				\
+	efi_rts_work.status = EFI_ABORTED;				\
+									\
+	init_completion(&efi_rts_work.efi_rts_comp);			\
+	INIT_WORK_ONSTACK(&efi_rts_work.work, efi_call_rts);		\
+	efi_rts_work.arg1 = _arg1;					\
+	efi_rts_work.arg2 = _arg2;					\
+	efi_rts_work.arg3 = _arg3;					\
+	efi_rts_work.arg4 = _arg4;					\
+	efi_rts_work.arg5 = _arg5;					\
+	efi_rts_work.efi_rts_id = _rts;					\
+									\
+	/*								\
+	 * queue_work() returns 0 if work was already on queue,         \
+	 * _ideally_ this should never happen.                          \
+	 */								\
+	if (queue_work(efi_rts_wq, &efi_rts_work.work))			\
+		wait_for_completion(&efi_rts_work.efi_rts_comp);	\
+	else								\
+		pr_err("Failed to queue work to efi_rts_wq.\n");	\
+									\
+	efi_rts_work.status;						\
+})
+
 void efi_call_virt_check_flags(unsigned long flags, const char *call)
 {
 	unsigned long cur_flags, mismatch;
@@ -90,13 +172,98 @@ void efi_call_virt_check_flags(unsigned long flags, const char *call)
  */
 static DEFINE_SEMAPHORE(efi_runtime_lock);
 
+/*
+ * Calls the appropriate efi_runtime_service() with the appropriate
+ * arguments.
+ *
+ * Semantics followed by efi_call_rts() to understand efi_runtime_work:
+ * 1. If argument was a pointer, recast it from void pointer to original
+ * pointer type.
+ * 2. If argument was a value, recast it from void pointer to original
+ * pointer type and dereference it.
+ */
+static void efi_call_rts(struct work_struct *work)
+{
+	struct efi_runtime_work *efi_rts_work;
+	void *arg1, *arg2, *arg3, *arg4, *arg5;
+	efi_status_t status = EFI_NOT_FOUND;
+
+	efi_rts_work = container_of(work, struct efi_runtime_work, work);
+	arg1 = efi_rts_work->arg1;
+	arg2 = efi_rts_work->arg2;
+	arg3 = efi_rts_work->arg3;
+	arg4 = efi_rts_work->arg4;
+	arg5 = efi_rts_work->arg5;
+
+	switch (efi_rts_work->efi_rts_id) {
+	case GET_TIME:
+		status = efi_call_virt(get_time, (efi_time_t *)arg1,
+				       (efi_time_cap_t *)arg2);
+		break;
+	case SET_TIME:
+		status = efi_call_virt(set_time, (efi_time_t *)arg1);
+		break;
+	case GET_WAKEUP_TIME:
+		status = efi_call_virt(get_wakeup_time, (efi_bool_t *)arg1,
+				       (efi_bool_t *)arg2, (efi_time_t *)arg3);
+		break;
+	case SET_WAKEUP_TIME:
+		status = efi_call_virt(set_wakeup_time, *(efi_bool_t *)arg1,
+				       (efi_time_t *)arg2);
+		break;
+	case GET_VARIABLE:
+		status = efi_call_virt(get_variable, (efi_char16_t *)arg1,
+				       (efi_guid_t *)arg2, (u32 *)arg3,
+				       (unsigned long *)arg4, (void *)arg5);
+		break;
+	case GET_NEXT_VARIABLE:
+		status = efi_call_virt(get_next_variable, (unsigned long *)arg1,
+				       (efi_char16_t *)arg2,
+				       (efi_guid_t *)arg3);
+		break;
+	case SET_VARIABLE:
+		status = efi_call_virt(set_variable, (efi_char16_t *)arg1,
+				       (efi_guid_t *)arg2, *(u32 *)arg3,
+				       *(unsigned long *)arg4, (void *)arg5);
+		break;
+	case QUERY_VARIABLE_INFO:
+		status = efi_call_virt(query_variable_info, *(u32 *)arg1,
+				       (u64 *)arg2, (u64 *)arg3, (u64 *)arg4);
+		break;
+	case GET_NEXT_HIGH_MONO_COUNT:
+		status = efi_call_virt(get_next_high_mono_count, (u32 *)arg1);
+		break;
+	case UPDATE_CAPSULE:
+		status = efi_call_virt(update_capsule,
+				       (efi_capsule_header_t **)arg1,
+				       *(unsigned long *)arg2,
+				       *(unsigned long *)arg3);
+		break;
+	case QUERY_CAPSULE_CAPS:
+		status = efi_call_virt(query_capsule_caps,
+				       (efi_capsule_header_t **)arg1,
+				       *(unsigned long *)arg2, (u64 *)arg3,
+				       (int *)arg4);
+		break;
+	default:
+		/*
+		 * Ideally, we should never reach here because a caller of this
+		 * function should have put the right efi_runtime_service()
+		 * function identifier into efi_rts_work->efi_rts_id
+		 */
+		pr_err("Requested executing invalid EFI Runtime Service.\n");
+	}
+	efi_rts_work->status = status;
+	complete(&efi_rts_work->efi_rts_comp);
+}
+
 static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
 {
 	efi_status_t status;
 
 	if (down_interruptible(&efi_runtime_lock))
 		return EFI_ABORTED;
-	status = efi_call_virt(get_time, tm, tc);
+	status = efi_queue_work(GET_TIME, tm, tc, NULL, NULL, NULL);
 	up(&efi_runtime_lock);
 	return status;
 }
@@ -107,7 +274,7 @@ static efi_status_t virt_efi_set_time(efi_time_t *tm)
 
 	if (down_interruptible(&efi_runtime_lock))
 		return EFI_ABORTED;
-	status = efi_call_virt(set_time, tm);
+	status = efi_queue_work(SET_TIME, tm, NULL, NULL, NULL, NULL);
 	up(&efi_runtime_lock);
 	return status;
 }
@@ -120,7 +287,8 @@ static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
 
 	if (down_interruptible(&efi_runtime_lock))
 		return EFI_ABORTED;
-	status = efi_call_virt(get_wakeup_time, enabled, pending, tm);
+	status = efi_queue_work(GET_WAKEUP_TIME, enabled, pending, tm, NULL,
+				NULL);
 	up(&efi_runtime_lock);
 	return status;
 }
@@ -131,7 +299,8 @@ static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
 
 	if (down_interruptible(&efi_runtime_lock))
 		return EFI_ABORTED;
-	status = efi_call_virt(set_wakeup_time, enabled, tm);
+	status = efi_queue_work(SET_WAKEUP_TIME, &enabled, tm, NULL, NULL,
+				NULL);
 	up(&efi_runtime_lock);
 	return status;
 }
@@ -146,8 +315,8 @@ static efi_status_t virt_efi_get_variable(efi_char16_t *name,
 
 	if (down_interruptible(&efi_runtime_lock))
 		return EFI_ABORTED;
-	status = efi_call_virt(get_variable, name, vendor, attr, data_size,
-			       data);
+	status = efi_queue_work(GET_VARIABLE, name, vendor, attr, data_size,
+				data);
 	up(&efi_runtime_lock);
 	return status;
 }
@@ -160,7 +329,8 @@ static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
 
 	if (down_interruptible(&efi_runtime_lock))
 		return EFI_ABORTED;
-	status = efi_call_virt(get_next_variable, name_size, name, vendor);
+	status = efi_queue_work(GET_NEXT_VARIABLE, name_size, name, vendor,
+				NULL, NULL);
 	up(&efi_runtime_lock);
 	return status;
 }
@@ -175,8 +345,8 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name,
 
 	if (down_interruptible(&efi_runtime_lock))
 		return EFI_ABORTED;
-	status = efi_call_virt(set_variable, name, vendor, attr, data_size,
-			       data);
+	status = efi_queue_work(SET_VARIABLE, name, vendor, &attr, &data_size,
+				data);
 	up(&efi_runtime_lock);
 	return status;
 }
@@ -210,8 +380,8 @@ static efi_status_t virt_efi_query_variable_info(u32 attr,
 
 	if (down_interruptible(&efi_runtime_lock))
 		return EFI_ABORTED;
-	status = efi_call_virt(query_variable_info, attr, storage_space,
-			       remaining_space, max_variable_size);
+	status = efi_queue_work(QUERY_VARIABLE_INFO, &attr, storage_space,
+				remaining_space, max_variable_size, NULL);
 	up(&efi_runtime_lock);
 	return status;
 }
@@ -242,7 +412,8 @@ static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
 
 	if (down_interruptible(&efi_runtime_lock))
 		return EFI_ABORTED;
-	status = efi_call_virt(get_next_high_mono_count, count);
+	status = efi_queue_work(GET_NEXT_HIGH_MONO_COUNT, count, NULL, NULL,
+				NULL, NULL);
 	up(&efi_runtime_lock);
 	return status;
 }
@@ -272,7 +443,8 @@ static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules,
 
 	if (down_interruptible(&efi_runtime_lock))
 		return EFI_ABORTED;
-	status = efi_call_virt(update_capsule, capsules, count, sg_list);
+	status = efi_queue_work(UPDATE_CAPSULE, capsules, &count, &sg_list,
+				NULL, NULL);
 	up(&efi_runtime_lock);
 	return status;
 }
@@ -289,8 +461,8 @@ static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules,
 
 	if (down_interruptible(&efi_runtime_lock))
 		return EFI_ABORTED;
-	status = efi_call_virt(query_capsule_caps, capsules, count, max_size,
-			       reset_type);
+	status = efi_queue_work(QUERY_CAPSULE_CAPS, capsules, &count,
+				max_size, reset_type, NULL);
 	up(&efi_runtime_lock);
 	return status;
 }
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 56add823f190..8ba0cdd244b2 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1651,4 +1651,7 @@ struct linux_efi_tpm_eventlog {
 
 extern int efi_tpm_eventlog_init(void);
 
+/* Workqueue to queue EFI Runtime Services */
+extern struct workqueue_struct *efi_rts_wq;
+
 #endif /* _LINUX_EFI_H */
-- 
cgit v1.2.3


From f5dcc214aae29a68b37b2b4183f7171724e7b02d Mon Sep 17 00:00:00 2001
From: Sai Praneeth <sai.praneeth.prakhya@intel.com>
Date: Wed, 11 Jul 2018 11:40:37 +0200
Subject: efi: Remove the declaration of efi_late_init() as the function is
 unused

The following commit:

  7b0a911478c74 ("efi/x86: Move the EFI BGRT init code to early init code")

... removed the implementation and all the references to
efi_late_init() but the function is still declared at
include/linux/efi.h.

Hence, remove the unnecessary declaration.

Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20180711094040.12506-6-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/efi.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 8ba0cdd244b2..e190652f5ef9 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -988,14 +988,12 @@ extern void efi_memmap_walk (efi_freemem_callback_t callback, void *arg);
 extern void efi_gettimeofday (struct timespec64 *ts);
 extern void efi_enter_virtual_mode (void);	/* switch EFI to virtual mode, if possible */
 #ifdef CONFIG_X86
-extern void efi_late_init(void);
 extern void efi_free_boot_services(void);
 extern efi_status_t efi_query_variable_store(u32 attributes,
 					     unsigned long size,
 					     bool nonblocking);
 extern void efi_find_mirror(void);
 #else
-static inline void efi_late_init(void) {}
 static inline void efi_free_boot_services(void) {}
 
 static inline efi_status_t efi_query_variable_store(u32 attributes,
-- 
cgit v1.2.3


From 784abe24c903b093af04cf1a043140faa556cad7 Mon Sep 17 00:00:00 2001
From: Boris Pismenny <borisp@mellanox.com>
Date: Fri, 13 Jul 2018 14:33:35 +0300
Subject: net: Add decrypted field to skb

The decrypted bit is propogated to cloned/copied skbs.
This will be used later by the inline crypto receive side offload
of tls.

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 7 ++++++-
 net/core/skbuff.c      | 6 ++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7601838c2513..3ceb8dcc54da 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -630,6 +630,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@hash: the packet hash
  *	@queue_mapping: Queue mapping for multiqueue devices
  *	@xmit_more: More SKBs are pending for this queue
+ *	@decrypted: Decrypted SKB
  *	@ndisc_nodetype: router type (from link layer)
  *	@ooo_okay: allow the mapping of a socket to a queue to be changed
  *	@l4_hash: indicate hash is a canonical 4-tuple hash over transport
@@ -736,7 +737,11 @@ struct sk_buff {
 				peeked:1,
 				head_frag:1,
 				xmit_more:1,
-				__unused:1; /* one bit hole */
+#ifdef CONFIG_TLS_DEVICE
+				decrypted:1;
+#else
+				__unused:1;
+#endif
 
 	/* fields enclosed in headers_start/headers_end are copied
 	 * using a single memcpy() in __copy_skb_header()
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c4e24ac27464..cfd6c6f35f9c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -805,6 +805,9 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	 * It is not yet because we do not want to have a 16 bit hole
 	 */
 	new->queue_mapping = old->queue_mapping;
+#ifdef CONFIG_TLS_DEVICE
+	new->decrypted = old->decrypted;
+#endif
 
 	memcpy(&new->headers_start, &old->headers_start,
 	       offsetof(struct sk_buff, headers_end) -
@@ -865,6 +868,9 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	C(head_frag);
 	C(data);
 	C(truesize);
+#ifdef CONFIG_TLS_DEVICE
+	C(decrypted);
+#endif
 	refcount_set(&n->users, 1);
 
 	atomic_inc(&(skb_shinfo(skb)->dataref));
-- 
cgit v1.2.3


From 14136564c8ee94566945e85014019cbdb1716dca Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Fri, 13 Jul 2018 14:33:36 +0300
Subject: net: Add TLS RX offload feature

This patch adds a netdev feature to configure TLS RX inline crypto offload.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h | 2 ++
 net/core/ethtool.c              | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 623bb8ced060..2b2a6dce1630 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -79,6 +79,7 @@ enum {
 	NETIF_F_HW_ESP_TX_CSUM_BIT,	/* ESP with TX checksum offload */
 	NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */
 	NETIF_F_HW_TLS_TX_BIT,		/* Hardware TLS TX offload */
+	NETIF_F_HW_TLS_RX_BIT,		/* Hardware TLS RX offload */
 
 	NETIF_F_GRO_HW_BIT,		/* Hardware Generic receive offload */
 	NETIF_F_HW_TLS_RECORD_BIT,	/* Offload TLS record */
@@ -151,6 +152,7 @@ enum {
 #define NETIF_F_HW_TLS_RECORD	__NETIF_F(HW_TLS_RECORD)
 #define NETIF_F_GSO_UDP_L4	__NETIF_F(GSO_UDP_L4)
 #define NETIF_F_HW_TLS_TX	__NETIF_F(HW_TLS_TX)
+#define NETIF_F_HW_TLS_RX	__NETIF_F(HW_TLS_RX)
 
 #define for_each_netdev_feature(mask_addr, bit)	\
 	for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e677a20180cf..c9993c6c2fd4 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -111,6 +111,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] =	 "rx-udp_tunnel-port-offload",
 	[NETIF_F_HW_TLS_RECORD_BIT] =	"tls-hw-record",
 	[NETIF_F_HW_TLS_TX_BIT] =	 "tls-hw-tx-offload",
+	[NETIF_F_HW_TLS_RX_BIT] =	 "tls-hw-rx-offload",
 };
 
 static const char
-- 
cgit v1.2.3


From 16e4edc297ffc9b643b8dd3da6b0d579753ea2b3 Mon Sep 17 00:00:00 2001
From: Boris Pismenny <borisp@mellanox.com>
Date: Fri, 13 Jul 2018 14:33:37 +0300
Subject: net: Add TLS rx resync NDO

Add new netdev tls op for resynchronizing HW tls context

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4fa7f7a3f8b3..3514d67112b3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -903,6 +903,8 @@ struct tlsdev_ops {
 	void (*tls_dev_del)(struct net_device *netdev,
 			    struct tls_context *ctx,
 			    enum tls_offload_ctx_dir direction);
+	void (*tls_dev_resync_rx)(struct net_device *netdev,
+				  struct sock *sk, u32 seq, u64 rcd_sn);
 };
 #endif
 
-- 
cgit v1.2.3


From ab412e1dd7db132c2abeb9385b4bf0dc8e6c5a65 Mon Sep 17 00:00:00 2001
From: Boris Pismenny <borisp@mellanox.com>
Date: Fri, 13 Jul 2018 14:33:46 +0300
Subject: net/mlx5: Accel, add TLS rx offload routines

In Innova TLS, TLS contexts are added or deleted
via a command message over the SBU connection.
The HW then sends a response message over the same connection.

Complete the implementation for Innova TLS (FPGA-based) hardware by
adding support for rx inline crypto offload.

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/accel/tls.c    |  23 +++--
 .../net/ethernet/mellanox/mlx5/core/accel/tls.h    |  26 +++--
 drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c | 113 ++++++++++++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h |  18 ++--
 include/linux/mlx5/mlx5_ifc_fpga.h                 |   1 +
 5 files changed, 135 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
index 77ac19f38cbe..da7bd26368f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
@@ -37,17 +37,26 @@
 #include "mlx5_core.h"
 #include "fpga/tls.h"
 
-int mlx5_accel_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow,
-			       struct tls_crypto_info *crypto_info,
-			       u32 start_offload_tcp_sn, u32 *p_swid)
+int mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+			    struct tls_crypto_info *crypto_info,
+			    u32 start_offload_tcp_sn, u32 *p_swid,
+			    bool direction_sx)
 {
-	return mlx5_fpga_tls_add_tx_flow(mdev, flow, crypto_info,
-					 start_offload_tcp_sn, p_swid);
+	return mlx5_fpga_tls_add_flow(mdev, flow, crypto_info,
+				      start_offload_tcp_sn, p_swid,
+				      direction_sx);
 }
 
-void mlx5_accel_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid)
+void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid,
+			     bool direction_sx)
 {
-	mlx5_fpga_tls_del_tx_flow(mdev, swid, GFP_KERNEL);
+	mlx5_fpga_tls_del_flow(mdev, swid, GFP_KERNEL, direction_sx);
+}
+
+int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq,
+			     u64 rcd_sn)
+{
+	return mlx5_fpga_tls_resync_rx(mdev, handle, seq, rcd_sn);
 }
 
 bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
index 6f9c9f446ecc..2228c1083528 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
@@ -60,10 +60,14 @@ struct mlx5_ifc_tls_flow_bits {
 	u8         reserved_at_2[0x1e];
 };
 
-int mlx5_accel_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow,
-			       struct tls_crypto_info *crypto_info,
-			       u32 start_offload_tcp_sn, u32 *p_swid);
-void mlx5_accel_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid);
+int mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+			    struct tls_crypto_info *crypto_info,
+			    u32 start_offload_tcp_sn, u32 *p_swid,
+			    bool direction_sx);
+void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid,
+			     bool direction_sx);
+int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq,
+			     u64 rcd_sn);
 bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev);
 u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev);
 int mlx5_accel_tls_init(struct mlx5_core_dev *mdev);
@@ -71,11 +75,15 @@ void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev);
 
 #else
 
-static inline int
-mlx5_accel_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow,
-			   struct tls_crypto_info *crypto_info,
-			   u32 start_offload_tcp_sn, u32 *p_swid) { return 0; }
-static inline void mlx5_accel_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid) { }
+static int
+mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+			struct tls_crypto_info *crypto_info,
+			u32 start_offload_tcp_sn, u32 *p_swid,
+			bool direction_sx) { return -ENOTSUPP; }
+static inline void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid,
+					   bool direction_sx) { }
+static inline int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle,
+					   u32 seq, u64 rcd_sn) { return 0; }
 static inline bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev) { return false; }
 static inline u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev) { return 0; }
 static inline int mlx5_accel_tls_init(struct mlx5_core_dev *mdev) { return 0; }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
index c9736238604a..5cf5f2a9d51f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
@@ -129,6 +129,7 @@ static void mlx5_fpga_tls_cmd_send(struct mlx5_fpga_device *fdev,
 static int mlx5_fpga_tls_alloc_swid(struct idr *idr, spinlock_t *idr_spinlock,
 				    void *ptr)
 {
+	unsigned long flags;
 	int ret;
 
 	/* TLS metadata format is 1 byte for syndrome followed
@@ -139,9 +140,9 @@ static int mlx5_fpga_tls_alloc_swid(struct idr *idr, spinlock_t *idr_spinlock,
 	BUILD_BUG_ON((SWID_END - 1) & 0xFF000000);
 
 	idr_preload(GFP_KERNEL);
-	spin_lock_irq(idr_spinlock);
+	spin_lock_irqsave(idr_spinlock, flags);
 	ret = idr_alloc(idr, ptr, SWID_START, SWID_END, GFP_ATOMIC);
-	spin_unlock_irq(idr_spinlock);
+	spin_unlock_irqrestore(idr_spinlock, flags);
 	idr_preload_end();
 
 	return ret;
@@ -157,6 +158,13 @@ static void mlx5_fpga_tls_release_swid(struct idr *idr,
 	spin_unlock_irqrestore(idr_spinlock, flags);
 }
 
+static void mlx_tls_kfree_complete(struct mlx5_fpga_conn *conn,
+				   struct mlx5_fpga_device *fdev,
+				   struct mlx5_fpga_dma_buf *buf, u8 status)
+{
+	kfree(buf);
+}
+
 struct mlx5_teardown_stream_context {
 	struct mlx5_fpga_tls_command_context cmd;
 	u32 swid;
@@ -178,9 +186,13 @@ mlx5_fpga_tls_teardown_completion(struct mlx5_fpga_conn *conn,
 			mlx5_fpga_err(fdev,
 				      "Teardown stream failed with syndrome = %d",
 				      syndrome);
-		else
+		else if (MLX5_GET(tls_cmd, cmd->buf.sg[0].data, direction_sx))
 			mlx5_fpga_tls_release_swid(&fdev->tls->tx_idr,
-						   &fdev->tls->idr_spinlock,
+						   &fdev->tls->tx_idr_spinlock,
+						   ctx->swid);
+		else
+			mlx5_fpga_tls_release_swid(&fdev->tls->rx_idr,
+						   &fdev->tls->rx_idr_spinlock,
 						   ctx->swid);
 	}
 	mlx5_fpga_tls_put_command_ctx(cmd);
@@ -196,6 +208,40 @@ static void mlx5_fpga_tls_flow_to_cmd(void *flow, void *cmd)
 		 MLX5_GET(tls_flow, flow, direction_sx));
 }
 
+int mlx5_fpga_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq,
+			    u64 rcd_sn)
+{
+	struct mlx5_fpga_dma_buf *buf;
+	int size = sizeof(*buf) + MLX5_TLS_COMMAND_SIZE;
+	void *flow;
+	void *cmd;
+	int ret;
+
+	buf = kzalloc(size, GFP_ATOMIC);
+	if (!buf)
+		return -ENOMEM;
+
+	cmd = (buf + 1);
+
+	rcu_read_lock();
+	flow = idr_find(&mdev->fpga->tls->rx_idr, ntohl(handle));
+	rcu_read_unlock();
+	mlx5_fpga_tls_flow_to_cmd(flow, cmd);
+
+	MLX5_SET(tls_cmd, cmd, swid, ntohl(handle));
+	MLX5_SET64(tls_cmd, cmd, tls_rcd_sn, be64_to_cpu(rcd_sn));
+	MLX5_SET(tls_cmd, cmd, tcp_sn, seq);
+	MLX5_SET(tls_cmd, cmd, command_type, CMD_RESYNC_RX);
+
+	buf->sg[0].data = cmd;
+	buf->sg[0].size = MLX5_TLS_COMMAND_SIZE;
+	buf->complete = mlx_tls_kfree_complete;
+
+	ret = mlx5_fpga_sbu_conn_sendmsg(mdev->fpga->tls->conn, buf);
+
+	return ret;
+}
+
 static void mlx5_fpga_tls_send_teardown_cmd(struct mlx5_core_dev *mdev,
 					    void *flow, u32 swid, gfp_t flags)
 {
@@ -223,14 +269,18 @@ static void mlx5_fpga_tls_send_teardown_cmd(struct mlx5_core_dev *mdev,
 			       mlx5_fpga_tls_teardown_completion);
 }
 
-void mlx5_fpga_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid,
-			       gfp_t flags)
+void mlx5_fpga_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid,
+			    gfp_t flags, bool direction_sx)
 {
 	struct mlx5_fpga_tls *tls = mdev->fpga->tls;
 	void *flow;
 
 	rcu_read_lock();
-	flow = idr_find(&tls->tx_idr, swid);
+	if (direction_sx)
+		flow = idr_find(&tls->tx_idr, swid);
+	else
+		flow = idr_find(&tls->rx_idr, swid);
+
 	rcu_read_unlock();
 
 	if (!flow) {
@@ -289,9 +339,11 @@ mlx5_fpga_tls_setup_completion(struct mlx5_fpga_conn *conn,
 		 * the command context because we might not have received
 		 * the tx completion yet.
 		 */
-		mlx5_fpga_tls_del_tx_flow(fdev->mdev,
-					  MLX5_GET(tls_cmd, tls_cmd, swid),
-					  GFP_ATOMIC);
+		mlx5_fpga_tls_del_flow(fdev->mdev,
+				       MLX5_GET(tls_cmd, tls_cmd, swid),
+				       GFP_ATOMIC,
+				       MLX5_GET(tls_cmd, tls_cmd,
+						direction_sx));
 	}
 
 	mlx5_fpga_tls_put_command_ctx(cmd);
@@ -415,8 +467,7 @@ int mlx5_fpga_tls_init(struct mlx5_core_dev *mdev)
 	if (err)
 		goto error;
 
-	if (!(tls->caps & (MLX5_ACCEL_TLS_TX | MLX5_ACCEL_TLS_V12 |
-				 MLX5_ACCEL_TLS_AES_GCM128))) {
+	if (!(tls->caps & (MLX5_ACCEL_TLS_V12 | MLX5_ACCEL_TLS_AES_GCM128))) {
 		err = -ENOTSUPP;
 		goto error;
 	}
@@ -438,7 +489,9 @@ int mlx5_fpga_tls_init(struct mlx5_core_dev *mdev)
 	INIT_LIST_HEAD(&tls->pending_cmds);
 
 	idr_init(&tls->tx_idr);
-	spin_lock_init(&tls->idr_spinlock);
+	idr_init(&tls->rx_idr);
+	spin_lock_init(&tls->tx_idr_spinlock);
+	spin_lock_init(&tls->rx_idr_spinlock);
 	fdev->tls = tls;
 	return 0;
 
@@ -500,9 +553,9 @@ static int mlx5_fpga_tls_set_key_material(void *cmd, u32 caps,
 	return 0;
 }
 
-static int mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
-				  struct tls_crypto_info *crypto_info, u32 swid,
-				  u32 tcp_sn)
+static int _mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+				   struct tls_crypto_info *crypto_info,
+				   u32 swid, u32 tcp_sn)
 {
 	u32 caps = mlx5_fpga_tls_device_caps(mdev);
 	struct mlx5_setup_stream_context *ctx;
@@ -533,30 +586,42 @@ out:
 	return ret;
 }
 
-int mlx5_fpga_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow,
-			      struct tls_crypto_info *crypto_info,
-			      u32 start_offload_tcp_sn, u32 *p_swid)
+int mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+			   struct tls_crypto_info *crypto_info,
+			   u32 start_offload_tcp_sn, u32 *p_swid,
+			   bool direction_sx)
 {
 	struct mlx5_fpga_tls *tls = mdev->fpga->tls;
 	int ret = -ENOMEM;
 	u32 swid;
 
-	ret = mlx5_fpga_tls_alloc_swid(&tls->tx_idr, &tls->idr_spinlock, flow);
+	if (direction_sx)
+		ret = mlx5_fpga_tls_alloc_swid(&tls->tx_idr,
+					       &tls->tx_idr_spinlock, flow);
+	else
+		ret = mlx5_fpga_tls_alloc_swid(&tls->rx_idr,
+					       &tls->rx_idr_spinlock, flow);
+
 	if (ret < 0)
 		return ret;
 
 	swid = ret;
-	MLX5_SET(tls_flow, flow, direction_sx, 1);
+	MLX5_SET(tls_flow, flow, direction_sx, direction_sx ? 1 : 0);
 
-	ret = mlx5_fpga_tls_add_flow(mdev, flow, crypto_info, swid,
-				     start_offload_tcp_sn);
+	ret = _mlx5_fpga_tls_add_flow(mdev, flow, crypto_info, swid,
+				      start_offload_tcp_sn);
 	if (ret && ret != -EINTR)
 		goto free_swid;
 
 	*p_swid = swid;
 	return 0;
 free_swid:
-	mlx5_fpga_tls_release_swid(&tls->tx_idr, &tls->idr_spinlock, swid);
+	if (direction_sx)
+		mlx5_fpga_tls_release_swid(&tls->tx_idr,
+					   &tls->tx_idr_spinlock, swid);
+	else
+		mlx5_fpga_tls_release_swid(&tls->rx_idr,
+					   &tls->rx_idr_spinlock, swid);
 
 	return ret;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h
index 800a214e4e49..3b2e37bf76fe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h
@@ -46,15 +46,18 @@ struct mlx5_fpga_tls {
 	struct mlx5_fpga_conn *conn;
 
 	struct idr tx_idr;
-	spinlock_t idr_spinlock; /* protects the IDR */
+	struct idr rx_idr;
+	spinlock_t tx_idr_spinlock; /* protects the IDR */
+	spinlock_t rx_idr_spinlock; /* protects the IDR */
 };
 
-int mlx5_fpga_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow,
-			      struct tls_crypto_info *crypto_info,
-			      u32 start_offload_tcp_sn, u32 *p_swid);
+int mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+			   struct tls_crypto_info *crypto_info,
+			   u32 start_offload_tcp_sn, u32 *p_swid,
+			   bool direction_sx);
 
-void mlx5_fpga_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid,
-			       gfp_t flags);
+void mlx5_fpga_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid,
+			    gfp_t flags, bool direction_sx);
 
 bool mlx5_fpga_is_tls_device(struct mlx5_core_dev *mdev);
 int mlx5_fpga_tls_init(struct mlx5_core_dev *mdev);
@@ -65,4 +68,7 @@ static inline u32 mlx5_fpga_tls_device_caps(struct mlx5_core_dev *mdev)
 	return mdev->fpga->tls->caps;
 }
 
+int mlx5_fpga_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq,
+			    u64 rcd_sn);
+
 #endif /* __MLX5_FPGA_TLS_H__ */
diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h
index 64d0f40d4cc3..37e065a80a43 100644
--- a/include/linux/mlx5/mlx5_ifc_fpga.h
+++ b/include/linux/mlx5/mlx5_ifc_fpga.h
@@ -576,6 +576,7 @@ struct mlx5_ifc_fpga_ipsec_sa {
 enum fpga_tls_cmds {
 	CMD_SETUP_STREAM		= 0x1001,
 	CMD_TEARDOWN_STREAM		= 0x1002,
+	CMD_RESYNC_RX			= 0x1003,
 };
 
 #define MLX5_TLS_1_2 (0)
-- 
cgit v1.2.3


From a4f9edb29d9c19f9f8dcd2df7ddfe4eb7ad58996 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Wed, 30 May 2018 17:29:52 +0100
Subject: irqchip/gic-v3: Expose GICD_TYPER in the rdist structure

Instead of exposing the GIC distributor IntID field in the rdist
structure that is passed to the ITS, let's replace it with a
copy of the whole GICD_TYPER register. We are going to need
some of this information at a later time.

No functionnal change.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/irqchip/irq-gic-v3-its.c   | 2 +-
 drivers/irqchip/irq-gic-v3.c       | 4 ++--
 include/linux/irqchip/arm-gic-v3.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 9c5b85577053..efe6d1a6c32e 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1616,7 +1616,7 @@ static int __init its_alloc_lpi_tables(void)
 {
 	phys_addr_t paddr;
 
-	lpi_id_bits = gic_rdists->id_bits;
+	lpi_id_bits = GICD_TYPER_ID_BITS(gic_rdists->gicd_typer);
 	gic_rdists->prop_page = its_allocate_prop_table(GFP_NOWAIT);
 	if (!gic_rdists->prop_page) {
 		pr_err("Failed to allocate PROPBASE\n");
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 76ea56d779a1..e214181b77b7 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -877,7 +877,7 @@ static struct irq_chip gic_eoimode1_chip = {
 	.flags			= IRQCHIP_SET_TYPE_MASKED,
 };
 
-#define GIC_ID_NR		(1U << gic_data.rdists.id_bits)
+#define GIC_ID_NR	(1U << GICD_TYPER_ID_BITS(gic_data.rdists.gicd_typer))
 
 static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
 			      irq_hw_number_t hw)
@@ -1091,7 +1091,7 @@ static int __init gic_init_bases(void __iomem *dist_base,
 	 * The GIC only supports up to 1020 interrupt sources (SGI+PPI+SPI)
 	 */
 	typer = readl_relaxed(gic_data.dist_base + GICD_TYPER);
-	gic_data.rdists.id_bits = GICD_TYPER_ID_BITS(typer);
+	gic_data.rdists.gicd_typer = typer;
 	gic_irqs = GICD_TYPER_IRQS(typer);
 	if (gic_irqs > 1020)
 		gic_irqs = 1020;
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index cbb872c1b607..396cd99af02f 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -576,8 +576,8 @@ struct rdists {
 		phys_addr_t	phys_base;
 	} __percpu		*rdist;
 	struct page		*prop_page;
-	int			id_bits;
 	u64			flags;
+	u32			gicd_typer;
 	bool			has_vlpis;
 	bool			has_direct_lpi;
 };
-- 
cgit v1.2.3


From 12b2905af183c931bedcab4292c81d3a415e080f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 31 May 2018 09:01:59 +0100
Subject: irqchip/gic-v3-its: Honor hypervisor enforced LPI range

A recent extension to the GIC architecture allows a hypervisor to
arbitrarily reduce the number of LPIs available to a guest, no
matter what the GIC says about the valid range of IntIDs.

Let's factor in this information when computing the number of
available LPIs

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/irqchip/irq-gic-v3-its.c   | 9 +++++++++
 include/linux/irqchip/arm-gic-v3.h | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index efe6d1a6c32e..f56c84977241 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1541,8 +1541,17 @@ out:
 static int __init its_lpi_init(u32 id_bits)
 {
 	u32 lpis = (1UL << id_bits) - 8192;
+	u32 numlpis;
 	int err;
 
+	numlpis = 1UL << GICD_TYPER_NUM_LPIS(gic_rdists->gicd_typer);
+
+	if (numlpis > 2 && !WARN_ON(numlpis > lpis)) {
+		lpis = numlpis;
+		pr_info("ITS: Using hypervisor restricted LPI range [%u]\n",
+			lpis);
+	}
+
 	/*
 	 * Initializing the allocator is just the same as freeing the
 	 * full range of LPIs.
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 396cd99af02f..9d2ea3e907d0 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -73,6 +73,7 @@
 #define GICD_TYPER_MBIS			(1U << 16)
 
 #define GICD_TYPER_ID_BITS(typer)	((((typer) >> 19) & 0x1f) + 1)
+#define GICD_TYPER_NUM_LPIS(typer)	((((typer) >> 11) & 0x1f) + 1)
 #define GICD_TYPER_IRQS(typer)		((((typer) & 0x1f) + 1) * 32)
 
 #define GICD_IROUTER_SPI_MODE_ONE	(0U << 31)
-- 
cgit v1.2.3


From 0f292f023ffcc67ec49d63dcb7fe388711cbb83a Mon Sep 17 00:00:00 2001
From: Jorge Sanjuan <jorge.sanjuan@codethink.co.uk>
Date: Wed, 11 Jul 2018 13:37:53 +0100
Subject: ALSA: usb-audio: Add support for Processing Units in UAC3

This patch adds support for the Processig Units defined in
the UAC3 spec. The main difference with the previous specs
is the lack of on/off switches in the controls for these
units and the addiction of the new Multi Function Processing
Unit.

The current version of the UAC3 spec doesn't define any
useful controls for the new Multi Function Processing Unit
so no control will get created once this unit is parsed.

Signed-off-by: Jorge Sanjuan <jorge.sanjuan@codethink.co.uk>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/usb/audio-v3.h   | 15 +++++++++++++
 include/uapi/linux/usb/audio.h | 49 ++++++++++++++++++++++++++++++++--------
 sound/usb/mixer.c              | 51 ++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 104 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/audio-v3.h b/include/linux/usb/audio-v3.h
index a710e28b5215..334bfa6dfb47 100644
--- a/include/linux/usb/audio-v3.h
+++ b/include/linux/usb/audio-v3.h
@@ -387,6 +387,12 @@ struct uac3_interrupt_data_msg {
 #define UAC3_CONNECTORS			0x0f
 #define UAC3_POWER_DOMAIN		0x10
 
+/* A.20 PROCESSING UNIT PROCESS TYPES */
+#define UAC3_PROCESS_UNDEFINED		0x00
+#define UAC3_PROCESS_UP_DOWNMIX		0x01
+#define UAC3_PROCESS_STEREO_EXTENDER	0x02
+#define UAC3_PROCESS_MULTI_FUNCTION	0x03
+
 /* A.22 AUDIO CLASS-SPECIFIC REQUEST CODES */
 /* see audio-v2.h for the rest, which is identical to v2 */
 #define UAC3_CS_REQ_INTEN			0x04
@@ -406,6 +412,15 @@ struct uac3_interrupt_data_msg {
 #define UAC3_TE_OVERFLOW			0x04
 #define UAC3_TE_LATENCY 			0x05
 
+/* A.23.10 PROCESSING UNITS CONTROL SELECTROS */
+
+/* Up/Down Mixer */
+#define UAC3_UD_MODE_SELECT			0x01
+
+/* Stereo Extender */
+#define UAC3_EXT_WIDTH_CONTROL			0x01
+
+
 /* BADD predefined Unit/Terminal values */
 #define UAC3_BADD_IT_ID1	1  /* Input Terminal ID1: bTerminalID = 1 */
 #define UAC3_BADD_FU_ID2	2  /* Feature Unit ID2: bUnitID = 2 */
diff --git a/include/uapi/linux/usb/audio.h b/include/uapi/linux/usb/audio.h
index 74e520fb944f..ddc5396800aa 100644
--- a/include/uapi/linux/usb/audio.h
+++ b/include/uapi/linux/usb/audio.h
@@ -390,33 +390,64 @@ static inline __u8 uac_processing_unit_iChannelNames(struct uac_processing_unit_
 static inline __u8 uac_processing_unit_bControlSize(struct uac_processing_unit_descriptor *desc,
 						    int protocol)
 {
-	return (protocol == UAC_VERSION_1) ?
-		desc->baSourceID[desc->bNrInPins + 4] :
-		2; /* in UAC2, this value is constant */
+	switch (protocol) {
+	case UAC_VERSION_1:
+		return desc->baSourceID[desc->bNrInPins + 4];
+	case UAC_VERSION_2:
+		return 2; /* in UAC2, this value is constant */
+	case UAC_VERSION_3:
+		return 4; /* in UAC3, this value is constant */
+	default:
+		return 1;
+	}
 }
 
 static inline __u8 *uac_processing_unit_bmControls(struct uac_processing_unit_descriptor *desc,
 						   int protocol)
 {
-	return (protocol == UAC_VERSION_1) ?
-		&desc->baSourceID[desc->bNrInPins + 5] :
-		&desc->baSourceID[desc->bNrInPins + 6];
+	switch (protocol) {
+	case UAC_VERSION_1:
+		return &desc->baSourceID[desc->bNrInPins + 5];
+	case UAC_VERSION_2:
+		return &desc->baSourceID[desc->bNrInPins + 6];
+	case UAC_VERSION_3:
+		return &desc->baSourceID[desc->bNrInPins + 2];
+	default:
+		return NULL;
+	}
 }
 
 static inline __u8 uac_processing_unit_iProcessing(struct uac_processing_unit_descriptor *desc,
 						   int protocol)
 {
 	__u8 control_size = uac_processing_unit_bControlSize(desc, protocol);
-	return *(uac_processing_unit_bmControls(desc, protocol)
-			+ control_size);
+
+	switch (protocol) {
+	case UAC_VERSION_1:
+	case UAC_VERSION_2:
+	default:
+		return *(uac_processing_unit_bmControls(desc, protocol)
+			 + control_size);
+	case UAC_VERSION_3:
+		return 0; /* UAC3 does not have this field */
+	}
 }
 
 static inline __u8 *uac_processing_unit_specific(struct uac_processing_unit_descriptor *desc,
 						 int protocol)
 {
 	__u8 control_size = uac_processing_unit_bControlSize(desc, protocol);
-	return uac_processing_unit_bmControls(desc, protocol)
+
+	switch (protocol) {
+	case UAC_VERSION_1:
+	case UAC_VERSION_2:
+	default:
+		return uac_processing_unit_bmControls(desc, protocol)
 			+ control_size + 1;
+	case UAC_VERSION_3:
+		return uac_processing_unit_bmControls(desc, protocol)
+			+ control_size;
+	}
 }
 
 /* 4.5.2 Class-Specific AS Interface Descriptor */
diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c
index bfb3484096a6..39fde49e8749 100644
--- a/sound/usb/mixer.c
+++ b/sound/usb/mixer.c
@@ -953,6 +953,23 @@ static int check_input_term(struct mixer_build *state, int id,
 
 				return 0;
 			}
+			case UAC3_PROCESSING_UNIT: {
+				struct uac_processing_unit_descriptor *d = p1;
+
+				if (!d->bNrInPins)
+					return -EINVAL;
+
+				/* call recursively to retrieve the channel info */
+				err = check_input_term(state, d->baSourceID[0], term);
+				if (err < 0)
+					return err;
+
+				term->type = d->bDescriptorSubtype << 16; /* virtual type */
+				term->id = id;
+				term->name = 0; /* TODO: UAC3 Class-specific strings */
+
+				return 0;
+			}
 			default:
 				return -ENODEV;
 			}
@@ -2180,6 +2197,11 @@ struct procunit_info {
 	struct procunit_value_info *values;
 };
 
+static struct procunit_value_info undefined_proc_info[] = {
+	{ 0x00, "Control Undefined", 0 },
+	{ 0 }
+};
+
 static struct procunit_value_info updown_proc_info[] = {
 	{ UAC_UD_ENABLE, "Switch", USB_MIXER_BOOLEAN },
 	{ UAC_UD_MODE_SELECT, "Mode Select", USB_MIXER_U8, 1 },
@@ -2228,6 +2250,23 @@ static struct procunit_info procunits[] = {
 	{ UAC_PROCESS_DYN_RANGE_COMP, "DCR", dcr_proc_info },
 	{ 0 },
 };
+
+static struct procunit_value_info uac3_updown_proc_info[] = {
+	{ UAC3_UD_MODE_SELECT, "Mode Select", USB_MIXER_U8, 1 },
+	{ 0 }
+};
+static struct procunit_value_info uac3_stereo_ext_proc_info[] = {
+	{ UAC3_EXT_WIDTH_CONTROL, "Width Control", USB_MIXER_U8 },
+	{ 0 }
+};
+
+static struct procunit_info uac3_procunits[] = {
+	{ UAC3_PROCESS_UP_DOWNMIX, "Up Down", uac3_updown_proc_info },
+	{ UAC3_PROCESS_STEREO_EXTENDER, "3D Stereo Extender", uac3_stereo_ext_proc_info },
+	{ UAC3_PROCESS_MULTI_FUNCTION, "Multi-Function", undefined_proc_info },
+	{ 0 },
+};
+
 /*
  * predefined data for extension units
  */
@@ -2388,8 +2427,16 @@ static int build_audio_procunit(struct mixer_build *state, int unitid,
 static int parse_audio_processing_unit(struct mixer_build *state, int unitid,
 				       void *raw_desc)
 {
-	return build_audio_procunit(state, unitid, raw_desc,
-				    procunits, "Processing Unit");
+	switch (state->mixer->protocol) {
+	case UAC_VERSION_1:
+	case UAC_VERSION_2:
+	default:
+		return build_audio_procunit(state, unitid, raw_desc,
+				procunits, "Processing Unit");
+	case UAC_VERSION_3:
+		return build_audio_procunit(state, unitid, raw_desc,
+				uac3_procunits, "Processing Unit");
+	}
 }
 
 static int parse_audio_extension_unit(struct mixer_build *state, int unitid,
-- 
cgit v1.2.3


From d7e5a9a50245b91f016c814b0f076f7e55cbb980 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 25 Jun 2018 17:49:43 +0200
Subject: netfilter: utils: move nf_ip_checksum* from ipv4 to utils

allows to make nf_ip_checksum_partial static, it no longer
has an external caller.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv4.h | 11 ---------
 net/ipv4/netfilter.c           | 53 ----------------------------------------
 net/netfilter/utils.c          | 55 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
index b31dabfdb453..95ab5cc64422 100644
--- a/include/linux/netfilter_ipv4.h
+++ b/include/linux/netfilter_ipv4.h
@@ -23,9 +23,6 @@ struct nf_queue_entry;
 #ifdef CONFIG_INET
 __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 		       unsigned int dataoff, u_int8_t protocol);
-__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
-			       unsigned int dataoff, unsigned int len,
-			       u_int8_t protocol);
 int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 		bool strict);
 int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry);
@@ -35,14 +32,6 @@ static inline __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 {
 	return 0;
 }
-static inline __sum16 nf_ip_checksum_partial(struct sk_buff *skb,
-					     unsigned int hook,
-					     unsigned int dataoff,
-					     unsigned int len,
-					     u_int8_t protocol)
-{
-	return 0;
-}
 static inline int nf_ip_route(struct net *net, struct dst_entry **dst,
 			      struct flowi *fl, bool strict)
 {
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index e6774ccb7731..8d2e5dc9a827 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -98,59 +98,6 @@ int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
 }
 EXPORT_SYMBOL_GPL(nf_ip_reroute);
 
-__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
-			    unsigned int dataoff, u_int8_t protocol)
-{
-	const struct iphdr *iph = ip_hdr(skb);
-	__sum16 csum = 0;
-
-	switch (skb->ip_summed) {
-	case CHECKSUM_COMPLETE:
-		if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
-			break;
-		if ((protocol == 0 && !csum_fold(skb->csum)) ||
-		    !csum_tcpudp_magic(iph->saddr, iph->daddr,
-				       skb->len - dataoff, protocol,
-				       skb->csum)) {
-			skb->ip_summed = CHECKSUM_UNNECESSARY;
-			break;
-		}
-		/* fall through */
-	case CHECKSUM_NONE:
-		if (protocol == 0)
-			skb->csum = 0;
-		else
-			skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
-						       skb->len - dataoff,
-						       protocol, 0);
-		csum = __skb_checksum_complete(skb);
-	}
-	return csum;
-}
-EXPORT_SYMBOL(nf_ip_checksum);
-
-__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
-			       unsigned int dataoff, unsigned int len,
-			       u_int8_t protocol)
-{
-	const struct iphdr *iph = ip_hdr(skb);
-	__sum16 csum = 0;
-
-	switch (skb->ip_summed) {
-	case CHECKSUM_COMPLETE:
-		if (len == skb->len - dataoff)
-			return nf_ip_checksum(skb, hook, dataoff, protocol);
-		/* fall through */
-	case CHECKSUM_NONE:
-		skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
-					       skb->len - dataoff, 0);
-		skb->ip_summed = CHECKSUM_NONE;
-		return __skb_checksum_complete_head(skb, dataoff + len);
-	}
-	return csum;
-}
-EXPORT_SYMBOL_GPL(nf_ip_checksum_partial);
-
 int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 		bool strict __always_unused)
 {
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 0b660c568156..8980c8a0fe5c 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -1,9 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/kernel.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv6.h>
 #include <net/netfilter/nf_queue.h>
 
+#ifdef CONFIG_INET
+__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
+		       unsigned int dataoff, u8 protocol)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	__sum16 csum = 0;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+			break;
+		if ((protocol == 0 && !csum_fold(skb->csum)) ||
+		    !csum_tcpudp_magic(iph->saddr, iph->daddr,
+				       skb->len - dataoff, protocol,
+				       skb->csum)) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			break;
+		}
+		/* fall through */
+	case CHECKSUM_NONE:
+		if (protocol == 0)
+			skb->csum = 0;
+		else
+			skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+						       skb->len - dataoff,
+						       protocol, 0);
+		csum = __skb_checksum_complete(skb);
+	}
+	return csum;
+}
+EXPORT_SYMBOL(nf_ip_checksum);
+#endif
+
+static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
+				      unsigned int dataoff, unsigned int len,
+				      u8 protocol)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	__sum16 csum = 0;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (len == skb->len - dataoff)
+			return nf_ip_checksum(skb, hook, dataoff, protocol);
+		/* fall through */
+	case CHECKSUM_NONE:
+		skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
+					       skb->len - dataoff, 0);
+		skb->ip_summed = CHECKSUM_NONE;
+		return __skb_checksum_complete_head(skb, dataoff + len);
+	}
+	return csum;
+}
+
 __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
 		    unsigned int dataoff, u_int8_t protocol,
 		    unsigned short family)
-- 
cgit v1.2.3


From ebee5a50d0b7cdc576aa8081f05b86971880054d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 25 Jun 2018 17:49:59 +0200
Subject: netfilter: utils: move nf_ip6_checksum* from ipv6 to utils

similar to previous change, this also allows to remove it
from nf_ipv6_ops and avoid the indirection.

It also removes the bogus dependency of nf_conntrack_ipv6 on ipv6 module:
ipv6 checksum functions are built into kernel even if CONFIG_IPV6=m,
but ipv6/netfilter.o isn't.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h |  5 ---
 net/ipv6/netfilter.c           | 62 ----------------------------------
 net/netfilter/utils.c          | 76 ++++++++++++++++++++++++++++++++++++------
 3 files changed, 65 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 288c597e75b3..c0dc4dd78887 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -30,11 +30,6 @@ struct nf_ipv6_ops {
 	void (*route_input)(struct sk_buff *skb);
 	int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb,
 			int (*output)(struct net *, struct sock *, struct sk_buff *));
-	__sum16 (*checksum)(struct sk_buff *skb, unsigned int hook,
-			    unsigned int dataoff, u_int8_t protocol);
-	__sum16 (*checksum_partial)(struct sk_buff *skb, unsigned int hook,
-				    unsigned int dataoff, unsigned int len,
-				    u_int8_t protocol);
 	int (*route)(struct net *net, struct dst_entry **dst, struct flowi *fl,
 		     bool strict);
 	int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 531d6957af36..5ae8e1c51079 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -15,7 +15,6 @@
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
 #include <net/xfrm.h>
-#include <net/ip6_checksum.h>
 #include <net/netfilter/nf_queue.h>
 
 int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
@@ -106,71 +105,10 @@ static int nf_ip6_route(struct net *net, struct dst_entry **dst,
 	return err;
 }
 
-__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
-			     unsigned int dataoff, u_int8_t protocol)
-{
-	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
-	__sum16 csum = 0;
-
-	switch (skb->ip_summed) {
-	case CHECKSUM_COMPLETE:
-		if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
-			break;
-		if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
-				     skb->len - dataoff, protocol,
-				     csum_sub(skb->csum,
-					      skb_checksum(skb, 0,
-							   dataoff, 0)))) {
-			skb->ip_summed = CHECKSUM_UNNECESSARY;
-			break;
-		}
-		/* fall through */
-	case CHECKSUM_NONE:
-		skb->csum = ~csum_unfold(
-				csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
-					     skb->len - dataoff,
-					     protocol,
-					     csum_sub(0,
-						      skb_checksum(skb, 0,
-								   dataoff, 0))));
-		csum = __skb_checksum_complete(skb);
-	}
-	return csum;
-}
-EXPORT_SYMBOL(nf_ip6_checksum);
-
-static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
-				       unsigned int dataoff, unsigned int len,
-				       u_int8_t protocol)
-{
-	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
-	__wsum hsum;
-	__sum16 csum = 0;
-
-	switch (skb->ip_summed) {
-	case CHECKSUM_COMPLETE:
-		if (len == skb->len - dataoff)
-			return nf_ip6_checksum(skb, hook, dataoff, protocol);
-		/* fall through */
-	case CHECKSUM_NONE:
-		hsum = skb_checksum(skb, 0, dataoff, 0);
-		skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
-							 &ip6h->daddr,
-							 skb->len - dataoff,
-							 protocol,
-							 csum_sub(0, hsum)));
-		skb->ip_summed = CHECKSUM_NONE;
-		return __skb_checksum_complete_head(skb, dataoff + len);
-	}
-	return csum;
-};
-
 static const struct nf_ipv6_ops ipv6ops = {
 	.chk_addr		= ipv6_chk_addr,
 	.route_input    	= ip6_route_input,
 	.fragment		= ip6_fragment,
-	.checksum		= nf_ip6_checksum,
-	.checksum_partial	= nf_ip6_checksum_partial,
 	.route			= nf_ip6_route,
 	.reroute		= nf_ip6_reroute,
 };
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 8980c8a0fe5c..e8da9a9bba73 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -4,6 +4,7 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv6.h>
 #include <net/netfilter/nf_queue.h>
+#include <net/ip6_checksum.h>
 
 #ifdef CONFIG_INET
 __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
@@ -59,11 +60,69 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
 	return csum;
 }
 
+__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
+			unsigned int dataoff, u8 protocol)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	__sum16 csum = 0;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+			break;
+		if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+				     skb->len - dataoff, protocol,
+				     csum_sub(skb->csum,
+					      skb_checksum(skb, 0,
+							   dataoff, 0)))) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			break;
+		}
+		/* fall through */
+	case CHECKSUM_NONE:
+		skb->csum = ~csum_unfold(
+				csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+					     skb->len - dataoff,
+					     protocol,
+					     csum_sub(0,
+						      skb_checksum(skb, 0,
+								   dataoff, 0))));
+		csum = __skb_checksum_complete(skb);
+	}
+	return csum;
+}
+EXPORT_SYMBOL(nf_ip6_checksum);
+
+static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
+				       unsigned int dataoff, unsigned int len,
+				       u8 protocol)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	__wsum hsum;
+	__sum16 csum = 0;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (len == skb->len - dataoff)
+			return nf_ip6_checksum(skb, hook, dataoff, protocol);
+		/* fall through */
+	case CHECKSUM_NONE:
+		hsum = skb_checksum(skb, 0, dataoff, 0);
+		skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
+							 &ip6h->daddr,
+							 skb->len - dataoff,
+							 protocol,
+							 csum_sub(0, hsum)));
+		skb->ip_summed = CHECKSUM_NONE;
+		return __skb_checksum_complete_head(skb, dataoff + len);
+	}
+	return csum;
+};
+
 __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
-		    unsigned int dataoff, u_int8_t protocol,
+		    unsigned int dataoff, u8 protocol,
 		    unsigned short family)
 {
-	const struct nf_ipv6_ops *v6ops;
 	__sum16 csum = 0;
 
 	switch (family) {
@@ -71,9 +130,7 @@ __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
 		csum = nf_ip_checksum(skb, hook, dataoff, protocol);
 		break;
 	case AF_INET6:
-		v6ops = rcu_dereference(nf_ipv6_ops);
-		if (v6ops)
-			csum = v6ops->checksum(skb, hook, dataoff, protocol);
+		csum = nf_ip6_checksum(skb, hook, dataoff, protocol);
 		break;
 	}
 
@@ -83,9 +140,8 @@ EXPORT_SYMBOL_GPL(nf_checksum);
 
 __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
 			    unsigned int dataoff, unsigned int len,
-			    u_int8_t protocol, unsigned short family)
+			    u8 protocol, unsigned short family)
 {
-	const struct nf_ipv6_ops *v6ops;
 	__sum16 csum = 0;
 
 	switch (family) {
@@ -94,10 +150,8 @@ __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
 					      protocol);
 		break;
 	case AF_INET6:
-		v6ops = rcu_dereference(nf_ipv6_ops);
-		if (v6ops)
-			csum = v6ops->checksum_partial(skb, hook, dataoff, len,
-						       protocol);
+		csum = nf_ip6_checksum_partial(skb, hook, dataoff, len,
+					       protocol);
 		break;
 	}
 
-- 
cgit v1.2.3


From 377179cd28cd417dcfb4396edb824533431e607e Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Fri, 13 Jul 2018 14:05:56 -0400
Subject: security: define new LSM hook named security_kernel_load_data

Differentiate between the kernel reading a file specified by userspace
from the kernel loading a buffer containing data provided by userspace.
This patch defines a new LSM hook named security_kernel_load_data().

Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Luis R. Rodriguez <mcgrof@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: James Morris <james.morris@microsoft.com>
---
 include/linux/lsm_hooks.h |  6 ++++++
 include/linux/security.h  | 27 +++++++++++++++++++++++++++
 security/security.c       |  5 +++++
 3 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 8f1131c8dd54..a08bc2587b96 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -576,6 +576,10 @@
  *	userspace to load a kernel module with the given name.
  *	@kmod_name name of the module requested by the kernel
  *	Return 0 if successful.
+ * @kernel_load_data:
+ *	Load data provided by userspace.
+ *	@id kernel load data identifier
+ *	Return 0 if permission is granted.
  * @kernel_read_file:
  *	Read a file specified by userspace.
  *	@file contains the file structure pointing to the file being read
@@ -1582,6 +1586,7 @@ union security_list_options {
 	int (*kernel_act_as)(struct cred *new, u32 secid);
 	int (*kernel_create_files_as)(struct cred *new, struct inode *inode);
 	int (*kernel_module_request)(char *kmod_name);
+	int (*kernel_load_data)(enum kernel_load_data_id id);
 	int (*kernel_read_file)(struct file *file, enum kernel_read_file_id id);
 	int (*kernel_post_read_file)(struct file *file, char *buf, loff_t size,
 				     enum kernel_read_file_id id);
@@ -1872,6 +1877,7 @@ struct security_hook_heads {
 	struct hlist_head cred_getsecid;
 	struct hlist_head kernel_act_as;
 	struct hlist_head kernel_create_files_as;
+	struct hlist_head kernel_load_data;
 	struct hlist_head kernel_read_file;
 	struct hlist_head kernel_post_read_file;
 	struct hlist_head kernel_module_request;
diff --git a/include/linux/security.h b/include/linux/security.h
index 63030c85ee19..3410acfe139c 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -159,6 +159,27 @@ extern int mmap_min_addr_handler(struct ctl_table *table, int write,
 typedef int (*initxattrs) (struct inode *inode,
 			   const struct xattr *xattr_array, void *fs_data);
 
+
+/* Keep the kernel_load_data_id enum in sync with kernel_read_file_id */
+#define __data_id_enumify(ENUM, dummy) LOADING_ ## ENUM,
+#define __data_id_stringify(dummy, str) #str,
+
+enum kernel_load_data_id {
+	__kernel_read_file_id(__data_id_enumify)
+};
+
+static const char * const kernel_load_data_str[] = {
+	__kernel_read_file_id(__data_id_stringify)
+};
+
+static inline const char *kernel_load_data_id_str(enum kernel_load_data_id id)
+{
+	if ((unsigned)id >= LOADING_MAX_ID)
+		return kernel_load_data_str[LOADING_UNKNOWN];
+
+	return kernel_load_data_str[id];
+}
+
 #ifdef CONFIG_SECURITY
 
 struct security_mnt_opts {
@@ -320,6 +341,7 @@ void security_cred_getsecid(const struct cred *c, u32 *secid);
 int security_kernel_act_as(struct cred *new, u32 secid);
 int security_kernel_create_files_as(struct cred *new, struct inode *inode);
 int security_kernel_module_request(char *kmod_name);
+int security_kernel_load_data(enum kernel_load_data_id id);
 int security_kernel_read_file(struct file *file, enum kernel_read_file_id id);
 int security_kernel_post_read_file(struct file *file, char *buf, loff_t size,
 				   enum kernel_read_file_id id);
@@ -909,6 +931,11 @@ static inline int security_kernel_module_request(char *kmod_name)
 	return 0;
 }
 
+static inline int security_kernel_load_data(enum kernel_load_data_id id)
+{
+	return 0;
+}
+
 static inline int security_kernel_read_file(struct file *file,
 					    enum kernel_read_file_id id)
 {
diff --git a/security/security.c b/security/security.c
index 68f46d849abe..c2de2f134854 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1056,6 +1056,11 @@ int security_kernel_post_read_file(struct file *file, char *buf, loff_t size,
 }
 EXPORT_SYMBOL_GPL(security_kernel_post_read_file);
 
+int security_kernel_load_data(enum kernel_load_data_id id)
+{
+	return call_int_hook(kernel_load_data, 0, id);
+}
+
 int security_task_fix_setuid(struct cred *new, const struct cred *old,
 			     int flags)
 {
-- 
cgit v1.2.3


From 16c267aac86b463b1fcccd43c89f4c8e5c5c86fa Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Fri, 13 Jul 2018 14:05:58 -0400
Subject: ima: based on policy require signed kexec kernel images

The original kexec_load syscall can not verify file signatures, nor can
the kexec image be measured.  Based on policy, deny the kexec_load
syscall.

Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Kees Cook <keescook@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: James Morris <james.morris@microsoft.com>
---
 include/linux/ima.h                 |  7 +++++++
 security/integrity/ima/ima.h        |  1 +
 security/integrity/ima/ima_main.c   | 27 +++++++++++++++++++++++++++
 security/integrity/ima/ima_policy.c |  2 ++
 security/security.c                 |  7 ++++++-
 5 files changed, 43 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ima.h b/include/linux/ima.h
index 0e4647e0eb60..84806b54b50a 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -11,6 +11,7 @@
 #define _LINUX_IMA_H
 
 #include <linux/fs.h>
+#include <linux/security.h>
 #include <linux/kexec.h>
 struct linux_binprm;
 
@@ -19,6 +20,7 @@ extern int ima_bprm_check(struct linux_binprm *bprm);
 extern int ima_file_check(struct file *file, int mask, int opened);
 extern void ima_file_free(struct file *file);
 extern int ima_file_mmap(struct file *file, unsigned long prot);
+extern int ima_load_data(enum kernel_load_data_id id);
 extern int ima_read_file(struct file *file, enum kernel_read_file_id id);
 extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
 			      enum kernel_read_file_id id);
@@ -49,6 +51,11 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot)
 	return 0;
 }
 
+static inline int ima_load_data(enum kernel_load_data_id id)
+{
+	return 0;
+}
+
 static inline int ima_read_file(struct file *file, enum kernel_read_file_id id)
 {
 	return 0;
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 354bb5716ce3..78c15264b17b 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -232,6 +232,7 @@ int ima_policy_show(struct seq_file *m, void *v);
 #define IMA_APPRAISE_MODULES	0x08
 #define IMA_APPRAISE_FIRMWARE	0x10
 #define IMA_APPRAISE_POLICY	0x20
+#define IMA_APPRAISE_KEXEC	0x40
 
 #ifdef CONFIG_IMA_APPRAISE
 int ima_appraise_measurement(enum ima_hooks func,
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index dca44cf7838e..71fecfef0939 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -496,6 +496,33 @@ int ima_post_read_file(struct file *file, void *buf, loff_t size,
 				   MAY_READ, func, 0);
 }
 
+/**
+ * ima_load_data - appraise decision based on policy
+ * @id: kernel load data caller identifier
+ *
+ * Callers of this LSM hook can not measure, appraise, or audit the
+ * data provided by userspace.  Enforce policy rules requring a file
+ * signature (eg. kexec'ed kernel image).
+ *
+ * For permission return 0, otherwise return -EACCES.
+ */
+int ima_load_data(enum kernel_load_data_id id)
+{
+	if ((ima_appraise & IMA_APPRAISE_ENFORCE) != IMA_APPRAISE_ENFORCE)
+		return 0;
+
+	switch (id) {
+	case LOADING_KEXEC_IMAGE:
+		if (ima_appraise & IMA_APPRAISE_KEXEC) {
+			pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n");
+			return -EACCES;	/* INTEGRITY_UNKNOWN */
+		}
+	default:
+		break;
+	}
+	return 0;
+}
+
 static int __init init_ima(void)
 {
 	int error;
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index cdcc9a7b4e24..d5b4958decc5 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -448,6 +448,8 @@ static int ima_appraise_flag(enum ima_hooks func)
 		return IMA_APPRAISE_FIRMWARE;
 	else if (func == POLICY_CHECK)
 		return IMA_APPRAISE_POLICY;
+	else if (func == KEXEC_KERNEL_CHECK)
+		return IMA_APPRAISE_KEXEC;
 	return 0;
 }
 
diff --git a/security/security.c b/security/security.c
index c2de2f134854..4927e7cc7d96 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1058,7 +1058,12 @@ EXPORT_SYMBOL_GPL(security_kernel_post_read_file);
 
 int security_kernel_load_data(enum kernel_load_data_id id)
 {
-	return call_int_hook(kernel_load_data, 0, id);
+	int ret;
+
+	ret = call_int_hook(kernel_load_data, 0, id);
+	if (ret)
+		return ret;
+	return ima_load_data(id);
 }
 
 int security_task_fix_setuid(struct cred *new, const struct cred *old,
-- 
cgit v1.2.3


From 2b9672ddb6f347467d7b33b86c5dfc4d5c0501a8 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 12 Jul 2018 21:32:53 +0200
Subject: net: phy: add phy_speed_down and phy_speed_up

Some network drivers include functionality to speed down the PHY when
suspending and just waiting for a WoL packet because this saves energy.
This functionality is quite generic, therefore let's factor it out to
phylib.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h   |  2 ++
 2 files changed, 80 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index c4aa360dedff..d2baedc4ea91 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -551,6 +551,84 @@ int phy_start_aneg(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_start_aneg);
 
+static int phy_poll_aneg_done(struct phy_device *phydev)
+{
+	unsigned int retries = 100;
+	int ret;
+
+	do {
+		msleep(100);
+		ret = phy_aneg_done(phydev);
+	} while (!ret && --retries);
+
+	if (!ret)
+		return -ETIMEDOUT;
+
+	return ret < 0 ? ret : 0;
+}
+
+/**
+ * phy_speed_down - set speed to lowest speed supported by both link partners
+ * @phydev: the phy_device struct
+ * @sync: perform action synchronously
+ *
+ * Description: Typically used to save energy when waiting for a WoL packet
+ *
+ * WARNING: Setting sync to false may cause the system being unable to suspend
+ * in case the PHY generates an interrupt when finishing the autonegotiation.
+ * This interrupt may wake up the system immediately after suspend.
+ * Therefore use sync = false only if you're sure it's safe with the respective
+ * network chip.
+ */
+int phy_speed_down(struct phy_device *phydev, bool sync)
+{
+	u32 adv = phydev->lp_advertising & phydev->supported;
+	u32 adv_old = phydev->advertising;
+	int ret;
+
+	if (phydev->autoneg != AUTONEG_ENABLE)
+		return 0;
+
+	if (adv & PHY_10BT_FEATURES)
+		phydev->advertising &= ~(PHY_100BT_FEATURES |
+					 PHY_1000BT_FEATURES);
+	else if (adv & PHY_100BT_FEATURES)
+		phydev->advertising &= ~PHY_1000BT_FEATURES;
+
+	if (phydev->advertising == adv_old)
+		return 0;
+
+	ret = phy_config_aneg(phydev);
+	if (ret)
+		return ret;
+
+	return sync ? phy_poll_aneg_done(phydev) : 0;
+}
+EXPORT_SYMBOL_GPL(phy_speed_down);
+
+/**
+ * phy_speed_up - (re)set advertised speeds to all supported speeds
+ * @phydev: the phy_device struct
+ *
+ * Description: Used to revert the effect of phy_speed_down
+ */
+int phy_speed_up(struct phy_device *phydev)
+{
+	u32 mask = PHY_10BT_FEATURES | PHY_100BT_FEATURES | PHY_1000BT_FEATURES;
+	u32 adv_old = phydev->advertising;
+
+	if (phydev->autoneg != AUTONEG_ENABLE)
+		return 0;
+
+	phydev->advertising = (adv_old & ~mask) | (phydev->supported & mask);
+
+	if (phydev->advertising == adv_old)
+		return 0;
+
+	return phy_config_aneg(phydev);
+}
+EXPORT_SYMBOL_GPL(phy_speed_up);
+
 /**
  * phy_start_machine - start PHY state machine tracking
  * @phydev: the phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 6cd09098427c..075c2f770d3e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -942,6 +942,8 @@ void phy_start(struct phy_device *phydev);
 void phy_stop(struct phy_device *phydev);
 int phy_start_aneg(struct phy_device *phydev);
 int phy_aneg_done(struct phy_device *phydev);
+int phy_speed_down(struct phy_device *phydev, bool sync);
+int phy_speed_up(struct phy_device *phydev);
 
 int phy_stop_interrupts(struct phy_device *phydev);
 int phy_restart_aneg(struct phy_device *phydev);
-- 
cgit v1.2.3


From d9f37d01e294e5338aa3e9d3b2eda61b59b619df Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Fri, 13 Jul 2018 14:41:36 +0800
Subject: net: convert gro_count to bitmask

gro_hash size is 192 bytes, and uses 3 cache lines, if there is few
flows, gro_hash may be not fully used, so it is unnecessary to iterate
all gro_hash in napi_gro_flush(), to occupy unnecessary cacheline.

convert gro_count to a bitmask, and rename it as gro_bitmask, each bit
represents a element of gro_hash, only flush a gro_hash element if the
related bit is set, to speed up napi_gro_flush().

and update gro_bitmask only if it will be changed, to reduce cache
update

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Cc: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  9 +++++++--
 net/core/dev.c            | 36 ++++++++++++++++++++++++------------
 2 files changed, 31 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3514d67112b3..c1295c7a452e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -308,9 +308,14 @@ struct gro_list {
 };
 
 /*
- * Structure for NAPI scheduling similar to tasklet but with weighting
+ * size of gro hash buckets, must less than bit number of
+ * napi_struct::gro_bitmask
  */
 #define GRO_HASH_BUCKETS	8
+
+/*
+ * Structure for NAPI scheduling similar to tasklet but with weighting
+ */
 struct napi_struct {
 	/* The poll_list must only be managed by the entity which
 	 * changes the state of the NAPI_STATE_SCHED bit.  This means
@@ -322,7 +327,7 @@ struct napi_struct {
 
 	unsigned long		state;
 	int			weight;
-	unsigned int		gro_count;
+	unsigned long		gro_bitmask;
 	int			(*poll)(struct napi_struct *, int);
 #ifdef CONFIG_NETPOLL
 	int			poll_owner;
diff --git a/net/core/dev.c b/net/core/dev.c
index 0df1771a12f9..c883b17ee0fe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5282,9 +5282,11 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 		list_del(&skb->list);
 		skb->next = NULL;
 		napi_gro_complete(skb);
-		napi->gro_count--;
 		napi->gro_hash[index].count--;
 	}
+
+	if (!napi->gro_hash[index].count)
+		__clear_bit(index, &napi->gro_bitmask);
 }
 
 /* napi->gro_hash[].list contains packets ordered by age.
@@ -5295,8 +5297,10 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 {
 	u32 i;
 
-	for (i = 0; i < GRO_HASH_BUCKETS; i++)
-		__napi_gro_flush_chain(napi, i, flush_old);
+	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+		if (test_bit(i, &napi->gro_bitmask))
+			__napi_gro_flush_chain(napi, i, flush_old);
+	}
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
@@ -5388,8 +5392,8 @@ static void gro_flush_oldest(struct list_head *head)
 	if (WARN_ON_ONCE(!oldest))
 		return;
 
-	/* Do not adjust napi->gro_count, caller is adding a new SKB to
-	 * the chain.
+	/* Do not adjust napi->gro_hash[].count, caller is adding a new
+	 * SKB to the chain.
 	 */
 	list_del(&oldest->list);
 	napi_gro_complete(oldest);
@@ -5464,7 +5468,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		list_del(&pp->list);
 		pp->next = NULL;
 		napi_gro_complete(pp);
-		napi->gro_count--;
 		napi->gro_hash[hash].count--;
 	}
 
@@ -5477,7 +5480,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
 		gro_flush_oldest(gro_head);
 	} else {
-		napi->gro_count++;
 		napi->gro_hash[hash].count++;
 	}
 	NAPI_GRO_CB(skb)->count = 1;
@@ -5492,6 +5494,13 @@ pull:
 	if (grow > 0)
 		gro_pull_from_frag0(skb, grow);
 ok:
+	if (napi->gro_hash[hash].count) {
+		if (!test_bit(hash, &napi->gro_bitmask))
+			__set_bit(hash, &napi->gro_bitmask);
+	} else if (test_bit(hash, &napi->gro_bitmask)) {
+		__clear_bit(hash, &napi->gro_bitmask);
+	}
+
 	return ret;
 
 normal:
@@ -5890,7 +5899,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 				 NAPIF_STATE_IN_BUSY_POLL)))
 		return false;
 
-	if (n->gro_count) {
+	if (n->gro_bitmask) {
 		unsigned long timeout = 0;
 
 		if (work_done)
@@ -6099,7 +6108,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 	 */
-	if (napi->gro_count && !napi_disable_pending(napi) &&
+	if (napi->gro_bitmask && !napi_disable_pending(napi) &&
 	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 		__napi_schedule_irqoff(napi);
 
@@ -6114,7 +6123,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 	INIT_LIST_HEAD(&napi->poll_list);
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
-	napi->gro_count = 0;
+	napi->gro_bitmask = 0;
 	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 		napi->gro_hash[i].count = 0;
@@ -6174,7 +6183,7 @@ void netif_napi_del(struct napi_struct *napi)
 	napi_free_frags(napi);
 
 	flush_gro_hash(napi);
-	napi->gro_count = 0;
+	napi->gro_bitmask = 0;
 }
 EXPORT_SYMBOL(netif_napi_del);
 
@@ -6216,7 +6225,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 		goto out_unlock;
 	}
 
-	if (n->gro_count) {
+	if (n->gro_bitmask) {
 		/* flush too old packets
 		 * If HZ < 1000, flush all packets.
 		 */
@@ -9272,6 +9281,9 @@ static struct hlist_head * __net_init netdev_create_hash(void)
 /* Initialize per network namespace state */
 static int __net_init netdev_init(struct net *net)
 {
+	BUILD_BUG_ON(GRO_HASH_BUCKETS >
+			FIELD_SIZEOF(struct napi_struct, gro_bitmask));
+
 	if (net != &init_net)
 		INIT_LIST_HEAD(&net->dev_base_head);
 
-- 
cgit v1.2.3


From 3d85b2703783636366560c94842affd8608ec9d1 Mon Sep 17 00:00:00 2001
From: Andrea Parri <andrea.parri@amarulasolutions.com>
Date: Mon, 16 Jul 2018 11:06:02 -0700
Subject: locking/spinlock, sched/core: Clarify requirements for
 smp_mb__after_spinlock()

There are 11 interpretations of the requirements described in the header
comment for smp_mb__after_spinlock(): one for each LKMM maintainer, and
one currently encoded in the Cat file. Stick to the latter (until a more
satisfactory solution is available).

This also reworks some snippets related to the barrier to illustrate the
requirements and to link them to the idioms which are relied upon at its
call sites.

Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Andrea Parri <andrea.parri@amarulasolutions.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: akiyks@gmail.com
Cc: dhowells@redhat.com
Cc: j.alglave@ucl.ac.uk
Cc: linux-arch@vger.kernel.org
Cc: luc.maranget@inria.fr
Cc: npiggin@gmail.com
Cc: parri.andrea@gmail.com
Cc: stern@rowland.harvard.edu
Link: http://lkml.kernel.org/r/20180716180605.16115-11-paulmck@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/spinlock.h | 53 ++++++++++++++++++++++++++++++++----------------
 kernel/sched/core.c      | 41 +++++++++++++++++++------------------
 2 files changed, 57 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index fd57888d4942..3190997df9ca 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -114,29 +114,48 @@ do {								\
 #endif /*arch_spin_is_contended*/
 
 /*
- * This barrier must provide two things:
+ * smp_mb__after_spinlock() provides the equivalent of a full memory barrier
+ * between program-order earlier lock acquisitions and program-order later
+ * memory accesses.
  *
- *   - it must guarantee a STORE before the spin_lock() is ordered against a
- *     LOAD after it, see the comments at its two usage sites.
+ * This guarantees that the following two properties hold:
  *
- *   - it must ensure the critical section is RCsc.
+ *   1) Given the snippet:
  *
- * The latter is important for cases where we observe values written by other
- * CPUs in spin-loops, without barriers, while being subject to scheduling.
+ *	  { X = 0;  Y = 0; }
  *
- * CPU0			CPU1			CPU2
+ *	  CPU0				CPU1
  *
- *			for (;;) {
- *			  if (READ_ONCE(X))
- *			    break;
- *			}
- * X=1
- *			<sched-out>
- *						<sched-in>
- *						r = X;
+ *	  WRITE_ONCE(X, 1);		WRITE_ONCE(Y, 1);
+ *	  spin_lock(S);			smp_mb();
+ *	  smp_mb__after_spinlock();	r1 = READ_ONCE(X);
+ *	  r0 = READ_ONCE(Y);
+ *	  spin_unlock(S);
  *
- * without transitivity it could be that CPU1 observes X!=0 breaks the loop,
- * we get migrated and CPU2 sees X==0.
+ *      it is forbidden that CPU0 does not observe CPU1's store to Y (r0 = 0)
+ *      and CPU1 does not observe CPU0's store to X (r1 = 0); see the comments
+ *      preceding the call to smp_mb__after_spinlock() in __schedule() and in
+ *      try_to_wake_up().
+ *
+ *   2) Given the snippet:
+ *
+ *  { X = 0;  Y = 0; }
+ *
+ *  CPU0		CPU1				CPU2
+ *
+ *  spin_lock(S);	spin_lock(S);			r1 = READ_ONCE(Y);
+ *  WRITE_ONCE(X, 1);	smp_mb__after_spinlock();	smp_rmb();
+ *  spin_unlock(S);	r0 = READ_ONCE(X);		r2 = READ_ONCE(X);
+ *			WRITE_ONCE(Y, 1);
+ *			spin_unlock(S);
+ *
+ *      it is forbidden that CPU0's critical section executes before CPU1's
+ *      critical section (r0 = 1), CPU2 observes CPU1's store to Y (r1 = 1)
+ *      and CPU2 does not observe CPU0's store to X (r2 = 0); see the comments
+ *      preceding the calls to smp_rmb() in try_to_wake_up() for similar
+ *      snippets but "projected" onto two CPUs.
+ *
+ * Property (2) upgrades the lock to an RCsc lock.
  *
  * Since most load-store architectures implement ACQUIRE with an smp_mb() after
  * the LL/SC loop, they need no further barriers. Similarly all our TSO
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe365c9a08e9..0c5ec2abdf93 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1998,21 +1998,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * be possible to, falsely, observe p->on_rq == 0 and get stuck
 	 * in smp_cond_load_acquire() below.
 	 *
-	 * sched_ttwu_pending()                 try_to_wake_up()
-	 *   [S] p->on_rq = 1;                  [L] P->state
-	 *       UNLOCK rq->lock  -----.
-	 *                              \
-	 *				 +---   RMB
-	 * schedule()                   /
-	 *       LOCK rq->lock    -----'
-	 *       UNLOCK rq->lock
+	 * sched_ttwu_pending()			try_to_wake_up()
+	 *   STORE p->on_rq = 1			  LOAD p->state
+	 *   UNLOCK rq->lock
+	 *
+	 * __schedule() (switch to task 'p')
+	 *   LOCK rq->lock			  smp_rmb();
+	 *   smp_mb__after_spinlock();
+	 *   UNLOCK rq->lock
 	 *
 	 * [task p]
-	 *   [S] p->state = UNINTERRUPTIBLE     [L] p->on_rq
+	 *   STORE p->state = UNINTERRUPTIBLE	  LOAD p->on_rq
 	 *
-	 * Pairs with the UNLOCK+LOCK on rq->lock from the
-	 * last wakeup of our task and the schedule that got our task
-	 * current.
+	 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+	 * __schedule().  See the comment for smp_mb__after_spinlock().
 	 */
 	smp_rmb();
 	if (p->on_rq && ttwu_remote(p, wake_flags))
@@ -2026,15 +2025,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * One must be running (->on_cpu == 1) in order to remove oneself
 	 * from the runqueue.
 	 *
-	 *  [S] ->on_cpu = 1;	[L] ->on_rq
-	 *      UNLOCK rq->lock
-	 *			RMB
-	 *      LOCK   rq->lock
-	 *  [S] ->on_rq = 0;    [L] ->on_cpu
+	 * __schedule() (switch to task 'p')	try_to_wake_up()
+	 *   STORE p->on_cpu = 1		  LOAD p->on_rq
+	 *   UNLOCK rq->lock
+	 *
+	 * __schedule() (put 'p' to sleep)
+	 *   LOCK rq->lock			  smp_rmb();
+	 *   smp_mb__after_spinlock();
+	 *   STORE p->on_rq = 0			  LOAD p->on_cpu
 	 *
-	 * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
-	 * from the consecutive calls to schedule(); the first switching to our
-	 * task, the second putting it to sleep.
+	 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+	 * __schedule().  See the comment for smp_mb__after_spinlock().
 	 */
 	smp_rmb();
 
-- 
cgit v1.2.3


From 7696f9910a9a40b8a952f57d3428515fabd2d889 Mon Sep 17 00:00:00 2001
From: Andrea Parri <andrea.parri@amarulasolutions.com>
Date: Mon, 16 Jul 2018 11:06:03 -0700
Subject: sched/Documentation: Update wake_up() & co. memory-barrier guarantees

Both the implementation and the users' expectation [1] for the various
wakeup primitives have evolved over time, but the documentation has not
kept up with these changes: brings it into 2018.

[1] http://lkml.kernel.org/r/20180424091510.GB4064@hirez.programming.kicks-ass.net

Also applied feedback from Alan Stern.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrea Parri <andrea.parri@amarulasolutions.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Akira Yokosawa <akiyks@gmail.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Daniel Lustig <dlustig@nvidia.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Jade Alglave <j.alglave@ucl.ac.uk>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luc Maranget <luc.maranget@inria.fr>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-arch@vger.kernel.org
Cc: parri.andrea@gmail.com
Link: http://lkml.kernel.org/r/20180716180605.16115-12-paulmck@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/memory-barriers.txt | 43 ++++++++++++++++++++++++---------------
 include/linux/sched.h             |  4 ++--
 kernel/sched/completion.c         |  8 ++++----
 kernel/sched/core.c               | 30 +++++++++++----------------
 kernel/sched/wait.c               |  8 ++++----
 5 files changed, 49 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index a02d6bbfc9d0..0d8d7ef131e9 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -2179,32 +2179,41 @@ or:
 	event_indicated = 1;
 	wake_up_process(event_daemon);
 
-A write memory barrier is implied by wake_up() and co.  if and only if they
-wake something up.  The barrier occurs before the task state is cleared, and so
-sits between the STORE to indicate the event and the STORE to set TASK_RUNNING:
+A general memory barrier is executed by wake_up() if it wakes something up.
+If it doesn't wake anything up then a memory barrier may or may not be
+executed; you must not rely on it.  The barrier occurs before the task state
+is accessed, in particular, it sits between the STORE to indicate the event
+and the STORE to set TASK_RUNNING:
 
-	CPU 1				CPU 2
+	CPU 1 (Sleeper)			CPU 2 (Waker)
 	===============================	===============================
 	set_current_state();		STORE event_indicated
 	  smp_store_mb();		wake_up();
-	    STORE current->state	  <write barrier>
-	    <general barrier>		  STORE current->state
-	LOAD event_indicated
+	    STORE current->state	  ...
+	    <general barrier>		  <general barrier>
+	LOAD event_indicated		  if ((LOAD task->state) & TASK_NORMAL)
+					    STORE task->state
 
-To repeat, this write memory barrier is present if and only if something
-is actually awakened.  To see this, consider the following sequence of
-events, where X and Y are both initially zero:
+where "task" is the thread being woken up and it equals CPU 1's "current".
+
+To repeat, a general memory barrier is guaranteed to be executed by wake_up()
+if something is actually awakened, but otherwise there is no such guarantee.
+To see this, consider the following sequence of events, where X and Y are both
+initially zero:
 
 	CPU 1				CPU 2
 	===============================	===============================
-	X = 1;				STORE event_indicated
+	X = 1;				Y = 1;
 	smp_mb();			wake_up();
-	Y = 1;				wait_event(wq, Y == 1);
-	wake_up();			  load from Y sees 1, no memory barrier
-					load from X might see 0
+	LOAD Y				LOAD X
+
+If a wakeup does occur, one (at least) of the two loads must see 1.  If, on
+the other hand, a wakeup does not occur, both loads might see 0.
 
-In contrast, if a wakeup does occur, CPU 2's load from X would be guaranteed
-to see 1.
+wake_up_process() always executes a general memory barrier.  The barrier again
+occurs before the task state is accessed.  In particular, if the wake_up() in
+the previous snippet were replaced by a call to wake_up_process() then one of
+the two loads would be guaranteed to see 1.
 
 The available waker functions include:
 
@@ -2224,6 +2233,8 @@ The available waker functions include:
 	wake_up_poll();
 	wake_up_process();
 
+In terms of memory ordering, these functions all provide the same guarantees of
+a wake_up() (or stronger).
 
 [!] Note that the memory barriers implied by the sleeper and the waker do _not_
 order multiple stores before the wake-up with respect to loads of those stored
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 43731fe51c97..05cd419f962d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -167,8 +167,8 @@ struct task_group;
  *   need_sleep = false;
  *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
  *
- * Where wake_up_state() (and all other wakeup primitives) imply enough
- * barriers to order the store of the variable against wakeup.
+ * where wake_up_state() executes a full memory barrier before accessing the
+ * task state.
  *
  * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
  * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index e426b0cb9ac6..a1ad5b7d5521 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -22,8 +22,8 @@
  *
  * See also complete_all(), wait_for_completion() and related routines.
  *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
  */
 void complete(struct completion *x)
 {
@@ -44,8 +44,8 @@ EXPORT_SYMBOL(complete);
  *
  * This will wake up all threads waiting on this particular completion event.
  *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
  *
  * Since complete_all() sets the completion of @x permanently to done
  * to allow multiple waiters to finish, a call to reinit_completion()
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0c5ec2abdf93..a0065c84e73f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -412,8 +412,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 	 * its already queued (either by us or someone else) and will get the
 	 * wakeup due to that.
 	 *
-	 * This cmpxchg() implies a full barrier, which pairs with the write
-	 * barrier implied by the wakeup in wake_up_q().
+	 * This cmpxchg() executes a full barrier, which pairs with the full
+	 * barrier executed by the wakeup in wake_up_q().
 	 */
 	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
 		return;
@@ -441,8 +441,8 @@ void wake_up_q(struct wake_q_head *head)
 		task->wake_q.next = NULL;
 
 		/*
-		 * wake_up_process() implies a wmb() to pair with the queueing
-		 * in wake_q_add() so as not to miss wakeups.
+		 * wake_up_process() executes a full barrier, which pairs with
+		 * the queueing in wake_q_add() so as not to miss wakeups.
 		 */
 		wake_up_process(task);
 		put_task_struct(task);
@@ -1879,8 +1879,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  *     rq(c1)->lock (if not at the same time, then in that order).
  *  C) LOCK of the rq(c1)->lock scheduling in task
  *
- * Transitivity guarantees that B happens after A and C after B.
- * Note: we only require RCpc transitivity.
+ * Release/acquire chaining guarantees that B happens after A and C after B.
  * Note: the CPU doing B need not be c0 or c1
  *
  * Example:
@@ -1942,16 +1941,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  *   UNLOCK rq(0)->lock
  *
  *
- * However; for wakeups there is a second guarantee we must provide, namely we
- * must observe the state that lead to our wakeup. That is, not only must our
- * task observe its own prior state, it must also observe the stores prior to
- * its wakeup.
- *
- * This means that any means of doing remote wakeups must order the CPU doing
- * the wakeup against the CPU the task is going to end up running on. This,
- * however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire).
- *
+ * However, for wakeups there is a second guarantee we must provide, namely we
+ * must ensure that CONDITION=1 done by the caller can not be reordered with
+ * accesses to the task state; see try_to_wake_up() and set_current_state().
  */
 
 /**
@@ -1967,6 +1959,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  * Atomic against schedule() which would dequeue a task, also see
  * set_current_state().
  *
+ * This function executes a full memory barrier before accessing the task
+ * state; see set_current_state().
+ *
  * Return: %true if @p->state changes (an actual wakeup was done),
  *	   %false otherwise.
  */
@@ -2141,8 +2136,7 @@ out:
  *
  * Return: 1 if the process was woken up, 0 if it was already running.
  *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
+ * This function executes a full memory barrier before accessing the task state.
  */
 int wake_up_process(struct task_struct *p)
 {
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index a7a2aaa3026a..870f97b313e3 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -134,8 +134,8 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: is directly passed to the wakeup function
  *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
  */
 void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -180,8 +180,8 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
  *
  * On UP it can prevent extra preemption.
  *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
  */
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
-- 
cgit v1.2.3


From c1a2f7f0c06454387c2cd7b93ff1491c715a8c69 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 16 Jul 2018 15:03:31 -0400
Subject: mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on
 nr_cpu_ids

The mm_struct always contains a cpumask bitmap, regardless of
CONFIG_CPUMASK_OFFSTACK. That means the first step can be to
simplify things, and simply have one bitmask at the end of the
mm_struct for the mm_cpumask.

This does necessitate moving everything else in mm_struct into
an anonymous sub-structure, which can be randomized when struct
randomization is enabled.

The second step is to determine the correct size for the
mm_struct slab object from the size of the mm_struct
(excluding the CPU bitmap) and the size the cpumask.

For init_mm we can simply allocate the maximum size this
kernel is compiled for, since we only have one init_mm
in the system, anyway.

Pointer magic by Mike Galbraith, to evade -Wstringop-overflow
getting confused by the dynamically sized array.

Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Rik van Riel <riel@surriel.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@fb.com
Cc: luto@kernel.org
Link: http://lkml.kernel.org/r/20180716190337.26133-2-riel@surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/firmware/efi/efi.c |   1 +
 include/linux/mm_types.h   | 241 +++++++++++++++++++++++----------------------
 kernel/fork.c              |  15 +--
 mm/init-mm.c               |  11 +++
 4 files changed, 145 insertions(+), 123 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 232f4915223b..7f0b19410a95 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -82,6 +82,7 @@ struct mm_struct efi_mm = {
 	.mmap_sem		= __RWSEM_INITIALIZER(efi_mm.mmap_sem),
 	.page_table_lock	= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
 	.mmlist			= LIST_HEAD_INIT(efi_mm.mmlist),
+	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS)] = 0},
 };
 
 static bool disable_runtime;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 99ce070e7dcb..efdc24dd9e97 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,176 +335,183 @@ struct core_state {
 
 struct kioctx_table;
 struct mm_struct {
-	struct vm_area_struct *mmap;		/* list of VMAs */
-	struct rb_root mm_rb;
-	u32 vmacache_seqnum;                   /* per-thread vmacache */
+	struct {
+		struct vm_area_struct *mmap;		/* list of VMAs */
+		struct rb_root mm_rb;
+		u32 vmacache_seqnum;                   /* per-thread vmacache */
 #ifdef CONFIG_MMU
-	unsigned long (*get_unmapped_area) (struct file *filp,
+		unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
 #endif
-	unsigned long mmap_base;		/* base of mmap area */
-	unsigned long mmap_legacy_base;         /* base of mmap area in bottom-up allocations */
+		unsigned long mmap_base;	/* base of mmap area */
+		unsigned long mmap_legacy_base;	/* base of mmap area in bottom-up allocations */
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
-	/* Base adresses for compatible mmap() */
-	unsigned long mmap_compat_base;
-	unsigned long mmap_compat_legacy_base;
+		/* Base adresses for compatible mmap() */
+		unsigned long mmap_compat_base;
+		unsigned long mmap_compat_legacy_base;
 #endif
-	unsigned long task_size;		/* size of task vm space */
-	unsigned long highest_vm_end;		/* highest vma end address */
-	pgd_t * pgd;
-
-	/**
-	 * @mm_users: The number of users including userspace.
-	 *
-	 * Use mmget()/mmget_not_zero()/mmput() to modify. When this drops
-	 * to 0 (i.e. when the task exits and there are no other temporary
-	 * reference holders), we also release a reference on @mm_count
-	 * (which may then free the &struct mm_struct if @mm_count also
-	 * drops to 0).
-	 */
-	atomic_t mm_users;
-
-	/**
-	 * @mm_count: The number of references to &struct mm_struct
-	 * (@mm_users count as 1).
-	 *
-	 * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
-	 * &struct mm_struct is freed.
-	 */
-	atomic_t mm_count;
+		unsigned long task_size;	/* size of task vm space */
+		unsigned long highest_vm_end;	/* highest vma end address */
+		pgd_t * pgd;
+
+		/**
+		 * @mm_users: The number of users including userspace.
+		 *
+		 * Use mmget()/mmget_not_zero()/mmput() to modify. When this
+		 * drops to 0 (i.e. when the task exits and there are no other
+		 * temporary reference holders), we also release a reference on
+		 * @mm_count (which may then free the &struct mm_struct if
+		 * @mm_count also drops to 0).
+		 */
+		atomic_t mm_users;
+
+		/**
+		 * @mm_count: The number of references to &struct mm_struct
+		 * (@mm_users count as 1).
+		 *
+		 * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
+		 * &struct mm_struct is freed.
+		 */
+		atomic_t mm_count;
 
 #ifdef CONFIG_MMU
-	atomic_long_t pgtables_bytes;		/* PTE page table pages */
+		atomic_long_t pgtables_bytes;	/* PTE page table pages */
 #endif
-	int map_count;				/* number of VMAs */
+		int map_count;			/* number of VMAs */
 
-	spinlock_t page_table_lock;		/* Protects page tables and some counters */
-	struct rw_semaphore mmap_sem;
+		spinlock_t page_table_lock; /* Protects page tables and some
+					     * counters
+					     */
+		struct rw_semaphore mmap_sem;
 
-	struct list_head mmlist;		/* List of maybe swapped mm's.	These are globally strung
-						 * together off init_mm.mmlist, and are protected
-						 * by mmlist_lock
-						 */
+		struct list_head mmlist; /* List of maybe swapped mm's.	These
+					  * are globally strung together off
+					  * init_mm.mmlist, and are protected
+					  * by mmlist_lock
+					  */
 
 
-	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
-	unsigned long hiwater_vm;	/* High-water virtual memory usage */
+		unsigned long hiwater_rss; /* High-watermark of RSS usage */
+		unsigned long hiwater_vm;  /* High-water virtual memory usage */
 
-	unsigned long total_vm;		/* Total pages mapped */
-	unsigned long locked_vm;	/* Pages that have PG_mlocked set */
-	unsigned long pinned_vm;	/* Refcount permanently increased */
-	unsigned long data_vm;		/* VM_WRITE & ~VM_SHARED & ~VM_STACK */
-	unsigned long exec_vm;		/* VM_EXEC & ~VM_WRITE & ~VM_STACK */
-	unsigned long stack_vm;		/* VM_STACK */
-	unsigned long def_flags;
+		unsigned long total_vm;	   /* Total pages mapped */
+		unsigned long locked_vm;   /* Pages that have PG_mlocked set */
+		unsigned long pinned_vm;   /* Refcount permanently increased */
+		unsigned long data_vm;	   /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
+		unsigned long exec_vm;	   /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
+		unsigned long stack_vm;	   /* VM_STACK */
+		unsigned long def_flags;
 
-	spinlock_t arg_lock; /* protect the below fields */
-	unsigned long start_code, end_code, start_data, end_data;
-	unsigned long start_brk, brk, start_stack;
-	unsigned long arg_start, arg_end, env_start, env_end;
+		spinlock_t arg_lock; /* protect the below fields */
+		unsigned long start_code, end_code, start_data, end_data;
+		unsigned long start_brk, brk, start_stack;
+		unsigned long arg_start, arg_end, env_start, env_end;
 
-	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
+		unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
-	/*
-	 * Special counters, in some configurations protected by the
-	 * page_table_lock, in other configurations by being atomic.
-	 */
-	struct mm_rss_stat rss_stat;
-
-	struct linux_binfmt *binfmt;
+		/*
+		 * Special counters, in some configurations protected by the
+		 * page_table_lock, in other configurations by being atomic.
+		 */
+		struct mm_rss_stat rss_stat;
 
-	cpumask_var_t cpu_vm_mask_var;
+		struct linux_binfmt *binfmt;
 
-	/* Architecture-specific MM context */
-	mm_context_t context;
+		/* Architecture-specific MM context */
+		mm_context_t context;
 
-	unsigned long flags; /* Must use atomic bitops to access the bits */
+		unsigned long flags; /* Must use atomic bitops to access */
 
-	struct core_state *core_state; /* coredumping support */
+		struct core_state *core_state; /* coredumping support */
 #ifdef CONFIG_MEMBARRIER
-	atomic_t membarrier_state;
+		atomic_t membarrier_state;
 #endif
 #ifdef CONFIG_AIO
-	spinlock_t			ioctx_lock;
-	struct kioctx_table __rcu	*ioctx_table;
+		spinlock_t			ioctx_lock;
+		struct kioctx_table __rcu	*ioctx_table;
 #endif
 #ifdef CONFIG_MEMCG
-	/*
-	 * "owner" points to a task that is regarded as the canonical
-	 * user/owner of this mm. All of the following must be true in
-	 * order for it to be changed:
-	 *
-	 * current == mm->owner
-	 * current->mm != mm
-	 * new_owner->mm == mm
-	 * new_owner->alloc_lock is held
-	 */
-	struct task_struct __rcu *owner;
+		/*
+		 * "owner" points to a task that is regarded as the canonical
+		 * user/owner of this mm. All of the following must be true in
+		 * order for it to be changed:
+		 *
+		 * current == mm->owner
+		 * current->mm != mm
+		 * new_owner->mm == mm
+		 * new_owner->alloc_lock is held
+		 */
+		struct task_struct __rcu *owner;
 #endif
-	struct user_namespace *user_ns;
+		struct user_namespace *user_ns;
 
-	/* store ref to file /proc/<pid>/exe symlink points to */
-	struct file __rcu *exe_file;
+		/* store ref to file /proc/<pid>/exe symlink points to */
+		struct file __rcu *exe_file;
 #ifdef CONFIG_MMU_NOTIFIER
-	struct mmu_notifier_mm *mmu_notifier_mm;
+		struct mmu_notifier_mm *mmu_notifier_mm;
 #endif
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
-	pgtable_t pmd_huge_pte; /* protected by page_table_lock */
-#endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	struct cpumask cpumask_allocation;
+		pgtable_t pmd_huge_pte; /* protected by page_table_lock */
 #endif
 #ifdef CONFIG_NUMA_BALANCING
-	/*
-	 * numa_next_scan is the next time that the PTEs will be marked
-	 * pte_numa. NUMA hinting faults will gather statistics and migrate
-	 * pages to new nodes if necessary.
-	 */
-	unsigned long numa_next_scan;
+		/*
+		 * numa_next_scan is the next time that the PTEs will be marked
+		 * pte_numa. NUMA hinting faults will gather statistics and
+		 * migrate pages to new nodes if necessary.
+		 */
+		unsigned long numa_next_scan;
 
-	/* Restart point for scanning and setting pte_numa */
-	unsigned long numa_scan_offset;
+		/* Restart point for scanning and setting pte_numa */
+		unsigned long numa_scan_offset;
 
-	/* numa_scan_seq prevents two threads setting pte_numa */
-	int numa_scan_seq;
+		/* numa_scan_seq prevents two threads setting pte_numa */
+		int numa_scan_seq;
 #endif
-	/*
-	 * An operation with batched TLB flushing is going on. Anything that
-	 * can move process memory needs to flush the TLB when moving a
-	 * PROT_NONE or PROT_NUMA mapped page.
-	 */
-	atomic_t tlb_flush_pending;
+		/*
+		 * An operation with batched TLB flushing is going on. Anything
+		 * that can move process memory needs to flush the TLB when
+		 * moving a PROT_NONE or PROT_NUMA mapped page.
+		 */
+		atomic_t tlb_flush_pending;
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-	/* See flush_tlb_batched_pending() */
-	bool tlb_flush_batched;
+		/* See flush_tlb_batched_pending() */
+		bool tlb_flush_batched;
 #endif
-	struct uprobes_state uprobes_state;
+		struct uprobes_state uprobes_state;
 #ifdef CONFIG_HUGETLB_PAGE
-	atomic_long_t hugetlb_usage;
+		atomic_long_t hugetlb_usage;
 #endif
-	struct work_struct async_put_work;
+		struct work_struct async_put_work;
 
 #if IS_ENABLED(CONFIG_HMM)
-	/* HMM needs to track a few things per mm */
-	struct hmm *hmm;
+		/* HMM needs to track a few things per mm */
+		struct hmm *hmm;
 #endif
-} __randomize_layout;
+	} __randomize_layout;
+
+	/*
+	 * The mm_cpumask needs to be at the end of mm_struct, because it
+	 * is dynamically sized based on nr_cpu_ids.
+	 */
+	unsigned long cpu_bitmap[];
+};
 
 extern struct mm_struct init_mm;
 
+/* Pointer magic because the dynamic array size confuses some compilers. */
 static inline void mm_init_cpumask(struct mm_struct *mm)
 {
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	mm->cpu_vm_mask_var = &mm->cpumask_allocation;
-#endif
-	cpumask_clear(mm->cpu_vm_mask_var);
+	unsigned long cpu_bitmap = (unsigned long)mm;
+
+	cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
+	cpumask_clear((struct cpumask *)cpu_bitmap);
 }
 
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
 static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 {
-	return mm->cpu_vm_mask_var;
+	return (struct cpumask *)&mm->cpu_bitmap;
 }
 
 struct mmu_gather;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9440d61b925c..5b64c1b8461e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2253,6 +2253,8 @@ static void sighand_ctor(void *data)
 
 void __init proc_caches_init(void)
 {
+	unsigned int mm_size;
+
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -2269,15 +2271,16 @@ void __init proc_caches_init(void)
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
+
 	/*
-	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
-	 * whole struct cpumask for the OFFSTACK case. We could change
-	 * this to *only* allocate as much of it as required by the
-	 * maximum number of CPU's we can ever have.  The cpumask_allocation
-	 * is at the end of the structure, exactly for that reason.
+	 * The mm_cpumask is located at the end of mm_struct, and is
+	 * dynamically sized based on the maximum CPU number this system
+	 * can have, taking hotplug into account (nr_cpu_ids).
 	 */
+	mm_size = sizeof(struct mm_struct) + cpumask_size();
+
 	mm_cachep = kmem_cache_create_usercopy("mm_struct",
-			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			offsetof(struct mm_struct, saved_auxv),
 			sizeof_field(struct mm_struct, saved_auxv),
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f0179c9c04c2..a787a319211e 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -15,6 +15,16 @@
 #define INIT_MM_CONTEXT(name)
 #endif
 
+/*
+ * For dynamically allocated mm_structs, there is a dynamically sized cpumask
+ * at the end of the structure, the size of which depends on the maximum CPU
+ * number the system can see. That way we allocate only as much memory for
+ * mm_cpumask() as needed for the hundreds, or thousands of processes that
+ * a system typically runs.
+ *
+ * Since there is only one init_mm in the entire system, keep it simple
+ * and size this cpu_bitmask to NR_CPUS.
+ */
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
 	.pgd		= swapper_pg_dir,
@@ -25,5 +35,6 @@ struct mm_struct init_mm = {
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.user_ns	= &init_user_ns,
+	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
 	INIT_MM_CONTEXT(init_mm)
 };
-- 
cgit v1.2.3


From 4aaf448fa9754e2d5ee188d32327b24ffc15ca4d Mon Sep 17 00:00:00 2001
From: Jim Qu <Jim.Qu@amd.com>
Date: Tue, 17 Jul 2018 16:20:50 +0800
Subject: vga_switcheroo: set audio client id according to bound GPU id

On modern laptop, there are more and more platforms
have two GPUs, and each of them maybe have audio codec
for HDMP/DP output. For some dGPU which is no output,
audio codec usually is disabled.

In currect HDA audio driver, it will set all codec as
VGA_SWITCHEROO_DIS, the audio which is binded to UMA
will be suspended if user use debugfs to contorl power

In HDA driver side, it is difficult to know which GPU
the audio has binded to. So set the bound gpu pci dev
to vga_switcheroo.

if the audio client is not the third registration, audio
id will set in vga_switcheroo enable function. if the
audio client is the last registration when vga_switcheroo
_ready() get true, we should get audio client id from bound
GPU directly.

Signed-off-by: Jim Qu <Jim.Qu@amd.com>
Reviewed-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 drivers/gpu/vga/vga_switcheroo.c | 63 +++++++++++++++++++++++++++++++++-------
 include/linux/vga_switcheroo.h   |  8 ++---
 sound/pci/hda/hda_intel.c        | 11 +++----
 3 files changed, 62 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/vga/vga_switcheroo.c b/drivers/gpu/vga/vga_switcheroo.c
index fc4adf3d34e8..a96bf46bc483 100644
--- a/drivers/gpu/vga/vga_switcheroo.c
+++ b/drivers/gpu/vga/vga_switcheroo.c
@@ -103,9 +103,11 @@
  *	runtime pm. If true, writing ON and OFF to the vga_switcheroo debugfs
  *	interface is a no-op so as not to interfere with runtime pm
  * @list: client list
+ * @vga_dev: pci device, indicate which GPU is bound to current audio client
  *
  * Registered client. A client can be either a GPU or an audio device on a GPU.
- * For audio clients, the @fb_info and @active members are bogus.
+ * For audio clients, the @fb_info and @active members are bogus. For GPU
+ * clients, the @vga_dev is bogus.
  */
 struct vga_switcheroo_client {
 	struct pci_dev *pdev;
@@ -116,6 +118,7 @@ struct vga_switcheroo_client {
 	bool active;
 	bool driver_power_control;
 	struct list_head list;
+	struct pci_dev *vga_dev;
 };
 
 /*
@@ -161,9 +164,8 @@ struct vgasr_priv {
 };
 
 #define ID_BIT_AUDIO		0x100
-#define client_is_audio(c)	((c)->id & ID_BIT_AUDIO)
-#define client_is_vga(c)	((c)->id == VGA_SWITCHEROO_UNKNOWN_ID || \
-				 !client_is_audio(c))
+#define client_is_audio(c)		((c)->id & ID_BIT_AUDIO)
+#define client_is_vga(c)		(!client_is_audio(c))
 #define client_id(c)		((c)->id & ~ID_BIT_AUDIO)
 
 static int vga_switcheroo_debugfs_init(struct vgasr_priv *priv);
@@ -192,14 +194,29 @@ static void vga_switcheroo_enable(void)
 		vgasr_priv.handler->init();
 
 	list_for_each_entry(client, &vgasr_priv.clients, list) {
-		if (client->id != VGA_SWITCHEROO_UNKNOWN_ID)
+		if (!client_is_vga(client) ||
+		     client_id(client) != VGA_SWITCHEROO_UNKNOWN_ID)
 			continue;
+
 		ret = vgasr_priv.handler->get_client_id(client->pdev);
 		if (ret < 0)
 			return;
 
 		client->id = ret;
 	}
+
+	list_for_each_entry(client, &vgasr_priv.clients, list) {
+		if (!client_is_audio(client) ||
+		     client_id(client) != VGA_SWITCHEROO_UNKNOWN_ID)
+			continue;
+
+		ret = vgasr_priv.handler->get_client_id(client->vga_dev);
+		if (ret < 0)
+			return;
+
+		client->id = ret | ID_BIT_AUDIO;
+	}
+
 	vga_switcheroo_debugfs_init(&vgasr_priv);
 	vgasr_priv.active = true;
 }
@@ -272,7 +289,9 @@ EXPORT_SYMBOL(vga_switcheroo_handler_flags);
 
 static int register_client(struct pci_dev *pdev,
 			   const struct vga_switcheroo_client_ops *ops,
-			   enum vga_switcheroo_client_id id, bool active,
+			   enum vga_switcheroo_client_id id,
+			   struct pci_dev *vga_dev,
+			   bool active,
 			   bool driver_power_control)
 {
 	struct vga_switcheroo_client *client;
@@ -287,6 +306,7 @@ static int register_client(struct pci_dev *pdev,
 	client->id = id;
 	client->active = active;
 	client->driver_power_control = driver_power_control;
+	client->vga_dev = vga_dev;
 
 	mutex_lock(&vgasr_mutex);
 	list_add_tail(&client->list, &vgasr_priv.clients);
@@ -319,7 +339,7 @@ int vga_switcheroo_register_client(struct pci_dev *pdev,
 				   const struct vga_switcheroo_client_ops *ops,
 				   bool driver_power_control)
 {
-	return register_client(pdev, ops, VGA_SWITCHEROO_UNKNOWN_ID,
+	return register_client(pdev, ops, VGA_SWITCHEROO_UNKNOWN_ID, NULL,
 			       pdev == vga_default_device(),
 			       driver_power_control);
 }
@@ -329,19 +349,40 @@ EXPORT_SYMBOL(vga_switcheroo_register_client);
  * vga_switcheroo_register_audio_client - register audio client
  * @pdev: client pci device
  * @ops: client callbacks
- * @id: client identifier
+ * @vga_dev:  pci device which is bound to current audio client
  *
  * Register audio client (audio device on a GPU). The client is assumed
  * to use runtime PM. Beforehand, vga_switcheroo_client_probe_defer()
  * shall be called to ensure that all prerequisites are met.
  *
- * Return: 0 on success, -ENOMEM on memory allocation error.
+ * Return: 0 on success, -ENOMEM on memory allocation error, -EINVAL on getting
+ * client id error.
  */
 int vga_switcheroo_register_audio_client(struct pci_dev *pdev,
 			const struct vga_switcheroo_client_ops *ops,
-			enum vga_switcheroo_client_id id)
+			struct pci_dev *vga_dev)
 {
-	return register_client(pdev, ops, id | ID_BIT_AUDIO, false, true);
+	enum vga_switcheroo_client_id id = VGA_SWITCHEROO_UNKNOWN_ID;
+
+	/*
+	 * if vga_switcheroo has enabled, that mean two GPU clients and also
+	 * handler are registered. Get audio client id from bound GPU client
+	 * id directly, otherwise, set it as VGA_SWITCHEROO_UNKNOWN_ID,
+	 * it will set to correct id in later when vga_switcheroo_enable()
+	 * is called.
+	 */
+	mutex_lock(&vgasr_mutex);
+	if (vgasr_priv.active) {
+		id = vgasr_priv.handler->get_client_id(vga_dev);
+		if (id < 0) {
+			mutex_unlock(&vgasr_mutex);
+			return -EINVAL;
+		}
+	}
+	mutex_unlock(&vgasr_mutex);
+
+	return register_client(pdev, ops, id | ID_BIT_AUDIO, vga_dev,
+			       false, true);
 }
 EXPORT_SYMBOL(vga_switcheroo_register_audio_client);
 
diff --git a/include/linux/vga_switcheroo.h b/include/linux/vga_switcheroo.h
index 77f0f0af3a71..a34539b7f750 100644
--- a/include/linux/vga_switcheroo.h
+++ b/include/linux/vga_switcheroo.h
@@ -84,8 +84,8 @@ enum vga_switcheroo_state {
  * Client identifier. Audio clients use the same identifier & 0x100.
  */
 enum vga_switcheroo_client_id {
-	VGA_SWITCHEROO_UNKNOWN_ID = -1,
-	VGA_SWITCHEROO_IGD,
+	VGA_SWITCHEROO_UNKNOWN_ID = 0x1000,
+	VGA_SWITCHEROO_IGD = 0,
 	VGA_SWITCHEROO_DIS,
 	VGA_SWITCHEROO_MAX_CLIENTS,
 };
@@ -151,7 +151,7 @@ int vga_switcheroo_register_client(struct pci_dev *dev,
 				   bool driver_power_control);
 int vga_switcheroo_register_audio_client(struct pci_dev *pdev,
 					 const struct vga_switcheroo_client_ops *ops,
-					 enum vga_switcheroo_client_id id);
+					 struct pci_dev *vga_dev);
 
 void vga_switcheroo_client_fb_set(struct pci_dev *dev,
 				  struct fb_info *info);
@@ -180,7 +180,7 @@ static inline int vga_switcheroo_register_handler(const struct vga_switcheroo_ha
 		enum vga_switcheroo_handler_flags_t handler_flags) { return 0; }
 static inline int vga_switcheroo_register_audio_client(struct pci_dev *pdev,
 	const struct vga_switcheroo_client_ops *ops,
-	enum vga_switcheroo_client_id id) { return 0; }
+	struct pci_dev *vga_dev) { return 0; }
 static inline void vga_switcheroo_unregister_handler(void) {}
 static inline enum vga_switcheroo_handler_flags_t vga_switcheroo_handler_flags(void) { return 0; }
 static inline int vga_switcheroo_lock_ddc(struct pci_dev *pdev) { return -ENODEV; }
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 1ae1850b3bfd..1967a5537d68 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -1319,15 +1319,16 @@ static const struct vga_switcheroo_client_ops azx_vs_ops = {
 static int register_vga_switcheroo(struct azx *chip)
 {
 	struct hda_intel *hda = container_of(chip, struct hda_intel, chip);
+	struct pci_dev *p;
 	int err;
 
 	if (!hda->use_vga_switcheroo)
 		return 0;
-	/* FIXME: currently only handling DIS controller
-	 * is there any machine with two switchable HDMI audio controllers?
-	 */
-	err = vga_switcheroo_register_audio_client(chip->pci, &azx_vs_ops,
-						   VGA_SWITCHEROO_DIS);
+
+	p = get_bound_vga(chip->pci);
+	err = vga_switcheroo_register_audio_client(chip->pci, &azx_vs_ops, p);
+	pci_dev_put(p);
+
 	if (err < 0)
 		return err;
 	hda->vga_switcheroo_registered = 1;
-- 
cgit v1.2.3


From 9afb719e7046c4f2462278862ab3db2961cc141c Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Thu, 5 Jul 2018 17:49:37 -0700
Subject: kbuild: Add build salt to the kernel and modules

In Fedora, the debug information is packaged separately (foo-debuginfo) and
can be installed separately. There's been a long standing issue where only
one version of a debuginfo info package can be installed at a time. There's
been an effort for Fedora for parallel debuginfo to rectify this problem.

Part of the requirement to allow parallel debuginfo to work is that build ids
are unique between builds. The existing upstream rpm implementation ensures
this by re-calculating the build-id using the version and release as a
seed. This doesn't work 100% for the kernel because of the vDSO which is
its own binary and doesn't get updated when embedded.

Fix this by adding some data in an ELF note for both the kernel and modules.
The data is controlled via a Kconfig option so distributions can set it
to an appropriate value to ensure uniqueness between builds.

Suggested-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 include/linux/build-salt.h | 20 ++++++++++++++++++++
 init/Kconfig               |  9 +++++++++
 init/version.c             |  3 +++
 scripts/mod/modpost.c      |  3 +++
 4 files changed, 35 insertions(+)
 create mode 100644 include/linux/build-salt.h

(limited to 'include/linux')

diff --git a/include/linux/build-salt.h b/include/linux/build-salt.h
new file mode 100644
index 000000000000..bb007bd05e7a
--- /dev/null
+++ b/include/linux/build-salt.h
@@ -0,0 +1,20 @@
+#ifndef __BUILD_SALT_H
+#define __BUILD_SALT_H
+
+#include <linux/elfnote.h>
+
+#define LINUX_ELFNOTE_BUILD_SALT       0x100
+
+#ifdef __ASSEMBLER__
+
+#define BUILD_SALT \
+       ELFNOTE(Linux, LINUX_ELFNOTE_BUILD_SALT, .asciz CONFIG_BUILD_SALT)
+
+#else
+
+#define BUILD_SALT \
+       ELFNOTE32("Linux", LINUX_ELFNOTE_BUILD_SALT, CONFIG_BUILD_SALT)
+
+#endif
+
+#endif /* __BUILD_SALT_H */
diff --git a/init/Kconfig b/init/Kconfig
index 041f3a022122..d39b31484c52 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -107,6 +107,15 @@ config LOCALVERSION_AUTO
 
 	  which is done within the script "scripts/setlocalversion".)
 
+config BUILD_SALT
+       string "Build ID Salt"
+       default ""
+       help
+          The build ID is used to link binaries and their debug info. Setting
+          this option will use the value in the calculation of the build id.
+          This is mostly useful for distributions which want to ensure the
+          build is unique between builds. It's safe to leave the default.
+
 config HAVE_KERNEL_GZIP
 	bool
 
diff --git a/init/version.c b/init/version.c
index bfb4e3f4955e..ef4012ec4375 100644
--- a/init/version.c
+++ b/init/version.c
@@ -7,6 +7,7 @@
  */
 
 #include <generated/compile.h>
+#include <linux/build-salt.h>
 #include <linux/export.h>
 #include <linux/uts.h>
 #include <linux/utsname.h>
@@ -49,3 +50,5 @@ const char linux_proc_banner[] =
 	"%s version %s"
 	" (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ")"
 	" (" LINUX_COMPILER ") %s\n";
+
+BUILD_SALT;
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 1663fb19343a..dc6d714e4dcb 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -2125,10 +2125,13 @@ static int check_modname_len(struct module *mod)
  **/
 static void add_header(struct buffer *b, struct module *mod)
 {
+	buf_printf(b, "#include <linux/build-salt.h>\n");
 	buf_printf(b, "#include <linux/module.h>\n");
 	buf_printf(b, "#include <linux/vermagic.h>\n");
 	buf_printf(b, "#include <linux/compiler.h>\n");
 	buf_printf(b, "\n");
+	buf_printf(b, "BUILD_SALT;\n");
+	buf_printf(b, "\n");
 	buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n");
 	buf_printf(b, "MODULE_INFO(name, KBUILD_MODNAME);\n");
 	buf_printf(b, "\n");
-- 
cgit v1.2.3


From 2fe31e43124040a67495fa81f7ac67bc4c09ebc8 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Tue, 17 Jul 2018 21:48:10 +0200
Subject: hwmon: Add missing HWMON_T_LCRIT_ALARM define

The enum hwmon_temp_lcrit_alarm exists, but the BIT definition is
missing.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/hwmon.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index e5fd2707b6df..1b74ad11a5a4 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -93,6 +93,7 @@ enum hwmon_temp_attributes {
 #define HWMON_T_MIN_ALARM	BIT(hwmon_temp_min_alarm)
 #define HWMON_T_MAX_ALARM	BIT(hwmon_temp_max_alarm)
 #define HWMON_T_CRIT_ALARM	BIT(hwmon_temp_crit_alarm)
+#define HWMON_T_LCRIT_ALARM	BIT(hwmon_temp_lcrit_alarm)
 #define HWMON_T_EMERGENCY_ALARM	BIT(hwmon_temp_emergency_alarm)
 #define HWMON_T_FAULT		BIT(hwmon_temp_fault)
 #define HWMON_T_OFFSET		BIT(hwmon_temp_offset)
-- 
cgit v1.2.3


From aa7f29b07c8702127124d0522e3cd46850cdbc41 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Tue, 17 Jul 2018 21:48:11 +0200
Subject: hwmon: Add support for power min, lcrit, min_alarm and lcrit_alarm

Some sensors support reporting minimal and lower critical power, as
well as alarms when these thresholds are reached. Add support for
these attributes to the hwmon core.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/hwmon/hwmon.c | 4 ++++
 include/linux/hwmon.h | 8 ++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c
index e88c01961948..33d51281272b 100644
--- a/drivers/hwmon/hwmon.c
+++ b/drivers/hwmon/hwmon.c
@@ -394,12 +394,16 @@ static const char * const hwmon_power_attr_templates[] = {
 	[hwmon_power_cap_hyst] = "power%d_cap_hyst",
 	[hwmon_power_cap_max] = "power%d_cap_max",
 	[hwmon_power_cap_min] = "power%d_cap_min",
+	[hwmon_power_min] = "power%d_min",
 	[hwmon_power_max] = "power%d_max",
+	[hwmon_power_lcrit] = "power%d_lcrit",
 	[hwmon_power_crit] = "power%d_crit",
 	[hwmon_power_label] = "power%d_label",
 	[hwmon_power_alarm] = "power%d_alarm",
 	[hwmon_power_cap_alarm] = "power%d_cap_alarm",
+	[hwmon_power_min_alarm] = "power%d_min_alarm",
 	[hwmon_power_max_alarm] = "power%d_max_alarm",
+	[hwmon_power_lcrit_alarm] = "power%d_lcrit_alarm",
 	[hwmon_power_crit_alarm] = "power%d_crit_alarm",
 };
 
diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index 1b74ad11a5a4..b217101ca76e 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -188,12 +188,16 @@ enum hwmon_power_attributes {
 	hwmon_power_cap_hyst,
 	hwmon_power_cap_max,
 	hwmon_power_cap_min,
+	hwmon_power_min,
 	hwmon_power_max,
 	hwmon_power_crit,
+	hwmon_power_lcrit,
 	hwmon_power_label,
 	hwmon_power_alarm,
 	hwmon_power_cap_alarm,
+	hwmon_power_min_alarm,
 	hwmon_power_max_alarm,
+	hwmon_power_lcrit_alarm,
 	hwmon_power_crit_alarm,
 };
 
@@ -214,12 +218,16 @@ enum hwmon_power_attributes {
 #define HWMON_P_CAP_HYST		BIT(hwmon_power_cap_hyst)
 #define HWMON_P_CAP_MAX			BIT(hwmon_power_cap_max)
 #define HWMON_P_CAP_MIN			BIT(hwmon_power_cap_min)
+#define HWMON_P_MIN			BIT(hwmon_power_min)
 #define HWMON_P_MAX			BIT(hwmon_power_max)
+#define HWMON_P_LCRIT			BIT(hwmon_power_lcrit)
 #define HWMON_P_CRIT			BIT(hwmon_power_crit)
 #define HWMON_P_LABEL			BIT(hwmon_power_label)
 #define HWMON_P_ALARM			BIT(hwmon_power_alarm)
 #define HWMON_P_CAP_ALARM		BIT(hwmon_power_cap_alarm)
+#define HWMON_P_MIN_ALARM		BIT(hwmon_power_max_alarm)
 #define HWMON_P_MAX_ALARM		BIT(hwmon_power_max_alarm)
+#define HWMON_P_LCRIT_ALARM		BIT(hwmon_power_lcrit_alarm)
 #define HWMON_P_CRIT_ALARM		BIT(hwmon_power_crit_alarm)
 
 enum hwmon_energy_attributes {
-- 
cgit v1.2.3


From dcb5d0fcaa1ddd0af9c18ef51e6b05e38a6cec71 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Tue, 17 Jul 2018 21:48:12 +0200
Subject: hwmon: Add helper to tell if a char is invalid in a name

HWMON device names are not allowed to contain "-* \t\n". Add a helper
which will return true if passed an invalid character. It can be used
to massage a string into a hwmon compatible name by replacing invalid
characters with '_'.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/hwmon.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index b217101ca76e..9493d4a388db 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -398,4 +398,27 @@ devm_hwmon_device_register_with_info(struct device *dev,
 void hwmon_device_unregister(struct device *dev);
 void devm_hwmon_device_unregister(struct device *dev);
 
+/**
+ * hwmon_is_bad_char - Is the char invalid in a hwmon name
+ * @ch: the char to be considered
+ *
+ * hwmon_is_bad_char() can be used to determine if the given character
+ * may not be used in a hwmon name.
+ *
+ * Returns true if the char is invalid, false otherwise.
+ */
+static inline bool hwmon_is_bad_char(const char ch)
+{
+	switch (ch) {
+	case '-':
+	case '*':
+	case ' ':
+	case '\t':
+	case '\n':
+		return true;
+	default:
+		return false;
+	}
+}
+
 #endif
-- 
cgit v1.2.3


From 1323061a018a7514287894a552c4ec2a5f0cb0cd Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Tue, 17 Jul 2018 21:48:13 +0200
Subject: net: phy: sfp: Add HWMON support for module sensors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SFP modules can contain a number of sensors. The EEPROM also contains
recommended alarm and critical values for each sensor, and indications
of if these have been exceeded. Export this information via
HWMON. Currently temperature, VCC, bias current, transmit power, and
possibly receiver power is supported.

The sensors in the modules can either return calibrate or uncalibrated
values. Uncalibrated values need to be manipulated, using coefficients
provided in the SFP EEPROM. Uncalibrated receive power values require
floating point maths in order to calibrate them. Performing this in
the kernel is hard. So if the SFP module indicates it uses
uncalibrated values, RX power is not made available.

With this hwmon device, it is possible to view the sensor values using
lm-sensors programs:

in0:          +3.29 V  (crit min =  +2.90 V, min =  +3.00 V)
                       (max =  +3.60 V, crit max =  +3.70 V)
temp1:        +33.0°C  (low  =  -5.0°C, high = +80.0°C)
                       (crit low = -10.0°C, crit = +85.0°C)
power1:      1000.00 nW (max = 794.00 uW, min =  50.00 uW)  ALARM (LCRIT)
                       (lcrit =  40.00 uW, crit = 1000.00 uW)
curr1:        +0.00 A  (crit min =  +0.00 A, min =  +0.00 A)  ALARM (LCRIT, MIN)
                       (max =  +0.01 A, crit max =  +0.01 A)

The scaling sensors performs on the bias current is not particularly
good. The raw values are more useful:

curr1:
  curr1_input: 0.000
  curr1_min: 0.002
  curr1_max: 0.010
  curr1_lcrit: 0.000
  curr1_crit: 0.011
  curr1_min_alarm: 1.000
  curr1_max_alarm: 0.000
  curr1_lcrit_alarm: 1.000
  curr1_crit_alarm: 0.000

In order to keep the I2C overhead to a minimum, the constant values,
such as limits and calibration coefficients are read once at module
insertion time. Thus only reading *_input and *_alarm properties
requires i2c read operations.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig |   1 +
 drivers/net/phy/sfp.c   | 727 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sfp.h     |  72 ++++-
 3 files changed, 799 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index ceede09a2845..9beac427f9e8 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -215,6 +215,7 @@ config SFP
 	tristate "SFP cage support"
 	depends on I2C && PHYLINK
 	select MDIO_I2C
+	imply HWMON
 
 config AMD_PHY
 	tristate "AMD PHYs"
diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index c4c92db86dfa..5661226cf75b 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -1,5 +1,7 @@
+#include <linux/ctype.h>
 #include <linux/delay.h>
 #include <linux/gpio/consumer.h>
+#include <linux/hwmon.h>
 #include <linux/i2c.h>
 #include <linux/interrupt.h>
 #include <linux/jiffies.h>
@@ -131,6 +133,12 @@ struct sfp {
 	unsigned int sm_retries;
 
 	struct sfp_eeprom_id id;
+#if IS_ENABLED(CONFIG_HWMON)
+	struct sfp_diag diag;
+	struct device *hwmon_dev;
+	char *hwmon_name;
+#endif
+
 };
 
 static bool sff_module_supported(const struct sfp_eeprom_id *id)
@@ -316,6 +324,719 @@ static unsigned int sfp_check(void *buf, size_t len)
 	return check;
 }
 
+/* hwmon */
+#if IS_ENABLED(CONFIG_HWMON)
+static umode_t sfp_hwmon_is_visible(const void *data,
+				    enum hwmon_sensor_types type,
+				    u32 attr, int channel)
+{
+	const struct sfp *sfp = data;
+
+	switch (type) {
+	case hwmon_temp:
+		switch (attr) {
+		case hwmon_temp_input:
+		case hwmon_temp_min_alarm:
+		case hwmon_temp_max_alarm:
+		case hwmon_temp_lcrit_alarm:
+		case hwmon_temp_crit_alarm:
+		case hwmon_temp_min:
+		case hwmon_temp_max:
+		case hwmon_temp_lcrit:
+		case hwmon_temp_crit:
+			return 0444;
+		default:
+			return 0;
+		}
+	case hwmon_in:
+		switch (attr) {
+		case hwmon_in_input:
+		case hwmon_in_min_alarm:
+		case hwmon_in_max_alarm:
+		case hwmon_in_lcrit_alarm:
+		case hwmon_in_crit_alarm:
+		case hwmon_in_min:
+		case hwmon_in_max:
+		case hwmon_in_lcrit:
+		case hwmon_in_crit:
+			return 0444;
+		default:
+			return 0;
+		}
+	case hwmon_curr:
+		switch (attr) {
+		case hwmon_curr_input:
+		case hwmon_curr_min_alarm:
+		case hwmon_curr_max_alarm:
+		case hwmon_curr_lcrit_alarm:
+		case hwmon_curr_crit_alarm:
+		case hwmon_curr_min:
+		case hwmon_curr_max:
+		case hwmon_curr_lcrit:
+		case hwmon_curr_crit:
+			return 0444;
+		default:
+			return 0;
+		}
+	case hwmon_power:
+		/* External calibration of receive power requires
+		 * floating point arithmetic. Doing that in the kernel
+		 * is not easy, so just skip it. If the module does
+		 * not require external calibration, we can however
+		 * show receiver power, since FP is then not needed.
+		 */
+		if (sfp->id.ext.diagmon & SFP_DIAGMON_EXT_CAL &&
+		    channel == 1)
+			return 0;
+		switch (attr) {
+		case hwmon_power_input:
+		case hwmon_power_min_alarm:
+		case hwmon_power_max_alarm:
+		case hwmon_power_lcrit_alarm:
+		case hwmon_power_crit_alarm:
+		case hwmon_power_min:
+		case hwmon_power_max:
+		case hwmon_power_lcrit:
+		case hwmon_power_crit:
+			return 0444;
+		default:
+			return 0;
+		}
+	default:
+		return 0;
+	}
+}
+
+static int sfp_hwmon_read_sensor(struct sfp *sfp, int reg, long *value)
+{
+	__be16 val;
+	int err;
+
+	err = sfp_read(sfp, true, reg, &val, sizeof(val));
+	if (err < 0)
+		return err;
+
+	*value = be16_to_cpu(val);
+
+	return 0;
+}
+
+static void sfp_hwmon_to_rx_power(long *value)
+{
+	*value = DIV_ROUND_CLOSEST(*value, 100);
+}
+
+static void sfp_hwmon_calibrate(struct sfp *sfp, unsigned int slope, int offset,
+				long *value)
+{
+	if (sfp->id.ext.diagmon & SFP_DIAGMON_EXT_CAL)
+		*value = DIV_ROUND_CLOSEST(*value * slope, 256) + offset;
+}
+
+static void sfp_hwmon_calibrate_temp(struct sfp *sfp, long *value)
+{
+	sfp_hwmon_calibrate(sfp, be16_to_cpu(sfp->diag.cal_t_slope),
+			    be16_to_cpu(sfp->diag.cal_t_offset), value);
+
+	if (*value >= 0x8000)
+		*value -= 0x10000;
+
+	*value = DIV_ROUND_CLOSEST(*value * 1000, 256);
+}
+
+static void sfp_hwmon_calibrate_vcc(struct sfp *sfp, long *value)
+{
+	sfp_hwmon_calibrate(sfp, be16_to_cpu(sfp->diag.cal_v_slope),
+			    be16_to_cpu(sfp->diag.cal_v_offset), value);
+
+	*value = DIV_ROUND_CLOSEST(*value, 10);
+}
+
+static void sfp_hwmon_calibrate_bias(struct sfp *sfp, long *value)
+{
+	sfp_hwmon_calibrate(sfp, be16_to_cpu(sfp->diag.cal_txi_slope),
+			    be16_to_cpu(sfp->diag.cal_txi_offset), value);
+
+	*value = DIV_ROUND_CLOSEST(*value, 500);
+}
+
+static void sfp_hwmon_calibrate_tx_power(struct sfp *sfp, long *value)
+{
+	sfp_hwmon_calibrate(sfp, be16_to_cpu(sfp->diag.cal_txpwr_slope),
+			    be16_to_cpu(sfp->diag.cal_txpwr_offset), value);
+
+	*value = DIV_ROUND_CLOSEST(*value, 10);
+}
+
+static int sfp_hwmon_read_temp(struct sfp *sfp, int reg, long *value)
+{
+	int err;
+
+	err = sfp_hwmon_read_sensor(sfp, reg, value);
+	if (err < 0)
+		return err;
+
+	sfp_hwmon_calibrate_temp(sfp, value);
+
+	return 0;
+}
+
+static int sfp_hwmon_read_vcc(struct sfp *sfp, int reg, long *value)
+{
+	int err;
+
+	err = sfp_hwmon_read_sensor(sfp, reg, value);
+	if (err < 0)
+		return err;
+
+	sfp_hwmon_calibrate_vcc(sfp, value);
+
+	return 0;
+}
+
+static int sfp_hwmon_read_bias(struct sfp *sfp, int reg, long *value)
+{
+	int err;
+
+	err = sfp_hwmon_read_sensor(sfp, reg, value);
+	if (err < 0)
+		return err;
+
+	sfp_hwmon_calibrate_bias(sfp, value);
+
+	return 0;
+}
+
+static int sfp_hwmon_read_tx_power(struct sfp *sfp, int reg, long *value)
+{
+	int err;
+
+	err = sfp_hwmon_read_sensor(sfp, reg, value);
+	if (err < 0)
+		return err;
+
+	sfp_hwmon_calibrate_tx_power(sfp, value);
+
+	return 0;
+}
+
+static int sfp_hwmon_read_rx_power(struct sfp *sfp, int reg, long *value)
+{
+	int err;
+
+	err = sfp_hwmon_read_sensor(sfp, reg, value);
+	if (err < 0)
+		return err;
+
+	sfp_hwmon_to_rx_power(value);
+
+	return 0;
+}
+
+static int sfp_hwmon_temp(struct sfp *sfp, u32 attr, long *value)
+{
+	u8 status;
+	int err;
+
+	switch (attr) {
+	case hwmon_temp_input:
+		return sfp_hwmon_read_temp(sfp, SFP_TEMP, value);
+
+	case hwmon_temp_lcrit:
+		*value = be16_to_cpu(sfp->diag.temp_low_alarm);
+		sfp_hwmon_calibrate_temp(sfp, value);
+		return 0;
+
+	case hwmon_temp_min:
+		*value = be16_to_cpu(sfp->diag.temp_low_warn);
+		sfp_hwmon_calibrate_temp(sfp, value);
+		return 0;
+	case hwmon_temp_max:
+		*value = be16_to_cpu(sfp->diag.temp_high_warn);
+		sfp_hwmon_calibrate_temp(sfp, value);
+		return 0;
+
+	case hwmon_temp_crit:
+		*value = be16_to_cpu(sfp->diag.temp_high_alarm);
+		sfp_hwmon_calibrate_temp(sfp, value);
+		return 0;
+
+	case hwmon_temp_lcrit_alarm:
+		err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_ALARM0_TEMP_LOW);
+		return 0;
+
+	case hwmon_temp_min_alarm:
+		err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_WARN0_TEMP_LOW);
+		return 0;
+
+	case hwmon_temp_max_alarm:
+		err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_WARN0_TEMP_HIGH);
+		return 0;
+
+	case hwmon_temp_crit_alarm:
+		err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_ALARM0_TEMP_HIGH);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static int sfp_hwmon_vcc(struct sfp *sfp, u32 attr, long *value)
+{
+	u8 status;
+	int err;
+
+	switch (attr) {
+	case hwmon_in_input:
+		return sfp_hwmon_read_vcc(sfp, SFP_VCC, value);
+
+	case hwmon_in_lcrit:
+		*value = be16_to_cpu(sfp->diag.volt_low_alarm);
+		sfp_hwmon_calibrate_vcc(sfp, value);
+		return 0;
+
+	case hwmon_in_min:
+		*value = be16_to_cpu(sfp->diag.volt_low_warn);
+		sfp_hwmon_calibrate_vcc(sfp, value);
+		return 0;
+
+	case hwmon_in_max:
+		*value = be16_to_cpu(sfp->diag.volt_high_warn);
+		sfp_hwmon_calibrate_vcc(sfp, value);
+		return 0;
+
+	case hwmon_in_crit:
+		*value = be16_to_cpu(sfp->diag.volt_high_alarm);
+		sfp_hwmon_calibrate_vcc(sfp, value);
+		return 0;
+
+	case hwmon_in_lcrit_alarm:
+		err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_ALARM0_VCC_LOW);
+		return 0;
+
+	case hwmon_in_min_alarm:
+		err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_WARN0_VCC_LOW);
+		return 0;
+
+	case hwmon_in_max_alarm:
+		err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_WARN0_VCC_HIGH);
+		return 0;
+
+	case hwmon_in_crit_alarm:
+		err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_ALARM0_VCC_HIGH);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static int sfp_hwmon_bias(struct sfp *sfp, u32 attr, long *value)
+{
+	u8 status;
+	int err;
+
+	switch (attr) {
+	case hwmon_curr_input:
+		return sfp_hwmon_read_bias(sfp, SFP_TX_BIAS, value);
+
+	case hwmon_curr_lcrit:
+		*value = be16_to_cpu(sfp->diag.bias_low_alarm);
+		sfp_hwmon_calibrate_bias(sfp, value);
+		return 0;
+
+	case hwmon_curr_min:
+		*value = be16_to_cpu(sfp->diag.bias_low_warn);
+		sfp_hwmon_calibrate_bias(sfp, value);
+		return 0;
+
+	case hwmon_curr_max:
+		*value = be16_to_cpu(sfp->diag.bias_high_warn);
+		sfp_hwmon_calibrate_bias(sfp, value);
+		return 0;
+
+	case hwmon_curr_crit:
+		*value = be16_to_cpu(sfp->diag.bias_high_alarm);
+		sfp_hwmon_calibrate_bias(sfp, value);
+		return 0;
+
+	case hwmon_curr_lcrit_alarm:
+		err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_ALARM0_TX_BIAS_LOW);
+		return 0;
+
+	case hwmon_curr_min_alarm:
+		err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_WARN0_TX_BIAS_LOW);
+		return 0;
+
+	case hwmon_curr_max_alarm:
+		err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_WARN0_TX_BIAS_HIGH);
+		return 0;
+
+	case hwmon_curr_crit_alarm:
+		err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_ALARM0_TX_BIAS_HIGH);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static int sfp_hwmon_tx_power(struct sfp *sfp, u32 attr, long *value)
+{
+	u8 status;
+	int err;
+
+	switch (attr) {
+	case hwmon_power_input:
+		return sfp_hwmon_read_tx_power(sfp, SFP_TX_POWER, value);
+
+	case hwmon_power_lcrit:
+		*value = be16_to_cpu(sfp->diag.txpwr_low_alarm);
+		sfp_hwmon_calibrate_tx_power(sfp, value);
+		return 0;
+
+	case hwmon_power_min:
+		*value = be16_to_cpu(sfp->diag.txpwr_low_warn);
+		sfp_hwmon_calibrate_tx_power(sfp, value);
+		return 0;
+
+	case hwmon_power_max:
+		*value = be16_to_cpu(sfp->diag.txpwr_high_warn);
+		sfp_hwmon_calibrate_tx_power(sfp, value);
+		return 0;
+
+	case hwmon_power_crit:
+		*value = be16_to_cpu(sfp->diag.txpwr_high_alarm);
+		sfp_hwmon_calibrate_tx_power(sfp, value);
+		return 0;
+
+	case hwmon_power_lcrit_alarm:
+		err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_ALARM0_TXPWR_LOW);
+		return 0;
+
+	case hwmon_power_min_alarm:
+		err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_WARN0_TXPWR_LOW);
+		return 0;
+
+	case hwmon_power_max_alarm:
+		err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_WARN0_TXPWR_HIGH);
+		return 0;
+
+	case hwmon_power_crit_alarm:
+		err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_ALARM0_TXPWR_HIGH);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static int sfp_hwmon_rx_power(struct sfp *sfp, u32 attr, long *value)
+{
+	u8 status;
+	int err;
+
+	switch (attr) {
+	case hwmon_power_input:
+		return sfp_hwmon_read_rx_power(sfp, SFP_RX_POWER, value);
+
+	case hwmon_power_lcrit:
+		*value = be16_to_cpu(sfp->diag.rxpwr_low_alarm);
+		sfp_hwmon_to_rx_power(value);
+		return 0;
+
+	case hwmon_power_min:
+		*value = be16_to_cpu(sfp->diag.rxpwr_low_warn);
+		sfp_hwmon_to_rx_power(value);
+		return 0;
+
+	case hwmon_power_max:
+		*value = be16_to_cpu(sfp->diag.rxpwr_high_warn);
+		sfp_hwmon_to_rx_power(value);
+		return 0;
+
+	case hwmon_power_crit:
+		*value = be16_to_cpu(sfp->diag.rxpwr_high_alarm);
+		sfp_hwmon_to_rx_power(value);
+		return 0;
+
+	case hwmon_power_lcrit_alarm:
+		err = sfp_read(sfp, true, SFP_ALARM1, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_ALARM1_RXPWR_LOW);
+		return 0;
+
+	case hwmon_power_min_alarm:
+		err = sfp_read(sfp, true, SFP_WARN1, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_WARN1_RXPWR_LOW);
+		return 0;
+
+	case hwmon_power_max_alarm:
+		err = sfp_read(sfp, true, SFP_WARN1, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_WARN1_RXPWR_HIGH);
+		return 0;
+
+	case hwmon_power_crit_alarm:
+		err = sfp_read(sfp, true, SFP_ALARM1, &status, sizeof(status));
+		if (err < 0)
+			return err;
+
+		*value = !!(status & SFP_ALARM1_RXPWR_HIGH);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static int sfp_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+			  u32 attr, int channel, long *value)
+{
+	struct sfp *sfp = dev_get_drvdata(dev);
+
+	switch (type) {
+	case hwmon_temp:
+		return sfp_hwmon_temp(sfp, attr, value);
+	case hwmon_in:
+		return sfp_hwmon_vcc(sfp, attr, value);
+	case hwmon_curr:
+		return sfp_hwmon_bias(sfp, attr, value);
+	case hwmon_power:
+		switch (channel) {
+		case 0:
+			return sfp_hwmon_tx_power(sfp, attr, value);
+		case 1:
+			return sfp_hwmon_rx_power(sfp, attr, value);
+		default:
+			return -EOPNOTSUPP;
+		}
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static const struct hwmon_ops sfp_hwmon_ops = {
+	.is_visible = sfp_hwmon_is_visible,
+	.read = sfp_hwmon_read,
+};
+
+static u32 sfp_hwmon_chip_config[] = {
+	HWMON_C_REGISTER_TZ,
+	0,
+};
+
+static const struct hwmon_channel_info sfp_hwmon_chip = {
+	.type = hwmon_chip,
+	.config = sfp_hwmon_chip_config,
+};
+
+static u32 sfp_hwmon_temp_config[] = {
+	HWMON_T_INPUT |
+	HWMON_T_MAX | HWMON_T_MIN |
+	HWMON_T_MAX_ALARM | HWMON_T_MIN_ALARM |
+	HWMON_T_CRIT | HWMON_T_LCRIT |
+	HWMON_T_CRIT_ALARM | HWMON_T_LCRIT_ALARM,
+	0,
+};
+
+static const struct hwmon_channel_info sfp_hwmon_temp_channel_info = {
+	.type = hwmon_temp,
+	.config = sfp_hwmon_temp_config,
+};
+
+static u32 sfp_hwmon_vcc_config[] = {
+	HWMON_I_INPUT |
+	HWMON_I_MAX | HWMON_I_MIN |
+	HWMON_I_MAX_ALARM | HWMON_I_MIN_ALARM |
+	HWMON_I_CRIT | HWMON_I_LCRIT |
+	HWMON_I_CRIT_ALARM | HWMON_I_LCRIT_ALARM,
+	0,
+};
+
+static const struct hwmon_channel_info sfp_hwmon_vcc_channel_info = {
+	.type = hwmon_in,
+	.config = sfp_hwmon_vcc_config,
+};
+
+static u32 sfp_hwmon_bias_config[] = {
+	HWMON_C_INPUT |
+	HWMON_C_MAX | HWMON_C_MIN |
+	HWMON_C_MAX_ALARM | HWMON_C_MIN_ALARM |
+	HWMON_C_CRIT | HWMON_C_LCRIT |
+	HWMON_C_CRIT_ALARM | HWMON_C_LCRIT_ALARM,
+	0,
+};
+
+static const struct hwmon_channel_info sfp_hwmon_bias_channel_info = {
+	.type = hwmon_curr,
+	.config = sfp_hwmon_bias_config,
+};
+
+static u32 sfp_hwmon_power_config[] = {
+	/* Transmit power */
+	HWMON_P_INPUT |
+	HWMON_P_MAX | HWMON_P_MIN |
+	HWMON_P_MAX_ALARM | HWMON_P_MIN_ALARM |
+	HWMON_P_CRIT | HWMON_P_LCRIT |
+	HWMON_P_CRIT_ALARM | HWMON_P_LCRIT_ALARM,
+	/* Receive power */
+	HWMON_P_INPUT |
+	HWMON_P_MAX | HWMON_P_MIN |
+	HWMON_P_MAX_ALARM | HWMON_P_MIN_ALARM |
+	HWMON_P_CRIT | HWMON_P_LCRIT |
+	HWMON_P_CRIT_ALARM | HWMON_P_LCRIT_ALARM,
+	0,
+};
+
+static const struct hwmon_channel_info sfp_hwmon_power_channel_info = {
+	.type = hwmon_power,
+	.config = sfp_hwmon_power_config,
+};
+
+static const struct hwmon_channel_info *sfp_hwmon_info[] = {
+	&sfp_hwmon_chip,
+	&sfp_hwmon_vcc_channel_info,
+	&sfp_hwmon_temp_channel_info,
+	&sfp_hwmon_bias_channel_info,
+	&sfp_hwmon_power_channel_info,
+	NULL,
+};
+
+static const struct hwmon_chip_info sfp_hwmon_chip_info = {
+	.ops = &sfp_hwmon_ops,
+	.info = sfp_hwmon_info,
+};
+
+static int sfp_hwmon_insert(struct sfp *sfp)
+{
+	int err, i;
+
+	if (sfp->id.ext.sff8472_compliance == SFP_SFF8472_COMPLIANCE_NONE)
+		return 0;
+
+	if (!(sfp->id.ext.diagmon & SFP_DIAGMON_DDM))
+		return 0;
+
+	if (sfp->id.ext.diagmon & SFP_DIAGMON_ADDRMODE)
+		/* This driver in general does not support address
+		 * change.
+		 */
+		return 0;
+
+	err = sfp_read(sfp, true, 0, &sfp->diag, sizeof(sfp->diag));
+	if (err < 0)
+		return err;
+
+	sfp->hwmon_name = kstrdup(dev_name(sfp->dev), GFP_KERNEL);
+	if (!sfp->hwmon_name)
+		return -ENODEV;
+
+	for (i = 0; sfp->hwmon_name[i]; i++)
+		if (hwmon_is_bad_char(sfp->hwmon_name[i]))
+			sfp->hwmon_name[i] = '_';
+
+	sfp->hwmon_dev = hwmon_device_register_with_info(sfp->dev,
+							 sfp->hwmon_name, sfp,
+							 &sfp_hwmon_chip_info,
+							 NULL);
+
+	return PTR_ERR_OR_ZERO(sfp->hwmon_dev);
+}
+
+static void sfp_hwmon_remove(struct sfp *sfp)
+{
+	hwmon_device_unregister(sfp->hwmon_dev);
+	kfree(sfp->hwmon_name);
+}
+#else
+static int sfp_hwmon_insert(struct sfp *sfp)
+{
+	return 0;
+}
+
+static void sfp_hwmon_remove(struct sfp *sfp)
+{
+}
+#endif
+
 /* Helpers */
 static void sfp_module_tx_disable(struct sfp *sfp)
 {
@@ -636,6 +1357,10 @@ static int sfp_sm_mod_probe(struct sfp *sfp)
 		dev_warn(sfp->dev,
 			 "module address swap to access page 0xA2 is not supported.\n");
 
+	ret = sfp_hwmon_insert(sfp);
+	if (ret < 0)
+		return ret;
+
 	ret = sfp_module_insert(sfp->sfp_bus, &sfp->id);
 	if (ret < 0)
 		return ret;
@@ -647,6 +1372,8 @@ static void sfp_sm_mod_remove(struct sfp *sfp)
 {
 	sfp_module_remove(sfp->sfp_bus);
 
+	sfp_hwmon_remove(sfp);
+
 	if (sfp->mod_phy)
 		sfp_sm_phy_detach(sfp);
 
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index ebce9e24906a..d37518e89db2 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -231,6 +231,50 @@ struct sfp_eeprom_id {
 	struct sfp_eeprom_ext ext;
 } __packed;
 
+struct sfp_diag {
+	__be16 temp_high_alarm;
+	__be16 temp_low_alarm;
+	__be16 temp_high_warn;
+	__be16 temp_low_warn;
+	__be16 volt_high_alarm;
+	__be16 volt_low_alarm;
+	__be16 volt_high_warn;
+	__be16 volt_low_warn;
+	__be16 bias_high_alarm;
+	__be16 bias_low_alarm;
+	__be16 bias_high_warn;
+	__be16 bias_low_warn;
+	__be16 txpwr_high_alarm;
+	__be16 txpwr_low_alarm;
+	__be16 txpwr_high_warn;
+	__be16 txpwr_low_warn;
+	__be16 rxpwr_high_alarm;
+	__be16 rxpwr_low_alarm;
+	__be16 rxpwr_high_warn;
+	__be16 rxpwr_low_warn;
+	__be16 laser_temp_high_alarm;
+	__be16 laser_temp_low_alarm;
+	__be16 laser_temp_high_warn;
+	__be16 laser_temp_low_warn;
+	__be16 tec_cur_high_alarm;
+	__be16 tec_cur_low_alarm;
+	__be16 tec_cur_high_warn;
+	__be16 tec_cur_low_warn;
+	__be32 cal_rxpwr4;
+	__be32 cal_rxpwr3;
+	__be32 cal_rxpwr2;
+	__be32 cal_rxpwr1;
+	__be32 cal_rxpwr0;
+	__be16 cal_txi_slope;
+	__be16 cal_txi_offset;
+	__be16 cal_txpwr_slope;
+	__be16 cal_txpwr_offset;
+	__be16 cal_t_slope;
+	__be16 cal_t_offset;
+	__be16 cal_v_slope;
+	__be16 cal_v_offset;
+} __packed;
+
 /* SFP EEPROM registers */
 enum {
 	SFP_PHYS_ID			= 0x00,
@@ -384,7 +428,33 @@ enum {
 	SFP_TEC_CUR			= 0x6c,
 
 	SFP_STATUS			= 0x6e,
-	SFP_ALARM			= 0x70,
+	SFP_ALARM0			= 0x70,
+	SFP_ALARM0_TEMP_HIGH		= BIT(7),
+	SFP_ALARM0_TEMP_LOW		= BIT(6),
+	SFP_ALARM0_VCC_HIGH		= BIT(5),
+	SFP_ALARM0_VCC_LOW		= BIT(4),
+	SFP_ALARM0_TX_BIAS_HIGH		= BIT(3),
+	SFP_ALARM0_TX_BIAS_LOW		= BIT(2),
+	SFP_ALARM0_TXPWR_HIGH		= BIT(1),
+	SFP_ALARM0_TXPWR_LOW		= BIT(0),
+
+	SFP_ALARM1			= 0x71,
+	SFP_ALARM1_RXPWR_HIGH		= BIT(7),
+	SFP_ALARM1_RXPWR_LOW		= BIT(6),
+
+	SFP_WARN0			= 0x74,
+	SFP_WARN0_TEMP_HIGH		= BIT(7),
+	SFP_WARN0_TEMP_LOW		= BIT(6),
+	SFP_WARN0_VCC_HIGH		= BIT(5),
+	SFP_WARN0_VCC_LOW		= BIT(4),
+	SFP_WARN0_TX_BIAS_HIGH		= BIT(3),
+	SFP_WARN0_TX_BIAS_LOW		= BIT(2),
+	SFP_WARN0_TXPWR_HIGH		= BIT(1),
+	SFP_WARN0_TXPWR_LOW		= BIT(0),
+
+	SFP_WARN1			= 0x75,
+	SFP_WARN1_RXPWR_HIGH		= BIT(7),
+	SFP_WARN1_RXPWR_LOW		= BIT(6),
 
 	SFP_EXT_STATUS			= 0x76,
 	SFP_VSL				= 0x78,
-- 
cgit v1.2.3


From 753d433b586d1d43c487e3d660f5778c7c8d58ea Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding" <me@tobin.cc>
Date: Fri, 22 Jun 2018 09:15:32 +1000
Subject: random: Return nbytes filled from hw RNG

Currently the function get_random_bytes_arch() has return value 'void'.
If the hw RNG fails we currently fall back to using get_random_bytes().
This defeats the purpose of requesting random material from the hw RNG
in the first place.

There are currently no intree users of get_random_bytes_arch().

Only get random bytes from the hw RNG, make function return the number
of bytes retrieved from the hw RNG.

Acked-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Tobin C. Harding <me@tobin.cc>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 drivers/char/random.c  | 16 +++++++++-------
 include/linux/random.h |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 8e53e9515a1d..34ddfd57419b 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1725,26 +1725,28 @@ EXPORT_SYMBOL(del_random_ready_callback);
  * key known by the NSA).  So it's useful if we need the speed, but
  * only if we're willing to trust the hardware manufacturer not to
  * have put in a back door.
+ *
+ * Return number of bytes filled in.
  */
-void get_random_bytes_arch(void *buf, int nbytes)
+int __must_check get_random_bytes_arch(void *buf, int nbytes)
 {
+	int left = nbytes;
 	char *p = buf;
 
-	trace_get_random_bytes_arch(nbytes, _RET_IP_);
-	while (nbytes) {
+	trace_get_random_bytes_arch(left, _RET_IP_);
+	while (left) {
 		unsigned long v;
-		int chunk = min(nbytes, (int)sizeof(unsigned long));
+		int chunk = min_t(int, left, sizeof(unsigned long));
 
 		if (!arch_get_random_long(&v))
 			break;
 
 		memcpy(p, &v, chunk);
 		p += chunk;
-		nbytes -= chunk;
+		left -= chunk;
 	}
 
-	if (nbytes)
-		get_random_bytes(p, nbytes);
+	return nbytes - left;
 }
 EXPORT_SYMBOL(get_random_bytes_arch);
 
diff --git a/include/linux/random.h b/include/linux/random.h
index 2ddf13b4281e..f1c9bc5cd231 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -38,7 +38,7 @@ extern void get_random_bytes(void *buf, int nbytes);
 extern int wait_for_random_bytes(void);
 extern int add_random_ready_callback(struct random_ready_callback *rdy);
 extern void del_random_ready_callback(struct random_ready_callback *rdy);
-extern void get_random_bytes_arch(void *buf, int nbytes);
+extern int __must_check get_random_bytes_arch(void *buf, int nbytes);
 
 #ifndef MODULE
 extern const struct file_operations random_fops, urandom_fops;
-- 
cgit v1.2.3


From 181ace9e1b2e1c5c2be590bc9603baa65f3a16d8 Mon Sep 17 00:00:00 2001
From: Abhishek Sahu <absahu@codeaurora.org>
Date: Wed, 20 Jun 2018 12:57:28 +0530
Subject: mtd: rawnand: helper function for setting up ECC configuration

commit 2c8f8afa7f92 ("mtd: nand: add generic helpers to check,
match, maximize ECC settings") provides generic helpers which
drivers can use for setting up ECC parameters.

Since same board can have different ECC strength nand chips so
following is the logic for setting up ECC strength and ECC step
size, which can be used by most of the drivers.

1. If both ECC step size and ECC strength are already set
   (usually by DT) then just check whether this setting
   is supported by NAND controller.
2. If NAND_ECC_MAXIMIZE is set, then select maximum ECC strength
   supported by NAND controller.
3. Otherwise, try to match the ECC step size and ECC strength closest
   to the chip's requirement. If available OOB size can't fit the chip
   requirement then select maximum ECC strength which can be fit with
   available OOB size.

This patch introduces nand_ecc_choose_conf function which calls the
required helper functions for the above logic. The drivers can use
this single function instead of calling the 3 helper functions
individually.

Signed-off-by: Abhishek Sahu <absahu@codeaurora.org>
Reviewed-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c | 33 +++++++++++++++++++++++++++++++++
 include/linux/mtd/rawnand.h      |  3 +++
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index dbf6e80e9ab5..2858d0b09c1d 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -6295,6 +6295,39 @@ int nand_maximize_ecc(struct nand_chip *chip,
 }
 EXPORT_SYMBOL_GPL(nand_maximize_ecc);
 
+/**
+ * nand_ecc_choose_conf - Set the ECC strength and ECC step size
+ * @chip: nand chip info structure
+ * @caps: ECC engine caps info structure
+ * @oobavail: OOB size that the ECC engine can use
+ *
+ * Choose the ECC configuration according to following logic
+ *
+ * 1. If both ECC step size and ECC strength are already set (usually by DT)
+ *    then check if it is supported by this controller.
+ * 2. If NAND_ECC_MAXIMIZE is set, then select maximum ECC strength.
+ * 3. Otherwise, try to match the ECC step size and ECC strength closest
+ *    to the chip's requirement. If available OOB size can't fit the chip
+ *    requirement then fallback to the maximum ECC step size and ECC strength.
+ *
+ * On success, the chosen ECC settings are set.
+ */
+int nand_ecc_choose_conf(struct nand_chip *chip,
+			 const struct nand_ecc_caps *caps, int oobavail)
+{
+	if (chip->ecc.size && chip->ecc.strength)
+		return nand_check_ecc_caps(chip, caps, oobavail);
+
+	if (chip->ecc.options & NAND_ECC_MAXIMIZE)
+		return nand_maximize_ecc(chip, caps, oobavail);
+
+	if (!nand_match_ecc_req(chip, caps, oobavail))
+		return 0;
+
+	return nand_maximize_ecc(chip, caps, oobavail);
+}
+EXPORT_SYMBOL_GPL(nand_ecc_choose_conf);
+
 /*
  * Check if the chip configuration meet the datasheet requirements.
 
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index fbac4741da52..2ce305388471 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1672,6 +1672,9 @@ int nand_match_ecc_req(struct nand_chip *chip,
 int nand_maximize_ecc(struct nand_chip *chip,
 		      const struct nand_ecc_caps *caps, int oobavail);
 
+int nand_ecc_choose_conf(struct nand_chip *chip,
+			 const struct nand_ecc_caps *caps, int oobavail);
+
 /* Default write_oob implementation */
 int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page);
 
-- 
cgit v1.2.3


From 0cf5c7dbaa392d11cfae532cedeaad0ac48b2165 Mon Sep 17 00:00:00 2001
From: Abhishek Sahu <absahu@codeaurora.org>
Date: Wed, 20 Jun 2018 12:57:42 +0530
Subject: mtd: rawnand: provide only single helper function for ECC conf

Function nand_ecc_choose_conf() will be help for all the cases, so
other helper functions can be made static.

nand_check_ecc_caps(): Invoke nand_ecc_choose_conf() with
                       both chip->ecc.size and chip->ecc.strength
                       value set.

nand_maximize_ecc(): Invoke nand_ecc_choose_conf() with
                     NAND_ECC_MAXIMIZE flag.

nand_match_ecc_req(): Invoke nand_ecc_choose_conf() with either
                      chip->ecc.size or chip->ecc.strength value
                      set and without NAND_ECC_MAXIMIZE flag.

CC: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Abhishek Sahu <absahu@codeaurora.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c | 39 +++++++++++++++------------------------
 include/linux/mtd/rawnand.h      |  9 ---------
 2 files changed, 15 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 2858d0b09c1d..faac82b1e058 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -6085,24 +6085,17 @@ static int nand_set_ecc_soft_ops(struct mtd_info *mtd)
  * by the controller and the calculated ECC bytes fit within the chip's OOB.
  * On success, the calculated ECC bytes is set.
  */
-int nand_check_ecc_caps(struct nand_chip *chip,
-			const struct nand_ecc_caps *caps, int oobavail)
+static int
+nand_check_ecc_caps(struct nand_chip *chip,
+		    const struct nand_ecc_caps *caps, int oobavail)
 {
 	struct mtd_info *mtd = nand_to_mtd(chip);
 	const struct nand_ecc_step_info *stepinfo;
 	int preset_step = chip->ecc.size;
 	int preset_strength = chip->ecc.strength;
-	int nsteps, ecc_bytes;
+	int ecc_bytes, nsteps = mtd->writesize / preset_step;
 	int i, j;
 
-	if (WARN_ON(oobavail < 0))
-		return -EINVAL;
-
-	if (!preset_step || !preset_strength)
-		return -ENODATA;
-
-	nsteps = mtd->writesize / preset_step;
-
 	for (i = 0; i < caps->nstepinfos; i++) {
 		stepinfo = &caps->stepinfos[i];
 
@@ -6135,7 +6128,6 @@ int nand_check_ecc_caps(struct nand_chip *chip,
 
 	return -ENOTSUPP;
 }
-EXPORT_SYMBOL_GPL(nand_check_ecc_caps);
 
 /**
  * nand_match_ecc_req - meet the chip's requirement with least ECC bytes
@@ -6147,8 +6139,9 @@ EXPORT_SYMBOL_GPL(nand_check_ecc_caps);
  * number of ECC bytes (i.e. with the largest number of OOB-free bytes).
  * On success, the chosen ECC settings are set.
  */
-int nand_match_ecc_req(struct nand_chip *chip,
-		       const struct nand_ecc_caps *caps, int oobavail)
+static int
+nand_match_ecc_req(struct nand_chip *chip,
+		   const struct nand_ecc_caps *caps, int oobavail)
 {
 	struct mtd_info *mtd = nand_to_mtd(chip);
 	const struct nand_ecc_step_info *stepinfo;
@@ -6159,9 +6152,6 @@ int nand_match_ecc_req(struct nand_chip *chip,
 	int best_ecc_bytes_total = INT_MAX;
 	int i, j;
 
-	if (WARN_ON(oobavail < 0))
-		return -EINVAL;
-
 	/* No information provided by the NAND chip */
 	if (!req_step || !req_strength)
 		return -ENOTSUPP;
@@ -6220,7 +6210,6 @@ int nand_match_ecc_req(struct nand_chip *chip,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(nand_match_ecc_req);
 
 /**
  * nand_maximize_ecc - choose the max ECC strength available
@@ -6231,8 +6220,9 @@ EXPORT_SYMBOL_GPL(nand_match_ecc_req);
  * Choose the max ECC strength that is supported on the controller, and can fit
  * within the chip's OOB.  On success, the chosen ECC settings are set.
  */
-int nand_maximize_ecc(struct nand_chip *chip,
-		      const struct nand_ecc_caps *caps, int oobavail)
+static int
+nand_maximize_ecc(struct nand_chip *chip,
+		  const struct nand_ecc_caps *caps, int oobavail)
 {
 	struct mtd_info *mtd = nand_to_mtd(chip);
 	const struct nand_ecc_step_info *stepinfo;
@@ -6242,9 +6232,6 @@ int nand_maximize_ecc(struct nand_chip *chip,
 	int best_strength, best_ecc_bytes;
 	int i, j;
 
-	if (WARN_ON(oobavail < 0))
-		return -EINVAL;
-
 	for (i = 0; i < caps->nstepinfos; i++) {
 		stepinfo = &caps->stepinfos[i];
 		step_size = stepinfo->stepsize;
@@ -6293,7 +6280,6 @@ int nand_maximize_ecc(struct nand_chip *chip,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(nand_maximize_ecc);
 
 /**
  * nand_ecc_choose_conf - Set the ECC strength and ECC step size
@@ -6315,6 +6301,11 @@ EXPORT_SYMBOL_GPL(nand_maximize_ecc);
 int nand_ecc_choose_conf(struct nand_chip *chip,
 			 const struct nand_ecc_caps *caps, int oobavail)
 {
+	struct mtd_info *mtd = nand_to_mtd(chip);
+
+	if (WARN_ON(oobavail < 0 || oobavail > mtd->oobsize))
+		return -EINVAL;
+
 	if (chip->ecc.size && chip->ecc.strength)
 		return nand_check_ecc_caps(chip, caps, oobavail);
 
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 2ce305388471..0c6fb316b409 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1663,15 +1663,6 @@ int nand_check_erased_ecc_chunk(void *data, int datalen,
 				void *extraoob, int extraooblen,
 				int threshold);
 
-int nand_check_ecc_caps(struct nand_chip *chip,
-			const struct nand_ecc_caps *caps, int oobavail);
-
-int nand_match_ecc_req(struct nand_chip *chip,
-		       const struct nand_ecc_caps *caps,  int oobavail);
-
-int nand_maximize_ecc(struct nand_chip *chip,
-		      const struct nand_ecc_caps *caps, int oobavail);
-
 int nand_ecc_choose_conf(struct nand_chip *chip,
 			 const struct nand_ecc_caps *caps, int oobavail);
 
-- 
cgit v1.2.3


From 7529df4652482c33ae1a99ee8189401146f13cb7 Mon Sep 17 00:00:00 2001
From: Peter Pan <peterpandong@micron.com>
Date: Fri, 22 Jun 2018 14:28:23 +0200
Subject: mtd: nand: Add core infrastructure to support SPI NANDs

Add a SPI NAND framework based on the generic NAND framework and the
spi-mem infrastructure.

In its current state, this framework supports the following features:

- single/dual/quad IO modes
- on-die ECC

Signed-off-by: Peter Pan <peterpandong@micron.com>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/Kconfig      |    1 +
 drivers/mtd/nand/Makefile     |    1 +
 drivers/mtd/nand/spi/Kconfig  |    7 +
 drivers/mtd/nand/spi/Makefile |    3 +
 drivers/mtd/nand/spi/core.c   | 1136 +++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/spinand.h   |  416 +++++++++++++++
 include/linux/spi/spi-mem.h   |    4 +-
 7 files changed, 1567 insertions(+), 1 deletion(-)
 create mode 100644 drivers/mtd/nand/spi/Kconfig
 create mode 100644 drivers/mtd/nand/spi/Makefile
 create mode 100644 drivers/mtd/nand/spi/core.c
 create mode 100644 include/linux/mtd/spinand.h

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 88c7d3b4ff8b..9033215e62ea 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -4,3 +4,4 @@ config MTD_NAND_CORE
 source "drivers/mtd/nand/onenand/Kconfig"
 
 source "drivers/mtd/nand/raw/Kconfig"
+source "drivers/mtd/nand/spi/Kconfig"
diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile
index 3f0cb87f1a57..7ecd80c0a66e 100644
--- a/drivers/mtd/nand/Makefile
+++ b/drivers/mtd/nand/Makefile
@@ -5,3 +5,4 @@ obj-$(CONFIG_MTD_NAND_CORE) += nandcore.o
 
 obj-y	+= onenand/
 obj-y	+= raw/
+obj-y	+= spi/
diff --git a/drivers/mtd/nand/spi/Kconfig b/drivers/mtd/nand/spi/Kconfig
new file mode 100644
index 000000000000..7c37d2929b68
--- /dev/null
+++ b/drivers/mtd/nand/spi/Kconfig
@@ -0,0 +1,7 @@
+menuconfig MTD_SPI_NAND
+	tristate "SPI NAND device Support"
+	select MTD_NAND_CORE
+	depends on SPI_MASTER
+	select SPI_MEM
+	help
+	  This is the framework for the SPI NAND device drivers.
diff --git a/drivers/mtd/nand/spi/Makefile b/drivers/mtd/nand/spi/Makefile
new file mode 100644
index 000000000000..2c473b765027
--- /dev/null
+++ b/drivers/mtd/nand/spi/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+spinand-objs := core.o
+obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
new file mode 100644
index 000000000000..4efbeb0d85b4
--- /dev/null
+++ b/drivers/mtd/nand/spi/core.c
@@ -0,0 +1,1136 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2016-2017 Micron Technology, Inc.
+ *
+ * Authors:
+ *	Peter Pan <peterpandong@micron.com>
+ *	Boris Brezillon <boris.brezillon@bootlin.com>
+ */
+
+#define pr_fmt(fmt)	"spi-nand: " fmt
+
+#include <linux/device.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mtd/spinand.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/spi/spi.h>
+#include <linux/spi/spi-mem.h>
+
+static void spinand_cache_op_adjust_colum(struct spinand_device *spinand,
+					  const struct nand_page_io_req *req,
+					  u16 *column)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+	unsigned int shift;
+
+	if (nand->memorg.planes_per_lun < 2)
+		return;
+
+	/* The plane number is passed in MSB just above the column address */
+	shift = fls(nand->memorg.pagesize);
+	*column |= req->pos.plane << shift;
+}
+
+static int spinand_read_reg_op(struct spinand_device *spinand, u8 reg, u8 *val)
+{
+	struct spi_mem_op op = SPINAND_GET_FEATURE_OP(reg,
+						      spinand->scratchbuf);
+	int ret;
+
+	ret = spi_mem_exec_op(spinand->spimem, &op);
+	if (ret)
+		return ret;
+
+	*val = *spinand->scratchbuf;
+	return 0;
+}
+
+static int spinand_write_reg_op(struct spinand_device *spinand, u8 reg, u8 val)
+{
+	struct spi_mem_op op = SPINAND_SET_FEATURE_OP(reg,
+						      spinand->scratchbuf);
+
+	*spinand->scratchbuf = val;
+	return spi_mem_exec_op(spinand->spimem, &op);
+}
+
+static int spinand_read_status(struct spinand_device *spinand, u8 *status)
+{
+	return spinand_read_reg_op(spinand, REG_STATUS, status);
+}
+
+static int spinand_get_cfg(struct spinand_device *spinand, u8 *cfg)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+
+	if (WARN_ON(spinand->cur_target < 0 ||
+		    spinand->cur_target >= nand->memorg.ntargets))
+		return -EINVAL;
+
+	*cfg = spinand->cfg_cache[spinand->cur_target];
+	return 0;
+}
+
+static int spinand_set_cfg(struct spinand_device *spinand, u8 cfg)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+	int ret;
+
+	if (WARN_ON(spinand->cur_target < 0 ||
+		    spinand->cur_target >= nand->memorg.ntargets))
+		return -EINVAL;
+
+	if (spinand->cfg_cache[spinand->cur_target] == cfg)
+		return 0;
+
+	ret = spinand_write_reg_op(spinand, REG_CFG, cfg);
+	if (ret)
+		return ret;
+
+	spinand->cfg_cache[spinand->cur_target] = cfg;
+	return 0;
+}
+
+/**
+ * spinand_upd_cfg() - Update the configuration register
+ * @spinand: the spinand device
+ * @mask: the mask encoding the bits to update in the config reg
+ * @val: the new value to apply
+ *
+ * Update the configuration register.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int spinand_upd_cfg(struct spinand_device *spinand, u8 mask, u8 val)
+{
+	int ret;
+	u8 cfg;
+
+	ret = spinand_get_cfg(spinand, &cfg);
+	if (ret)
+		return ret;
+
+	cfg &= ~mask;
+	cfg |= val;
+
+	return spinand_set_cfg(spinand, cfg);
+}
+
+/**
+ * spinand_select_target() - Select a specific NAND target/die
+ * @spinand: the spinand device
+ * @target: the target/die to select
+ *
+ * Select a new target/die. If chip only has one die, this function is a NOOP.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int spinand_select_target(struct spinand_device *spinand, unsigned int target)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+	int ret;
+
+	if (WARN_ON(target >= nand->memorg.ntargets))
+		return -EINVAL;
+
+	if (spinand->cur_target == target)
+		return 0;
+
+	if (nand->memorg.ntargets == 1) {
+		spinand->cur_target = target;
+		return 0;
+	}
+
+	ret = spinand->select_target(spinand, target);
+	if (ret)
+		return ret;
+
+	spinand->cur_target = target;
+	return 0;
+}
+
+static int spinand_init_cfg_cache(struct spinand_device *spinand)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+	struct device *dev = &spinand->spimem->spi->dev;
+	unsigned int target;
+	int ret;
+
+	spinand->cfg_cache = devm_kcalloc(dev,
+					  nand->memorg.ntargets,
+					  sizeof(*spinand->cfg_cache),
+					  GFP_KERNEL);
+	if (!spinand->cfg_cache)
+		return -ENOMEM;
+
+	for (target = 0; target < nand->memorg.ntargets; target++) {
+		ret = spinand_select_target(spinand, target);
+		if (ret)
+			return ret;
+
+		/*
+		 * We use spinand_read_reg_op() instead of spinand_get_cfg()
+		 * here to bypass the config cache.
+		 */
+		ret = spinand_read_reg_op(spinand, REG_CFG,
+					  &spinand->cfg_cache[target]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int spinand_init_quad_enable(struct spinand_device *spinand)
+{
+	bool enable = false;
+
+	if (!(spinand->flags & SPINAND_HAS_QE_BIT))
+		return 0;
+
+	if (spinand->op_templates.read_cache->data.buswidth == 4 ||
+	    spinand->op_templates.write_cache->data.buswidth == 4 ||
+	    spinand->op_templates.update_cache->data.buswidth == 4)
+		enable = true;
+
+	return spinand_upd_cfg(spinand, CFG_QUAD_ENABLE,
+			       enable ? CFG_QUAD_ENABLE : 0);
+}
+
+static int spinand_ecc_enable(struct spinand_device *spinand,
+			      bool enable)
+{
+	return spinand_upd_cfg(spinand, CFG_ECC_ENABLE,
+			       enable ? CFG_ECC_ENABLE : 0);
+}
+
+static int spinand_write_enable_op(struct spinand_device *spinand)
+{
+	struct spi_mem_op op = SPINAND_WR_EN_DIS_OP(true);
+
+	return spi_mem_exec_op(spinand->spimem, &op);
+}
+
+static int spinand_load_page_op(struct spinand_device *spinand,
+				const struct nand_page_io_req *req)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+	unsigned int row = nanddev_pos_to_row(nand, &req->pos);
+	struct spi_mem_op op = SPINAND_PAGE_READ_OP(row);
+
+	return spi_mem_exec_op(spinand->spimem, &op);
+}
+
+static int spinand_read_from_cache_op(struct spinand_device *spinand,
+				      const struct nand_page_io_req *req)
+{
+	struct spi_mem_op op = *spinand->op_templates.read_cache;
+	struct nand_device *nand = spinand_to_nand(spinand);
+	struct mtd_info *mtd = nanddev_to_mtd(nand);
+	struct nand_page_io_req adjreq = *req;
+	unsigned int nbytes = 0;
+	void *buf = NULL;
+	u16 column = 0;
+	int ret;
+
+	if (req->datalen) {
+		adjreq.datalen = nanddev_page_size(nand);
+		adjreq.dataoffs = 0;
+		adjreq.databuf.in = spinand->databuf;
+		buf = spinand->databuf;
+		nbytes = adjreq.datalen;
+	}
+
+	if (req->ooblen) {
+		adjreq.ooblen = nanddev_per_page_oobsize(nand);
+		adjreq.ooboffs = 0;
+		adjreq.oobbuf.in = spinand->oobbuf;
+		nbytes += nanddev_per_page_oobsize(nand);
+		if (!buf) {
+			buf = spinand->oobbuf;
+			column = nanddev_page_size(nand);
+		}
+	}
+
+	spinand_cache_op_adjust_colum(spinand, &adjreq, &column);
+	op.addr.val = column;
+
+	/*
+	 * Some controllers are limited in term of max RX data size. In this
+	 * case, just repeat the READ_CACHE operation after updating the
+	 * column.
+	 */
+	while (nbytes) {
+		op.data.buf.in = buf;
+		op.data.nbytes = nbytes;
+		ret = spi_mem_adjust_op_size(spinand->spimem, &op);
+		if (ret)
+			return ret;
+
+		ret = spi_mem_exec_op(spinand->spimem, &op);
+		if (ret)
+			return ret;
+
+		buf += op.data.nbytes;
+		nbytes -= op.data.nbytes;
+		op.addr.val += op.data.nbytes;
+	}
+
+	if (req->datalen)
+		memcpy(req->databuf.in, spinand->databuf + req->dataoffs,
+		       req->datalen);
+
+	if (req->ooblen) {
+		if (req->mode == MTD_OPS_AUTO_OOB)
+			mtd_ooblayout_get_databytes(mtd, req->oobbuf.in,
+						    spinand->oobbuf,
+						    req->ooboffs,
+						    req->ooblen);
+		else
+			memcpy(req->oobbuf.in, spinand->oobbuf + req->ooboffs,
+			       req->ooblen);
+	}
+
+	return 0;
+}
+
+static int spinand_write_to_cache_op(struct spinand_device *spinand,
+				     const struct nand_page_io_req *req)
+{
+	struct spi_mem_op op = *spinand->op_templates.write_cache;
+	struct nand_device *nand = spinand_to_nand(spinand);
+	struct mtd_info *mtd = nanddev_to_mtd(nand);
+	struct nand_page_io_req adjreq = *req;
+	unsigned int nbytes = 0;
+	void *buf = NULL;
+	u16 column = 0;
+	int ret;
+
+	memset(spinand->databuf, 0xff,
+	       nanddev_page_size(nand) +
+	       nanddev_per_page_oobsize(nand));
+
+	if (req->datalen) {
+		memcpy(spinand->databuf + req->dataoffs, req->databuf.out,
+		       req->datalen);
+		adjreq.dataoffs = 0;
+		adjreq.datalen = nanddev_page_size(nand);
+		adjreq.databuf.out = spinand->databuf;
+		nbytes = adjreq.datalen;
+		buf = spinand->databuf;
+	}
+
+	if (req->ooblen) {
+		if (req->mode == MTD_OPS_AUTO_OOB)
+			mtd_ooblayout_set_databytes(mtd, req->oobbuf.out,
+						    spinand->oobbuf,
+						    req->ooboffs,
+						    req->ooblen);
+		else
+			memcpy(spinand->oobbuf + req->ooboffs, req->oobbuf.out,
+			       req->ooblen);
+
+		adjreq.ooblen = nanddev_per_page_oobsize(nand);
+		adjreq.ooboffs = 0;
+		nbytes += nanddev_per_page_oobsize(nand);
+		if (!buf) {
+			buf = spinand->oobbuf;
+			column = nanddev_page_size(nand);
+		}
+	}
+
+	spinand_cache_op_adjust_colum(spinand, &adjreq, &column);
+
+	op = *spinand->op_templates.write_cache;
+	op.addr.val = column;
+
+	/*
+	 * Some controllers are limited in term of max TX data size. In this
+	 * case, split the operation into one LOAD CACHE and one or more
+	 * LOAD RANDOM CACHE.
+	 */
+	while (nbytes) {
+		op.data.buf.out = buf;
+		op.data.nbytes = nbytes;
+
+		ret = spi_mem_adjust_op_size(spinand->spimem, &op);
+		if (ret)
+			return ret;
+
+		ret = spi_mem_exec_op(spinand->spimem, &op);
+		if (ret)
+			return ret;
+
+		buf += op.data.nbytes;
+		nbytes -= op.data.nbytes;
+		op.addr.val += op.data.nbytes;
+
+		/*
+		 * We need to use the RANDOM LOAD CACHE operation if there's
+		 * more than one iteration, because the LOAD operation resets
+		 * the cache to 0xff.
+		 */
+		if (nbytes) {
+			column = op.addr.val;
+			op = *spinand->op_templates.update_cache;
+			op.addr.val = column;
+		}
+	}
+
+	return 0;
+}
+
+static int spinand_program_op(struct spinand_device *spinand,
+			      const struct nand_page_io_req *req)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+	unsigned int row = nanddev_pos_to_row(nand, &req->pos);
+	struct spi_mem_op op = SPINAND_PROG_EXEC_OP(row);
+
+	return spi_mem_exec_op(spinand->spimem, &op);
+}
+
+static int spinand_erase_op(struct spinand_device *spinand,
+			    const struct nand_pos *pos)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+	unsigned int row = nanddev_pos_to_row(nand, pos);
+	struct spi_mem_op op = SPINAND_BLK_ERASE_OP(row);
+
+	return spi_mem_exec_op(spinand->spimem, &op);
+}
+
+static int spinand_wait(struct spinand_device *spinand, u8 *s)
+{
+	unsigned long timeo =  jiffies + msecs_to_jiffies(400);
+	u8 status;
+	int ret;
+
+	do {
+		ret = spinand_read_status(spinand, &status);
+		if (ret)
+			return ret;
+
+		if (!(status & STATUS_BUSY))
+			goto out;
+	} while (time_before(jiffies, timeo));
+
+	/*
+	 * Extra read, just in case the STATUS_READY bit has changed
+	 * since our last check
+	 */
+	ret = spinand_read_status(spinand, &status);
+	if (ret)
+		return ret;
+
+out:
+	if (s)
+		*s = status;
+
+	return status & STATUS_BUSY ? -ETIMEDOUT : 0;
+}
+
+static int spinand_read_id_op(struct spinand_device *spinand, u8 *buf)
+{
+	struct spi_mem_op op = SPINAND_READID_OP(0, spinand->scratchbuf,
+						 SPINAND_MAX_ID_LEN);
+	int ret;
+
+	ret = spi_mem_exec_op(spinand->spimem, &op);
+	if (!ret)
+		memcpy(buf, spinand->scratchbuf, SPINAND_MAX_ID_LEN);
+
+	return ret;
+}
+
+static int spinand_reset_op(struct spinand_device *spinand)
+{
+	struct spi_mem_op op = SPINAND_RESET_OP;
+	int ret;
+
+	ret = spi_mem_exec_op(spinand->spimem, &op);
+	if (ret)
+		return ret;
+
+	return spinand_wait(spinand, NULL);
+}
+
+static int spinand_lock_block(struct spinand_device *spinand, u8 lock)
+{
+	return spinand_write_reg_op(spinand, REG_BLOCK_LOCK, lock);
+}
+
+static int spinand_check_ecc_status(struct spinand_device *spinand, u8 status)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+
+	if (spinand->eccinfo.get_status)
+		return spinand->eccinfo.get_status(spinand, status);
+
+	switch (status & STATUS_ECC_MASK) {
+	case STATUS_ECC_NO_BITFLIPS:
+		return 0;
+
+	case STATUS_ECC_HAS_BITFLIPS:
+		/*
+		 * We have no way to know exactly how many bitflips have been
+		 * fixed, so let's return the maximum possible value so that
+		 * wear-leveling layers move the data immediately.
+		 */
+		return nand->eccreq.strength;
+
+	case STATUS_ECC_UNCOR_ERROR:
+		return -EBADMSG;
+
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+static int spinand_read_page(struct spinand_device *spinand,
+			     const struct nand_page_io_req *req,
+			     bool ecc_enabled)
+{
+	u8 status;
+	int ret;
+
+	ret = spinand_load_page_op(spinand, req);
+	if (ret)
+		return ret;
+
+	ret = spinand_wait(spinand, &status);
+	if (ret < 0)
+		return ret;
+
+	ret = spinand_read_from_cache_op(spinand, req);
+	if (ret)
+		return ret;
+
+	if (!ecc_enabled)
+		return 0;
+
+	return spinand_check_ecc_status(spinand, status);
+}
+
+static int spinand_write_page(struct spinand_device *spinand,
+			      const struct nand_page_io_req *req)
+{
+	u8 status;
+	int ret;
+
+	ret = spinand_write_enable_op(spinand);
+	if (ret)
+		return ret;
+
+	ret = spinand_write_to_cache_op(spinand, req);
+	if (ret)
+		return ret;
+
+	ret = spinand_program_op(spinand, req);
+	if (ret)
+		return ret;
+
+	ret = spinand_wait(spinand, &status);
+	if (!ret && (status & STATUS_PROG_FAILED))
+		ret = -EIO;
+
+	return ret;
+}
+
+static int spinand_mtd_read(struct mtd_info *mtd, loff_t from,
+			    struct mtd_oob_ops *ops)
+{
+	struct spinand_device *spinand = mtd_to_spinand(mtd);
+	struct nand_device *nand = mtd_to_nanddev(mtd);
+	unsigned int max_bitflips = 0;
+	struct nand_io_iter iter;
+	bool enable_ecc = false;
+	bool ecc_failed = false;
+	int ret = 0;
+
+	if (ops->mode != MTD_OPS_RAW && spinand->eccinfo.ooblayout)
+		enable_ecc = true;
+
+	mutex_lock(&spinand->lock);
+
+	nanddev_io_for_each_page(nand, from, ops, &iter) {
+		ret = spinand_select_target(spinand, iter.req.pos.target);
+		if (ret)
+			break;
+
+		ret = spinand_ecc_enable(spinand, enable_ecc);
+		if (ret)
+			break;
+
+		ret = spinand_read_page(spinand, &iter.req, enable_ecc);
+		if (ret < 0 && ret != -EBADMSG)
+			break;
+
+		if (ret == -EBADMSG) {
+			ecc_failed = true;
+			mtd->ecc_stats.failed++;
+			ret = 0;
+		} else {
+			mtd->ecc_stats.corrected += ret;
+			max_bitflips = max_t(unsigned int, max_bitflips, ret);
+		}
+
+		ops->retlen += iter.req.datalen;
+		ops->oobretlen += iter.req.ooblen;
+	}
+
+	mutex_unlock(&spinand->lock);
+
+	if (ecc_failed && !ret)
+		ret = -EBADMSG;
+
+	return ret ? ret : max_bitflips;
+}
+
+static int spinand_mtd_write(struct mtd_info *mtd, loff_t to,
+			     struct mtd_oob_ops *ops)
+{
+	struct spinand_device *spinand = mtd_to_spinand(mtd);
+	struct nand_device *nand = mtd_to_nanddev(mtd);
+	struct nand_io_iter iter;
+	bool enable_ecc = false;
+	int ret = 0;
+
+	if (ops->mode != MTD_OPS_RAW && mtd->ooblayout)
+		enable_ecc = true;
+
+	mutex_lock(&spinand->lock);
+
+	nanddev_io_for_each_page(nand, to, ops, &iter) {
+		ret = spinand_select_target(spinand, iter.req.pos.target);
+		if (ret)
+			break;
+
+		ret = spinand_ecc_enable(spinand, enable_ecc);
+		if (ret)
+			break;
+
+		ret = spinand_write_page(spinand, &iter.req);
+		if (ret)
+			break;
+
+		ops->retlen += iter.req.datalen;
+		ops->oobretlen += iter.req.ooblen;
+	}
+
+	mutex_unlock(&spinand->lock);
+
+	return ret;
+}
+
+static bool spinand_isbad(struct nand_device *nand, const struct nand_pos *pos)
+{
+	struct spinand_device *spinand = nand_to_spinand(nand);
+	struct nand_page_io_req req = {
+		.pos = *pos,
+		.ooblen = 2,
+		.ooboffs = 0,
+		.oobbuf.in = spinand->oobbuf,
+		.mode = MTD_OPS_RAW,
+	};
+
+	memset(spinand->oobbuf, 0, 2);
+	spinand_select_target(spinand, pos->target);
+	spinand_read_page(spinand, &req, false);
+	if (spinand->oobbuf[0] != 0xff || spinand->oobbuf[1] != 0xff)
+		return true;
+
+	return false;
+}
+
+static int spinand_mtd_block_isbad(struct mtd_info *mtd, loff_t offs)
+{
+	struct nand_device *nand = mtd_to_nanddev(mtd);
+	struct spinand_device *spinand = nand_to_spinand(nand);
+	struct nand_pos pos;
+	int ret;
+
+	nanddev_offs_to_pos(nand, offs, &pos);
+	mutex_lock(&spinand->lock);
+	ret = nanddev_isbad(nand, &pos);
+	mutex_unlock(&spinand->lock);
+
+	return ret;
+}
+
+static int spinand_markbad(struct nand_device *nand, const struct nand_pos *pos)
+{
+	struct spinand_device *spinand = nand_to_spinand(nand);
+	struct nand_page_io_req req = {
+		.pos = *pos,
+		.ooboffs = 0,
+		.ooblen = 2,
+		.oobbuf.out = spinand->oobbuf,
+	};
+	int ret;
+
+	/* Erase block before marking it bad. */
+	ret = spinand_select_target(spinand, pos->target);
+	if (ret)
+		return ret;
+
+	ret = spinand_write_enable_op(spinand);
+	if (ret)
+		return ret;
+
+	spinand_erase_op(spinand, pos);
+
+	memset(spinand->oobbuf, 0, 2);
+	return spinand_write_page(spinand, &req);
+}
+
+static int spinand_mtd_block_markbad(struct mtd_info *mtd, loff_t offs)
+{
+	struct nand_device *nand = mtd_to_nanddev(mtd);
+	struct spinand_device *spinand = nand_to_spinand(nand);
+	struct nand_pos pos;
+	int ret;
+
+	nanddev_offs_to_pos(nand, offs, &pos);
+	mutex_lock(&spinand->lock);
+	ret = nanddev_markbad(nand, &pos);
+	mutex_unlock(&spinand->lock);
+
+	return ret;
+}
+
+static int spinand_erase(struct nand_device *nand, const struct nand_pos *pos)
+{
+	struct spinand_device *spinand = nand_to_spinand(nand);
+	u8 status;
+	int ret;
+
+	ret = spinand_select_target(spinand, pos->target);
+	if (ret)
+		return ret;
+
+	ret = spinand_write_enable_op(spinand);
+	if (ret)
+		return ret;
+
+	ret = spinand_erase_op(spinand, pos);
+	if (ret)
+		return ret;
+
+	ret = spinand_wait(spinand, &status);
+	if (!ret && (status & STATUS_ERASE_FAILED))
+		ret = -EIO;
+
+	return ret;
+}
+
+static int spinand_mtd_erase(struct mtd_info *mtd,
+			     struct erase_info *einfo)
+{
+	struct spinand_device *spinand = mtd_to_spinand(mtd);
+	int ret;
+
+	mutex_lock(&spinand->lock);
+	ret = nanddev_mtd_erase(mtd, einfo);
+	mutex_unlock(&spinand->lock);
+
+	return ret;
+}
+
+static int spinand_mtd_block_isreserved(struct mtd_info *mtd, loff_t offs)
+{
+	struct spinand_device *spinand = mtd_to_spinand(mtd);
+	struct nand_device *nand = mtd_to_nanddev(mtd);
+	struct nand_pos pos;
+	int ret;
+
+	nanddev_offs_to_pos(nand, offs, &pos);
+	mutex_lock(&spinand->lock);
+	ret = nanddev_isreserved(nand, &pos);
+	mutex_unlock(&spinand->lock);
+
+	return ret;
+}
+
+static const struct nand_ops spinand_ops = {
+	.erase = spinand_erase,
+	.markbad = spinand_markbad,
+	.isbad = spinand_isbad,
+};
+
+static int spinand_manufacturer_detect(struct spinand_device *spinand)
+{
+	return -ENOTSUPP;
+}
+
+static int spinand_manufacturer_init(struct spinand_device *spinand)
+{
+	if (spinand->manufacturer->ops->init)
+		return spinand->manufacturer->ops->init(spinand);
+
+	return 0;
+}
+
+static void spinand_manufacturer_cleanup(struct spinand_device *spinand)
+{
+	/* Release manufacturer private data */
+	if (spinand->manufacturer->ops->cleanup)
+		return spinand->manufacturer->ops->cleanup(spinand);
+}
+
+static const struct spi_mem_op *
+spinand_select_op_variant(struct spinand_device *spinand,
+			  const struct spinand_op_variants *variants)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+	unsigned int i;
+
+	for (i = 0; i < variants->nops; i++) {
+		struct spi_mem_op op = variants->ops[i];
+		unsigned int nbytes;
+		int ret;
+
+		nbytes = nanddev_per_page_oobsize(nand) +
+			 nanddev_page_size(nand);
+
+		while (nbytes) {
+			op.data.nbytes = nbytes;
+			ret = spi_mem_adjust_op_size(spinand->spimem, &op);
+			if (ret)
+				break;
+
+			if (!spi_mem_supports_op(spinand->spimem, &op))
+				break;
+
+			nbytes -= op.data.nbytes;
+		}
+
+		if (!nbytes)
+			return &variants->ops[i];
+	}
+
+	return NULL;
+}
+
+/**
+ * spinand_match_and_init() - Try to find a match between a device ID and an
+ *			      entry in a spinand_info table
+ * @spinand: SPI NAND object
+ * @table: SPI NAND device description table
+ * @table_size: size of the device description table
+ *
+ * Should be used by SPI NAND manufacturer drivers when they want to find a
+ * match between a device ID retrieved through the READ_ID command and an
+ * entry in the SPI NAND description table. If a match is found, the spinand
+ * object will be initialized with information provided by the matching
+ * spinand_info entry.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int spinand_match_and_init(struct spinand_device *spinand,
+			   const struct spinand_info *table,
+			   unsigned int table_size, u8 devid)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+	unsigned int i;
+
+	for (i = 0; i < table_size; i++) {
+		const struct spinand_info *info = &table[i];
+		const struct spi_mem_op *op;
+
+		if (devid != info->devid)
+			continue;
+
+		nand->memorg = table[i].memorg;
+		nand->eccreq = table[i].eccreq;
+		spinand->eccinfo = table[i].eccinfo;
+		spinand->flags = table[i].flags;
+		spinand->select_target = table[i].select_target;
+
+		op = spinand_select_op_variant(spinand,
+					       info->op_variants.read_cache);
+		if (!op)
+			return -ENOTSUPP;
+
+		spinand->op_templates.read_cache = op;
+
+		op = spinand_select_op_variant(spinand,
+					       info->op_variants.write_cache);
+		if (!op)
+			return -ENOTSUPP;
+
+		spinand->op_templates.write_cache = op;
+
+		op = spinand_select_op_variant(spinand,
+					       info->op_variants.update_cache);
+		spinand->op_templates.update_cache = op;
+
+		return 0;
+	}
+
+	return -ENOTSUPP;
+}
+
+static int spinand_detect(struct spinand_device *spinand)
+{
+	struct device *dev = &spinand->spimem->spi->dev;
+	struct nand_device *nand = spinand_to_nand(spinand);
+	int ret;
+
+	ret = spinand_reset_op(spinand);
+	if (ret)
+		return ret;
+
+	ret = spinand_read_id_op(spinand, spinand->id.data);
+	if (ret)
+		return ret;
+
+	spinand->id.len = SPINAND_MAX_ID_LEN;
+
+	ret = spinand_manufacturer_detect(spinand);
+	if (ret) {
+		dev_err(dev, "unknown raw ID %*phN\n", SPINAND_MAX_ID_LEN,
+			spinand->id.data);
+		return ret;
+	}
+
+	if (nand->memorg.ntargets > 1 && !spinand->select_target) {
+		dev_err(dev,
+			"SPI NANDs with more than one die must implement ->select_target()\n");
+		return -EINVAL;
+	}
+
+	dev_info(&spinand->spimem->spi->dev,
+		 "%s SPI NAND was found.\n", spinand->manufacturer->name);
+	dev_info(&spinand->spimem->spi->dev,
+		 "%llu MiB, block size: %zu KiB, page size: %zu, OOB size: %u\n",
+		 nanddev_size(nand) >> 20, nanddev_eraseblock_size(nand) >> 10,
+		 nanddev_page_size(nand), nanddev_per_page_oobsize(nand));
+
+	return 0;
+}
+
+static int spinand_noecc_ooblayout_ecc(struct mtd_info *mtd, int section,
+				       struct mtd_oob_region *region)
+{
+	return -ERANGE;
+}
+
+static int spinand_noecc_ooblayout_free(struct mtd_info *mtd, int section,
+					struct mtd_oob_region *region)
+{
+	if (section)
+		return -ERANGE;
+
+	/* Reserve 2 bytes for the BBM. */
+	region->offset = 2;
+	region->length = 62;
+
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops spinand_noecc_ooblayout = {
+	.ecc = spinand_noecc_ooblayout_ecc,
+	.free = spinand_noecc_ooblayout_free,
+};
+
+static int spinand_init(struct spinand_device *spinand)
+{
+	struct device *dev = &spinand->spimem->spi->dev;
+	struct mtd_info *mtd = spinand_to_mtd(spinand);
+	struct nand_device *nand = mtd_to_nanddev(mtd);
+	int ret, i;
+
+	/*
+	 * We need a scratch buffer because the spi_mem interface requires that
+	 * buf passed in spi_mem_op->data.buf be DMA-able.
+	 */
+	spinand->scratchbuf = kzalloc(SPINAND_MAX_ID_LEN, GFP_KERNEL);
+	if (!spinand->scratchbuf)
+		return -ENOMEM;
+
+	ret = spinand_detect(spinand);
+	if (ret)
+		goto err_free_bufs;
+
+	/*
+	 * Use kzalloc() instead of devm_kzalloc() here, because some drivers
+	 * may use this buffer for DMA access.
+	 * Memory allocated by devm_ does not guarantee DMA-safe alignment.
+	 */
+	spinand->databuf = kzalloc(nanddev_page_size(nand) +
+			       nanddev_per_page_oobsize(nand),
+			       GFP_KERNEL);
+	if (!spinand->databuf) {
+		ret = -ENOMEM;
+		goto err_free_bufs;
+	}
+
+	spinand->oobbuf = spinand->databuf + nanddev_page_size(nand);
+
+	ret = spinand_init_cfg_cache(spinand);
+	if (ret)
+		goto err_free_bufs;
+
+	ret = spinand_init_quad_enable(spinand);
+	if (ret)
+		goto err_free_bufs;
+
+	ret = spinand_upd_cfg(spinand, CFG_OTP_ENABLE, 0);
+	if (ret)
+		goto err_free_bufs;
+
+	ret = spinand_manufacturer_init(spinand);
+	if (ret) {
+		dev_err(dev,
+			"Failed to initialize the SPI NAND chip (err = %d)\n",
+			ret);
+		goto err_free_bufs;
+	}
+
+	/* After power up, all blocks are locked, so unlock them here. */
+	for (i = 0; i < nand->memorg.ntargets; i++) {
+		ret = spinand_select_target(spinand, i);
+		if (ret)
+			goto err_free_bufs;
+
+		ret = spinand_lock_block(spinand, BL_ALL_UNLOCKED);
+		if (ret)
+			goto err_free_bufs;
+	}
+
+	ret = nanddev_init(nand, &spinand_ops, THIS_MODULE);
+	if (ret)
+		goto err_manuf_cleanup;
+
+	/*
+	 * Right now, we don't support ECC, so let the whole oob
+	 * area is available for user.
+	 */
+	mtd->_read_oob = spinand_mtd_read;
+	mtd->_write_oob = spinand_mtd_write;
+	mtd->_block_isbad = spinand_mtd_block_isbad;
+	mtd->_block_markbad = spinand_mtd_block_markbad;
+	mtd->_block_isreserved = spinand_mtd_block_isreserved;
+	mtd->_erase = spinand_mtd_erase;
+
+	if (spinand->eccinfo.ooblayout)
+		mtd_set_ooblayout(mtd, spinand->eccinfo.ooblayout);
+	else
+		mtd_set_ooblayout(mtd, &spinand_noecc_ooblayout);
+
+	ret = mtd_ooblayout_count_freebytes(mtd);
+	if (ret < 0)
+		goto err_cleanup_nanddev;
+
+	mtd->oobavail = ret;
+
+	return 0;
+
+err_cleanup_nanddev:
+	nanddev_cleanup(nand);
+
+err_manuf_cleanup:
+	spinand_manufacturer_cleanup(spinand);
+
+err_free_bufs:
+	kfree(spinand->databuf);
+	kfree(spinand->scratchbuf);
+	return ret;
+}
+
+static void spinand_cleanup(struct spinand_device *spinand)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+
+	nanddev_cleanup(nand);
+	spinand_manufacturer_cleanup(spinand);
+	kfree(spinand->databuf);
+	kfree(spinand->scratchbuf);
+}
+
+static int spinand_probe(struct spi_mem *mem)
+{
+	struct spinand_device *spinand;
+	struct mtd_info *mtd;
+	int ret;
+
+	spinand = devm_kzalloc(&mem->spi->dev, sizeof(*spinand),
+			       GFP_KERNEL);
+	if (!spinand)
+		return -ENOMEM;
+
+	spinand->spimem = mem;
+	spi_mem_set_drvdata(mem, spinand);
+	spinand_set_of_node(spinand, mem->spi->dev.of_node);
+	mutex_init(&spinand->lock);
+	mtd = spinand_to_mtd(spinand);
+	mtd->dev.parent = &mem->spi->dev;
+
+	ret = spinand_init(spinand);
+	if (ret)
+		return ret;
+
+	ret = mtd_device_register(mtd, NULL, 0);
+	if (ret)
+		goto err_spinand_cleanup;
+
+	return 0;
+
+err_spinand_cleanup:
+	spinand_cleanup(spinand);
+
+	return ret;
+}
+
+static int spinand_remove(struct spi_mem *mem)
+{
+	struct spinand_device *spinand;
+	struct mtd_info *mtd;
+	int ret;
+
+	spinand = spi_mem_get_drvdata(mem);
+	mtd = spinand_to_mtd(spinand);
+
+	ret = mtd_device_unregister(mtd);
+	if (ret)
+		return ret;
+
+	spinand_cleanup(spinand);
+
+	return 0;
+}
+
+static const struct spi_device_id spinand_ids[] = {
+	{ .name = "spi-nand" },
+	{ /* sentinel */ },
+};
+
+#ifdef CONFIG_OF
+static const struct of_device_id spinand_of_ids[] = {
+	{ .compatible = "spi-nand" },
+	{ /* sentinel */ },
+};
+#endif
+
+static struct spi_mem_driver spinand_drv = {
+	.spidrv = {
+		.id_table = spinand_ids,
+		.driver = {
+			.name = "spi-nand",
+			.of_match_table = of_match_ptr(spinand_of_ids),
+		},
+	},
+	.probe = spinand_probe,
+	.remove = spinand_remove,
+};
+module_spi_mem_driver(spinand_drv);
+
+MODULE_DESCRIPTION("SPI NAND framework");
+MODULE_AUTHOR("Peter Pan<peterpandong@micron.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
new file mode 100644
index 000000000000..d3efe62dc9de
--- /dev/null
+++ b/include/linux/mtd/spinand.h
@@ -0,0 +1,416 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2016-2017 Micron Technology, Inc.
+ *
+ *  Authors:
+ *	Peter Pan <peterpandong@micron.com>
+ */
+#ifndef __LINUX_MTD_SPINAND_H
+#define __LINUX_MTD_SPINAND_H
+
+#include <linux/mutex.h>
+#include <linux/bitops.h>
+#include <linux/device.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/nand.h>
+#include <linux/spi/spi.h>
+#include <linux/spi/spi-mem.h>
+
+/**
+ * Standard SPI NAND flash operations
+ */
+
+#define SPINAND_RESET_OP						\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0xff, 1),				\
+		   SPI_MEM_OP_NO_ADDR,					\
+		   SPI_MEM_OP_NO_DUMMY,					\
+		   SPI_MEM_OP_NO_DATA)
+
+#define SPINAND_WR_EN_DIS_OP(enable)					\
+	SPI_MEM_OP(SPI_MEM_OP_CMD((enable) ? 0x06 : 0x04, 1),		\
+		   SPI_MEM_OP_NO_ADDR,					\
+		   SPI_MEM_OP_NO_DUMMY,					\
+		   SPI_MEM_OP_NO_DATA)
+
+#define SPINAND_READID_OP(ndummy, buf, len)				\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0x9f, 1),				\
+		   SPI_MEM_OP_NO_ADDR,					\
+		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 1))
+
+#define SPINAND_SET_FEATURE_OP(reg, valptr)				\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0x1f, 1),				\
+		   SPI_MEM_OP_ADDR(1, reg, 1),				\
+		   SPI_MEM_OP_NO_DUMMY,					\
+		   SPI_MEM_OP_DATA_OUT(1, valptr, 1))
+
+#define SPINAND_GET_FEATURE_OP(reg, valptr)				\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0x0f, 1),				\
+		   SPI_MEM_OP_ADDR(1, reg, 1),				\
+		   SPI_MEM_OP_NO_DUMMY,					\
+		   SPI_MEM_OP_DATA_IN(1, valptr, 1))
+
+#define SPINAND_BLK_ERASE_OP(addr)					\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0xd8, 1),				\
+		   SPI_MEM_OP_ADDR(3, addr, 1),				\
+		   SPI_MEM_OP_NO_DUMMY,					\
+		   SPI_MEM_OP_NO_DATA)
+
+#define SPINAND_PAGE_READ_OP(addr)					\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0x13, 1),				\
+		   SPI_MEM_OP_ADDR(3, addr, 1),				\
+		   SPI_MEM_OP_NO_DUMMY,					\
+		   SPI_MEM_OP_NO_DATA)
+
+#define SPINAND_PAGE_READ_FROM_CACHE_OP(fast, addr, ndummy, buf, len)	\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(fast ? 0x0b : 0x03, 1),		\
+		   SPI_MEM_OP_ADDR(2, addr, 1),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 1))
+
+#define SPINAND_PAGE_READ_FROM_CACHE_X2_OP(addr, ndummy, buf, len)	\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0x3b, 1),				\
+		   SPI_MEM_OP_ADDR(2, addr, 1),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 2))
+
+#define SPINAND_PAGE_READ_FROM_CACHE_X4_OP(addr, ndummy, buf, len)	\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1),				\
+		   SPI_MEM_OP_ADDR(2, addr, 1),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 4))
+
+#define SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(addr, ndummy, buf, len)	\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1),				\
+		   SPI_MEM_OP_ADDR(2, addr, 2),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 2),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 2))
+
+#define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(addr, ndummy, buf, len)	\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1),				\
+		   SPI_MEM_OP_ADDR(2, addr, 4),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 4),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 4))
+
+#define SPINAND_PROG_EXEC_OP(addr)					\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(0x10, 1),				\
+		   SPI_MEM_OP_ADDR(3, addr, 1),				\
+		   SPI_MEM_OP_NO_DUMMY,					\
+		   SPI_MEM_OP_NO_DATA)
+
+#define SPINAND_PROG_LOAD(reset, addr, buf, len)			\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(reset ? 0x02 : 0x84, 1),		\
+		   SPI_MEM_OP_ADDR(2, addr, 1),				\
+		   SPI_MEM_OP_NO_DUMMY,					\
+		   SPI_MEM_OP_DATA_OUT(len, buf, 1))
+
+#define SPINAND_PROG_LOAD_X4(reset, addr, buf, len)			\
+	SPI_MEM_OP(SPI_MEM_OP_CMD(reset ? 0x32 : 0x34, 1),		\
+		   SPI_MEM_OP_ADDR(2, addr, 1),				\
+		   SPI_MEM_OP_NO_DUMMY,					\
+		   SPI_MEM_OP_DATA_OUT(len, buf, 4))
+
+/**
+ * Standard SPI NAND flash commands
+ */
+#define SPINAND_CMD_PROG_LOAD_X4		0x32
+#define SPINAND_CMD_PROG_LOAD_RDM_DATA_X4	0x34
+
+/* feature register */
+#define REG_BLOCK_LOCK		0xa0
+#define BL_ALL_UNLOCKED		0x00
+
+/* configuration register */
+#define REG_CFG			0xb0
+#define CFG_OTP_ENABLE		BIT(6)
+#define CFG_ECC_ENABLE		BIT(4)
+#define CFG_QUAD_ENABLE		BIT(0)
+
+/* status register */
+#define REG_STATUS		0xc0
+#define STATUS_BUSY		BIT(0)
+#define STATUS_ERASE_FAILED	BIT(2)
+#define STATUS_PROG_FAILED	BIT(3)
+#define STATUS_ECC_MASK		GENMASK(5, 4)
+#define STATUS_ECC_NO_BITFLIPS	(0 << 4)
+#define STATUS_ECC_HAS_BITFLIPS	(1 << 4)
+#define STATUS_ECC_UNCOR_ERROR	(2 << 4)
+
+struct spinand_op;
+struct spinand_device;
+
+#define SPINAND_MAX_ID_LEN	4
+
+/**
+ * struct spinand_id - SPI NAND id structure
+ * @data: buffer containing the id bytes. Currently 4 bytes large, but can
+ *	  be extended if required
+ * @len: ID length
+ *
+ * struct_spinand_id->data contains all bytes returned after a READ_ID command,
+ * including dummy bytes if the chip does not emit ID bytes right after the
+ * READ_ID command. The responsibility to extract real ID bytes is left to
+ * struct_manufacurer_ops->detect().
+ */
+struct spinand_id {
+	u8 data[SPINAND_MAX_ID_LEN];
+	int len;
+};
+
+/**
+ * struct manufacurer_ops - SPI NAND manufacturer specific operations
+ * @detect: detect a SPI NAND device. Every time a SPI NAND device is probed
+ *	    the core calls the struct_manufacurer_ops->detect() hook of each
+ *	    registered manufacturer until one of them return 1. Note that
+ *	    the first thing to check in this hook is that the manufacturer ID
+ *	    in struct_spinand_device->id matches the manufacturer whose
+ *	    ->detect() hook has been called. Should return 1 if there's a
+ *	    match, 0 if the manufacturer ID does not match and a negative
+ *	    error code otherwise. When true is returned, the core assumes
+ *	    that properties of the NAND chip (spinand->base.memorg and
+ *	    spinand->base.eccreq) have been filled
+ * @init: initialize a SPI NAND device
+ * @cleanup: cleanup a SPI NAND device
+ *
+ * Each SPI NAND manufacturer driver should implement this interface so that
+ * NAND chips coming from this vendor can be detected and initialized properly.
+ */
+struct spinand_manufacturer_ops {
+	int (*detect)(struct spinand_device *spinand);
+	int (*init)(struct spinand_device *spinand);
+	void (*cleanup)(struct spinand_device *spinand);
+};
+
+/**
+ * struct spinand_manufacturer - SPI NAND manufacturer instance
+ * @id: manufacturer ID
+ * @name: manufacturer name
+ * @ops: manufacturer operations
+ */
+struct spinand_manufacturer {
+	u8 id;
+	char *name;
+	const struct spinand_manufacturer_ops *ops;
+};
+
+/**
+ * struct spinand_op_variants - SPI NAND operation variants
+ * @ops: the list of variants for a given operation
+ * @nops: the number of variants
+ *
+ * Some operations like read-from-cache/write-to-cache have several variants
+ * depending on the number of IO lines you use to transfer data or address
+ * cycles. This structure is a way to describe the different variants supported
+ * by a chip and let the core pick the best one based on the SPI mem controller
+ * capabilities.
+ */
+struct spinand_op_variants {
+	const struct spi_mem_op *ops;
+	unsigned int nops;
+};
+
+#define SPINAND_OP_VARIANTS(name, ...)					\
+	const struct spinand_op_variants name = {			\
+		.ops = (struct spi_mem_op[]) { __VA_ARGS__ },		\
+		.nops = sizeof((struct spi_mem_op[]){ __VA_ARGS__ }) /	\
+			sizeof(struct spi_mem_op),			\
+	}
+
+/**
+ * spinand_ecc_info - description of the on-die ECC implemented by a SPI NAND
+ *		      chip
+ * @get_status: get the ECC status. Should return a positive number encoding
+ *		the number of corrected bitflips if correction was possible or
+ *		-EBADMSG if there are uncorrectable errors. I can also return
+ *		other negative error codes if the error is not caused by
+ *		uncorrectable bitflips
+ * @ooblayout: the OOB layout used by the on-die ECC implementation
+ */
+struct spinand_ecc_info {
+	int (*get_status)(struct spinand_device *spinand, u8 status);
+	const struct mtd_ooblayout_ops *ooblayout;
+};
+
+#define SPINAND_HAS_QE_BIT		BIT(0)
+
+/**
+ * struct spinand_info - Structure used to describe SPI NAND chips
+ * @model: model name
+ * @devid: device ID
+ * @flags: OR-ing of the SPINAND_XXX flags
+ * @memorg: memory organization
+ * @eccreq: ECC requirements
+ * @eccinfo: on-die ECC info
+ * @op_variants: operations variants
+ * @op_variants.read_cache: variants of the read-cache operation
+ * @op_variants.write_cache: variants of the write-cache operation
+ * @op_variants.update_cache: variants of the update-cache operation
+ * @select_target: function used to select a target/die. Required only for
+ *		   multi-die chips
+ *
+ * Each SPI NAND manufacturer driver should have a spinand_info table
+ * describing all the chips supported by the driver.
+ */
+struct spinand_info {
+	const char *model;
+	u8 devid;
+	u32 flags;
+	struct nand_memory_organization memorg;
+	struct nand_ecc_req eccreq;
+	struct spinand_ecc_info eccinfo;
+	struct {
+		const struct spinand_op_variants *read_cache;
+		const struct spinand_op_variants *write_cache;
+		const struct spinand_op_variants *update_cache;
+	} op_variants;
+	int (*select_target)(struct spinand_device *spinand,
+			     unsigned int target);
+};
+
+#define SPINAND_INFO_OP_VARIANTS(__read, __write, __update)		\
+	{								\
+		.read_cache = __read,					\
+		.write_cache = __write,					\
+		.update_cache = __update,				\
+	}
+
+#define SPINAND_ECCINFO(__ooblayout, __get_status)			\
+	.eccinfo = {							\
+		.ooblayout = __ooblayout,				\
+		.get_status = __get_status,				\
+	}
+
+#define SPINAND_SELECT_TARGET(__func)					\
+	.select_target = __func,
+
+#define SPINAND_INFO(__model, __id, __memorg, __eccreq, __op_variants,	\
+		     __flags, ...)					\
+	{								\
+		.model = __model,					\
+		.devid = __id,						\
+		.memorg = __memorg,					\
+		.eccreq = __eccreq,					\
+		.op_variants = __op_variants,				\
+		.flags = __flags,					\
+		__VA_ARGS__						\
+	}
+
+/**
+ * struct spinand_device - SPI NAND device instance
+ * @base: NAND device instance
+ * @spimem: pointer to the SPI mem object
+ * @lock: lock used to serialize accesses to the NAND
+ * @id: NAND ID as returned by READ_ID
+ * @flags: NAND flags
+ * @op_templates: various SPI mem op templates
+ * @op_templates.read_cache: read cache op template
+ * @op_templates.write_cache: write cache op template
+ * @op_templates.update_cache: update cache op template
+ * @select_target: select a specific target/die. Usually called before sending
+ *		   a command addressing a page or an eraseblock embedded in
+ *		   this die. Only required if your chip exposes several dies
+ * @cur_target: currently selected target/die
+ * @eccinfo: on-die ECC information
+ * @cfg_cache: config register cache. One entry per die
+ * @databuf: bounce buffer for data
+ * @oobbuf: bounce buffer for OOB data
+ * @scratchbuf: buffer used for everything but page accesses. This is needed
+ *		because the spi-mem interface explicitly requests that buffers
+ *		passed in spi_mem_op be DMA-able, so we can't based the bufs on
+ *		the stack
+ * @manufacturer: SPI NAND manufacturer information
+ * @priv: manufacturer private data
+ */
+struct spinand_device {
+	struct nand_device base;
+	struct spi_mem *spimem;
+	struct mutex lock;
+	struct spinand_id id;
+	u32 flags;
+
+	struct {
+		const struct spi_mem_op *read_cache;
+		const struct spi_mem_op *write_cache;
+		const struct spi_mem_op *update_cache;
+	} op_templates;
+
+	int (*select_target)(struct spinand_device *spinand,
+			     unsigned int target);
+	unsigned int cur_target;
+
+	struct spinand_ecc_info eccinfo;
+
+	u8 *cfg_cache;
+	u8 *databuf;
+	u8 *oobbuf;
+	u8 *scratchbuf;
+	const struct spinand_manufacturer *manufacturer;
+	void *priv;
+};
+
+/**
+ * mtd_to_spinand() - Get the SPI NAND device attached to an MTD instance
+ * @mtd: MTD instance
+ *
+ * Return: the SPI NAND device attached to @mtd.
+ */
+static inline struct spinand_device *mtd_to_spinand(struct mtd_info *mtd)
+{
+	return container_of(mtd_to_nanddev(mtd), struct spinand_device, base);
+}
+
+/**
+ * spinand_to_mtd() - Get the MTD device embedded in a SPI NAND device
+ * @spinand: SPI NAND device
+ *
+ * Return: the MTD device embedded in @spinand.
+ */
+static inline struct mtd_info *spinand_to_mtd(struct spinand_device *spinand)
+{
+	return nanddev_to_mtd(&spinand->base);
+}
+
+/**
+ * nand_to_spinand() - Get the SPI NAND device embedding an NAND object
+ * @nand: NAND object
+ *
+ * Return: the SPI NAND device embedding @nand.
+ */
+static inline struct spinand_device *nand_to_spinand(struct nand_device *nand)
+{
+	return container_of(nand, struct spinand_device, base);
+}
+
+/**
+ * spinand_to_nand() - Get the NAND device embedded in a SPI NAND object
+ * @spinand: SPI NAND device
+ *
+ * Return: the NAND device embedded in @spinand.
+ */
+static inline struct nand_device *
+spinand_to_nand(struct spinand_device *spinand)
+{
+	return &spinand->base;
+}
+
+/**
+ * spinand_set_of_node - Attach a DT node to a SPI NAND device
+ * @spinand: SPI NAND device
+ * @np: DT node
+ *
+ * Attach a DT node to a SPI NAND device.
+ */
+static inline void spinand_set_of_node(struct spinand_device *spinand,
+				       struct device_node *np)
+{
+	nanddev_set_of_node(&spinand->base, np);
+}
+
+int spinand_match_and_init(struct spinand_device *dev,
+			   const struct spinand_info *table,
+			   unsigned int table_size, u8 devid);
+
+int spinand_upd_cfg(struct spinand_device *spinand, u8 mask, u8 val);
+int spinand_select_target(struct spinand_device *spinand, unsigned int target);
+
+#endif /* __LINUX_MTD_SPINAND_H */
diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
index bb4bd15ae1f6..4fa34a227a0f 100644
--- a/include/linux/spi/spi-mem.h
+++ b/include/linux/spi/spi-mem.h
@@ -3,7 +3,9 @@
  * Copyright (C) 2018 Exceet Electronics GmbH
  * Copyright (C) 2018 Bootlin
  *
- * Author: Boris Brezillon <boris.brezillon@bootlin.com>
+ * Author:
+ *	Peter Pan <peterpandong@micron.com>
+ *	Boris Brezillon <boris.brezillon@bootlin.com>
  */
 
 #ifndef __LINUX_SPI_MEM_H
-- 
cgit v1.2.3


From a508e8875e135d7a1df26d8131b5443cb07005ff Mon Sep 17 00:00:00 2001
From: Peter Pan <peterpandong@micron.com>
Date: Fri, 22 Jun 2018 14:28:25 +0200
Subject: mtd: spinand: Add initial support for Micron MT29F2G01ABAGD

Add a basic driver for Micron SPI NANDs. Only one device is supported
right now, but the driver will be extended to support more devices
afterwards.

Signed-off-by: Peter Pan <peterpandong@micron.com>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/Makefile |   2 +-
 drivers/mtd/nand/spi/core.c   |  17 ++++++
 drivers/mtd/nand/spi/micron.c | 133 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/spinand.h   |   3 +
 4 files changed, 154 insertions(+), 1 deletion(-)
 create mode 100644 drivers/mtd/nand/spi/micron.c

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/spi/Makefile b/drivers/mtd/nand/spi/Makefile
index 2c473b765027..a1df25398f20 100644
--- a/drivers/mtd/nand/spi/Makefile
+++ b/drivers/mtd/nand/spi/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
-spinand-objs := core.o
+spinand-objs := core.o micron.o
 obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 4efbeb0d85b4..9cc502d7e745 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -763,8 +763,25 @@ static const struct nand_ops spinand_ops = {
 	.isbad = spinand_isbad,
 };
 
+static const struct spinand_manufacturer *spinand_manufacturers[] = {
+	&micron_spinand_manufacturer,
+};
+
 static int spinand_manufacturer_detect(struct spinand_device *spinand)
 {
+	unsigned int i;
+	int ret;
+
+	for (i = 0; i < ARRAY_SIZE(spinand_manufacturers); i++) {
+		ret = spinand_manufacturers[i]->ops->detect(spinand);
+		if (ret > 0) {
+			spinand->manufacturer = spinand_manufacturers[i];
+			return 0;
+		} else if (ret < 0) {
+			return ret;
+		}
+	}
+
 	return -ENOTSUPP;
 }
 
diff --git a/drivers/mtd/nand/spi/micron.c b/drivers/mtd/nand/spi/micron.c
new file mode 100644
index 000000000000..9c4381d6847b
--- /dev/null
+++ b/drivers/mtd/nand/spi/micron.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016-2017 Micron Technology, Inc.
+ *
+ * Authors:
+ *	Peter Pan <peterpandong@micron.com>
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/mtd/spinand.h>
+
+#define SPINAND_MFR_MICRON		0x2c
+
+#define MICRON_STATUS_ECC_MASK		GENMASK(7, 4)
+#define MICRON_STATUS_ECC_NO_BITFLIPS	(0 << 4)
+#define MICRON_STATUS_ECC_1TO3_BITFLIPS	(1 << 4)
+#define MICRON_STATUS_ECC_4TO6_BITFLIPS	(3 << 4)
+#define MICRON_STATUS_ECC_7TO8_BITFLIPS	(5 << 4)
+
+static SPINAND_OP_VARIANTS(read_cache_variants,
+		SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(0, 2, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
+static SPINAND_OP_VARIANTS(write_cache_variants,
+		SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
+		SPINAND_PROG_LOAD(true, 0, NULL, 0));
+
+static SPINAND_OP_VARIANTS(update_cache_variants,
+		SPINAND_PROG_LOAD_X4(false, 0, NULL, 0),
+		SPINAND_PROG_LOAD(false, 0, NULL, 0));
+
+static int mt29f2g01abagd_ooblayout_ecc(struct mtd_info *mtd, int section,
+					struct mtd_oob_region *region)
+{
+	if (section)
+		return -ERANGE;
+
+	region->offset = 64;
+	region->length = 64;
+
+	return 0;
+}
+
+static int mt29f2g01abagd_ooblayout_free(struct mtd_info *mtd, int section,
+					 struct mtd_oob_region *region)
+{
+	if (section)
+		return -ERANGE;
+
+	/* Reserve 2 bytes for the BBM. */
+	region->offset = 2;
+	region->length = 62;
+
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops mt29f2g01abagd_ooblayout = {
+	.ecc = mt29f2g01abagd_ooblayout_ecc,
+	.free = mt29f2g01abagd_ooblayout_free,
+};
+
+static int mt29f2g01abagd_ecc_get_status(struct spinand_device *spinand,
+					 u8 status)
+{
+	switch (status & MICRON_STATUS_ECC_MASK) {
+	case STATUS_ECC_NO_BITFLIPS:
+		return 0;
+
+	case STATUS_ECC_UNCOR_ERROR:
+		return -EBADMSG;
+
+	case MICRON_STATUS_ECC_1TO3_BITFLIPS:
+		return 3;
+
+	case MICRON_STATUS_ECC_4TO6_BITFLIPS:
+		return 6;
+
+	case MICRON_STATUS_ECC_7TO8_BITFLIPS:
+		return 8;
+
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+static const struct spinand_info micron_spinand_table[] = {
+	SPINAND_INFO("MT29F2G01ABAGD", 0x24,
+		     NAND_MEMORG(1, 2048, 128, 64, 2048, 2, 1, 1),
+		     NAND_ECCREQ(8, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     0,
+		     SPINAND_ECCINFO(&mt29f2g01abagd_ooblayout,
+				     mt29f2g01abagd_ecc_get_status)),
+};
+
+static int micron_spinand_detect(struct spinand_device *spinand)
+{
+	u8 *id = spinand->id.data;
+	int ret;
+
+	/*
+	 * Micron SPI NAND read ID need a dummy byte,
+	 * so the first byte in raw_id is dummy.
+	 */
+	if (id[1] != SPINAND_MFR_MICRON)
+		return 0;
+
+	ret = spinand_match_and_init(spinand, micron_spinand_table,
+				     ARRAY_SIZE(micron_spinand_table), id[2]);
+	if (ret)
+		return ret;
+
+	return 1;
+}
+
+static const struct spinand_manufacturer_ops micron_spinand_manuf_ops = {
+	.detect = micron_spinand_detect,
+};
+
+const struct spinand_manufacturer micron_spinand_manufacturer = {
+	.id = SPINAND_MFR_MICRON,
+	.name = "Micron",
+	.ops = &micron_spinand_manuf_ops,
+};
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index d3efe62dc9de..717b272940f1 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -193,6 +193,9 @@ struct spinand_manufacturer {
 	const struct spinand_manufacturer_ops *ops;
 };
 
+/* SPI NAND manufacturers */
+extern const struct spinand_manufacturer micron_spinand_manufacturer;
+
 /**
  * struct spinand_op_variants - SPI NAND operation variants
  * @ops: the list of variants for a given operation
-- 
cgit v1.2.3


From 1075492bb9e26312bc8ddeec1a93e2de5f9c76b4 Mon Sep 17 00:00:00 2001
From: Frieder Schrempf <frieder.schrempf@exceet.de>
Date: Fri, 22 Jun 2018 14:28:26 +0200
Subject: mtd: spinand: Add initial support for Winbond W25M02GV

Add support for the W25M02GV chip.

Signed-off-by: Frieder Schrempf <frieder.schrempf@exceet.de>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/Makefile  |   2 +-
 drivers/mtd/nand/spi/core.c    |   1 +
 drivers/mtd/nand/spi/winbond.c | 141 +++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/spinand.h    |   1 +
 4 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 drivers/mtd/nand/spi/winbond.c

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/spi/Makefile b/drivers/mtd/nand/spi/Makefile
index a1df25398f20..100008d202ed 100644
--- a/drivers/mtd/nand/spi/Makefile
+++ b/drivers/mtd/nand/spi/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
-spinand-objs := core.o micron.o
+spinand-objs := core.o micron.o winbond.o
 obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 9cc502d7e745..4803a6bfe8ec 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -765,6 +765,7 @@ static const struct nand_ops spinand_ops = {
 
 static const struct spinand_manufacturer *spinand_manufacturers[] = {
 	&micron_spinand_manufacturer,
+	&winbond_spinand_manufacturer,
 };
 
 static int spinand_manufacturer_detect(struct spinand_device *spinand)
diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c
new file mode 100644
index 000000000000..67baa1b32c00
--- /dev/null
+++ b/drivers/mtd/nand/spi/winbond.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2017 exceet electronics GmbH
+ *
+ * Authors:
+ *	Frieder Schrempf <frieder.schrempf@exceet.de>
+ *	Boris Brezillon <boris.brezillon@bootlin.com>
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/mtd/spinand.h>
+
+#define SPINAND_MFR_WINBOND		0xEF
+
+#define WINBOND_CFG_BUF_READ		BIT(3)
+
+static SPINAND_OP_VARIANTS(read_cache_variants,
+		SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(0, 2, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
+static SPINAND_OP_VARIANTS(write_cache_variants,
+		SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
+		SPINAND_PROG_LOAD(true, 0, NULL, 0));
+
+static SPINAND_OP_VARIANTS(update_cache_variants,
+		SPINAND_PROG_LOAD_X4(false, 0, NULL, 0),
+		SPINAND_PROG_LOAD(false, 0, NULL, 0));
+
+static int w25m02gv_ooblayout_ecc(struct mtd_info *mtd, int section,
+				  struct mtd_oob_region *region)
+{
+	if (section > 3)
+		return -ERANGE;
+
+	region->offset = (16 * section) + 8;
+	region->length = 8;
+
+	return 0;
+}
+
+static int w25m02gv_ooblayout_free(struct mtd_info *mtd, int section,
+				   struct mtd_oob_region *region)
+{
+	if (section > 3)
+		return -ERANGE;
+
+	region->offset = (16 * section) + 2;
+	region->length = 6;
+
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops w25m02gv_ooblayout = {
+	.ecc = w25m02gv_ooblayout_ecc,
+	.free = w25m02gv_ooblayout_free,
+};
+
+static int w25m02gv_select_target(struct spinand_device *spinand,
+				  unsigned int target)
+{
+	struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(0xc2, 1),
+					  SPI_MEM_OP_NO_ADDR,
+					  SPI_MEM_OP_NO_DUMMY,
+					  SPI_MEM_OP_DATA_OUT(1,
+							spinand->scratchbuf,
+							1));
+
+	*spinand->scratchbuf = target;
+	return spi_mem_exec_op(spinand->spimem, &op);
+}
+
+static const struct spinand_info winbond_spinand_table[] = {
+	SPINAND_INFO("W25M02GV", 0xAB,
+		     NAND_MEMORG(1, 2048, 64, 64, 1024, 1, 1, 2),
+		     NAND_ECCREQ(1, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     0,
+		     SPINAND_ECCINFO(&w25m02gv_ooblayout, NULL),
+		     SPINAND_SELECT_TARGET(w25m02gv_select_target)),
+};
+
+/**
+ * winbond_spinand_detect - initialize device related part in spinand_device
+ * struct if it is a Winbond device.
+ * @spinand: SPI NAND device structure
+ */
+static int winbond_spinand_detect(struct spinand_device *spinand)
+{
+	u8 *id = spinand->id.data;
+	int ret;
+
+	/*
+	 * Winbond SPI NAND read ID need a dummy byte,
+	 * so the first byte in raw_id is dummy.
+	 */
+	if (id[1] != SPINAND_MFR_WINBOND)
+		return 0;
+
+	ret = spinand_match_and_init(spinand, winbond_spinand_table,
+				     ARRAY_SIZE(winbond_spinand_table), id[2]);
+	if (ret)
+		return ret;
+
+	return 1;
+}
+
+static int winbond_spinand_init(struct spinand_device *spinand)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+	unsigned int i;
+
+	/*
+	 * Make sure all dies are in buffer read mode and not continuous read
+	 * mode.
+	 */
+	for (i = 0; i < nand->memorg.ntargets; i++) {
+		spinand_select_target(spinand, i);
+		spinand_upd_cfg(spinand, WINBOND_CFG_BUF_READ,
+				WINBOND_CFG_BUF_READ);
+	}
+
+	return 0;
+}
+
+static const struct spinand_manufacturer_ops winbond_spinand_manuf_ops = {
+	.detect = winbond_spinand_detect,
+	.init = winbond_spinand_init,
+};
+
+const struct spinand_manufacturer winbond_spinand_manufacturer = {
+	.id = SPINAND_MFR_WINBOND,
+	.name = "Winbond",
+	.ops = &winbond_spinand_manuf_ops,
+};
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 717b272940f1..f0f16b9029e7 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -195,6 +195,7 @@ struct spinand_manufacturer {
 
 /* SPI NAND manufacturers */
 extern const struct spinand_manufacturer micron_spinand_manufacturer;
+extern const struct spinand_manufacturer winbond_spinand_manufacturer;
 
 /**
  * struct spinand_op_variants - SPI NAND operation variants
-- 
cgit v1.2.3


From b02308af05e62c7d995f4fc75b0bc2ae3c3026f7 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Fri, 22 Jun 2018 14:28:27 +0200
Subject: mtd: spinand: Add initial support for the MX35LF1GE4AB chip

Add minimal support for the MX35LF1GE4AB SPI NAND chip.

Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/Makefile   |   2 +-
 drivers/mtd/nand/spi/core.c     |   1 +
 drivers/mtd/nand/spi/macronix.c | 136 ++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/spinand.h     |   1 +
 4 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 drivers/mtd/nand/spi/macronix.c

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/spi/Makefile b/drivers/mtd/nand/spi/Makefile
index 100008d202ed..b74e074b363a 100644
--- a/drivers/mtd/nand/spi/Makefile
+++ b/drivers/mtd/nand/spi/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
-spinand-objs := core.o micron.o winbond.o
+spinand-objs := core.o macronix.o micron.o winbond.o
 obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 4803a6bfe8ec..30f83649c481 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -764,6 +764,7 @@ static const struct nand_ops spinand_ops = {
 };
 
 static const struct spinand_manufacturer *spinand_manufacturers[] = {
+	&macronix_spinand_manufacturer,
 	&micron_spinand_manufacturer,
 	&winbond_spinand_manufacturer,
 };
diff --git a/drivers/mtd/nand/spi/macronix.c b/drivers/mtd/nand/spi/macronix.c
new file mode 100644
index 000000000000..8ba3489d332a
--- /dev/null
+++ b/drivers/mtd/nand/spi/macronix.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Macronix
+ *
+ * Author: Boris Brezillon <boris.brezillon@bootlin.com>
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/mtd/spinand.h>
+
+#define SPINAND_MFR_MACRONIX		0xC2
+
+static SPINAND_OP_VARIANTS(read_cache_variants,
+		SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
+static SPINAND_OP_VARIANTS(write_cache_variants,
+		SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
+		SPINAND_PROG_LOAD(true, 0, NULL, 0));
+
+static SPINAND_OP_VARIANTS(update_cache_variants,
+		SPINAND_PROG_LOAD_X4(false, 0, NULL, 0),
+		SPINAND_PROG_LOAD(false, 0, NULL, 0));
+
+static int mx35lfxge4ab_ooblayout_ecc(struct mtd_info *mtd, int section,
+				      struct mtd_oob_region *region)
+{
+	return -ERANGE;
+}
+
+static int mx35lfxge4ab_ooblayout_free(struct mtd_info *mtd, int section,
+				       struct mtd_oob_region *region)
+{
+	if (section)
+		return -ERANGE;
+
+	region->offset = 2;
+	region->length = mtd->oobsize - 2;
+
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops mx35lfxge4ab_ooblayout = {
+	.ecc = mx35lfxge4ab_ooblayout_ecc,
+	.free = mx35lfxge4ab_ooblayout_free,
+};
+
+static int mx35lf1ge4ab_get_eccsr(struct spinand_device *spinand, u8 *eccsr)
+{
+	struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(0x7c, 1),
+					  SPI_MEM_OP_NO_ADDR,
+					  SPI_MEM_OP_DUMMY(1, 1),
+					  SPI_MEM_OP_DATA_IN(1, eccsr, 1));
+
+	return spi_mem_exec_op(spinand->spimem, &op);
+}
+
+static int mx35lf1ge4ab_ecc_get_status(struct spinand_device *spinand,
+				       u8 status)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+	u8 eccsr;
+
+	switch (status & STATUS_ECC_MASK) {
+	case STATUS_ECC_NO_BITFLIPS:
+		return 0;
+
+	case STATUS_ECC_UNCOR_ERROR:
+		return -EBADMSG;
+
+	case STATUS_ECC_HAS_BITFLIPS:
+		/*
+		 * Let's try to retrieve the real maximum number of bitflips
+		 * in order to avoid forcing the wear-leveling layer to move
+		 * data around if it's not necessary.
+		 */
+		if (mx35lf1ge4ab_get_eccsr(spinand, &eccsr))
+			return nand->eccreq.strength;
+
+		if (WARN_ON(eccsr > nand->eccreq.strength || !eccsr))
+			return nand->eccreq.strength;
+
+		return eccsr;
+
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+static const struct spinand_info macronix_spinand_table[] = {
+	SPINAND_INFO("MX35LF1GE4AB", 0x12,
+		     NAND_MEMORG(1, 2048, 64, 64, 1024, 1, 1, 1),
+		     NAND_ECCREQ(4, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&mx35lfxge4ab_ooblayout,
+				     mx35lf1ge4ab_ecc_get_status)),
+};
+
+static int macronix_spinand_detect(struct spinand_device *spinand)
+{
+	u8 *id = spinand->id.data;
+	int ret;
+
+	/*
+	 * Macronix SPI NAND read ID needs a dummy byte, so the first byte in
+	 * raw_id is garbage.
+	 */
+	if (id[1] != SPINAND_MFR_MACRONIX)
+		return 0;
+
+	ret = spinand_match_and_init(spinand, macronix_spinand_table,
+				     ARRAY_SIZE(macronix_spinand_table),
+				     id[2]);
+	if (ret)
+		return ret;
+
+	return 1;
+}
+
+static const struct spinand_manufacturer_ops macronix_spinand_manuf_ops = {
+	.detect = macronix_spinand_detect,
+};
+
+const struct spinand_manufacturer macronix_spinand_manufacturer = {
+	.id = SPINAND_MFR_MACRONIX,
+	.name = "Macronix",
+	.ops = &macronix_spinand_manuf_ops,
+};
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index f0f16b9029e7..088ff96c3eb6 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -194,6 +194,7 @@ struct spinand_manufacturer {
 };
 
 /* SPI NAND manufacturers */
+extern const struct spinand_manufacturer macronix_spinand_manufacturer;
 extern const struct spinand_manufacturer micron_spinand_manufacturer;
 extern const struct spinand_manufacturer winbond_spinand_manufacturer;
 
-- 
cgit v1.2.3


From a4c025372d9d4756e7be98a98f5dde302c6df562 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Thu, 5 Jul 2018 12:27:27 +0200
Subject: mtd: rawnand: Remove nand_do_read() prototype from rawnand.h

nand_do_read() is a static function implemented in nand_base.c. There's
no good reason to expose its prototype in rawnand.h.

Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 include/linux/mtd/rawnand.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 0c6fb316b409..4d7a46c42900 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1545,8 +1545,6 @@ int nand_isreserved_bbt(struct mtd_info *mtd, loff_t offs);
 int nand_isbad_bbt(struct mtd_info *mtd, loff_t offs, int allowbbt);
 int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr,
 		    int allowbbt);
-int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len,
-		 size_t *retlen, uint8_t *buf);
 
 /**
  * struct platform_nand_chip - chip level device structure
-- 
cgit v1.2.3


From 707329aca6e0cf5781ca7f62c39bdcb5f87e9c23 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Thu, 5 Jul 2018 12:27:28 +0200
Subject: mtd: rawnand: Remove forward declaration of mtd_info

struct mtd_info is defined in linux/mtd/mtd.h which is included
at the beginning of nand_base.c, there's thus no need for the
forward declaration of mtd_info.

Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 include/linux/mtd/rawnand.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 4d7a46c42900..32145b302585 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -23,7 +23,6 @@
 #include <linux/mtd/bbm.h>
 #include <linux/types.h>
 
-struct mtd_info;
 struct nand_flash_dev;
 struct device_node;
 
-- 
cgit v1.2.3


From 1c3ab61ebcf8cfb5308d2e68e03fd4b4df484e62 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Thu, 5 Jul 2018 12:27:29 +0200
Subject: mtd: rawnand: Remove forward declaration of device_node

struct device_node is defined in linux/of.h. Let's include this file
instead of having a forward declaration of this struct.

Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 include/linux/mtd/rawnand.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 32145b302585..83ab6779144e 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -21,10 +21,10 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/flashchip.h>
 #include <linux/mtd/bbm.h>
+#include <linux/of.h>
 #include <linux/types.h>
 
 struct nand_flash_dev;
-struct device_node;
 
 /* Scan and identify a NAND device */
 int nand_scan_with_ids(struct mtd_info *mtd, int max_chips,
-- 
cgit v1.2.3


From 44b07b921dea5ec997fb929ceaa82582a7415816 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Thu, 5 Jul 2018 12:27:30 +0200
Subject: mtd: rawnand: Rename nand_default_bbt() into nand_create_bbt()

Rename nand_default_bbt() into nand_create_bbt() and pass it a nand_chip
object to prepare removal of the chip->scan_bbt() hook.

We add a temporary nand_default_bbt() wrapper which will be dropped
after the removal of ->scan_bbt().

Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c |  5 +++++
 drivers/mtd/nand/raw/nand_bbt.c  | 10 +++++-----
 include/linux/mtd/rawnand.h      |  2 +-
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index faac82b1e058..0476e13d47b1 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -4924,6 +4924,11 @@ static void nand_shutdown(struct mtd_info *mtd)
 	nand_get_device(mtd, FL_PM_SUSPENDED);
 }
 
+static int nand_default_bbt(struct mtd_info *mtd)
+{
+	return nand_create_bbt(mtd_to_nand(mtd));
+}
+
 /* Set default functions */
 static void nand_set_defaults(struct nand_chip *chip)
 {
diff --git a/drivers/mtd/nand/raw/nand_bbt.c b/drivers/mtd/nand/raw/nand_bbt.c
index d9f4ceff2568..39db352f8757 100644
--- a/drivers/mtd/nand/raw/nand_bbt.c
+++ b/drivers/mtd/nand/raw/nand_bbt.c
@@ -1349,15 +1349,14 @@ static int nand_create_badblock_pattern(struct nand_chip *this)
 }
 
 /**
- * nand_default_bbt - [NAND Interface] Select a default bad block table for the device
- * @mtd: MTD device structure
+ * nand_create_bbt - [NAND Interface] Select a default bad block table for the device
+ * @this: NAND chip object
  *
  * This function selects the default bad block table support for the device and
  * calls the nand_scan_bbt function.
  */
-int nand_default_bbt(struct mtd_info *mtd)
+int nand_create_bbt(struct nand_chip *this)
 {
-	struct nand_chip *this = mtd_to_nand(mtd);
 	int ret;
 
 	/* Is a flash based bad block table requested? */
@@ -1383,8 +1382,9 @@ int nand_default_bbt(struct mtd_info *mtd)
 			return ret;
 	}
 
-	return nand_scan_bbt(mtd, this->badblock_pattern);
+	return nand_scan_bbt(nand_to_mtd(this), this->badblock_pattern);
 }
+EXPORT_SYMBOL(nand_create_bbt);
 
 /**
  * nand_isreserved_bbt - [NAND Interface] Check if a block is reserved
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 83ab6779144e..186f9fb1e7eb 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1538,7 +1538,7 @@ extern const struct nand_manufacturer_ops micron_nand_manuf_ops;
 extern const struct nand_manufacturer_ops amd_nand_manuf_ops;
 extern const struct nand_manufacturer_ops macronix_nand_manuf_ops;
 
-int nand_default_bbt(struct mtd_info *mtd);
+int nand_create_bbt(struct nand_chip *chip);
 int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs);
 int nand_isreserved_bbt(struct mtd_info *mtd, loff_t offs);
 int nand_isbad_bbt(struct mtd_info *mtd, loff_t offs, int allowbbt);
-- 
cgit v1.2.3


From e80eba7581512625083ba540f120479b935425de Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Thu, 5 Jul 2018 12:27:31 +0200
Subject: mtd: rawnand: Kill the chip->scan_bbt() hook

None of the existing drivers are overloading the ->scan_bbt() method,
let's get rid of it and replace calls to ->scan_bbt() by
nand_create_bbt() ones.

Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/raw/diskonchip.c          | 4 ++--
 drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c | 2 +-
 drivers/mtd/nand/raw/nand_base.c           | 9 +--------
 drivers/mtd/nand/raw/nandsim.c             | 2 +-
 include/linux/mtd/rawnand.h                | 2 --
 5 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/diskonchip.c b/drivers/mtd/nand/raw/diskonchip.c
index 8d10061abb4b..3c46188dd6d2 100644
--- a/drivers/mtd/nand/raw/diskonchip.c
+++ b/drivers/mtd/nand/raw/diskonchip.c
@@ -1291,7 +1291,7 @@ static int __init nftl_scan_bbt(struct mtd_info *mtd)
 		this->bbt_md = NULL;
 	}
 
-	ret = this->scan_bbt(mtd);
+	ret = nand_create_bbt(this);
 	if (ret)
 		return ret;
 
@@ -1338,7 +1338,7 @@ static int __init inftl_scan_bbt(struct mtd_info *mtd)
 		this->bbt_md->pattern = "TBB_SYSM";
 	}
 
-	ret = this->scan_bbt(mtd);
+	ret = nand_create_bbt(this);
 	if (ret)
 		return ret;
 
diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
index 58d4e4a93a93..599513321e71 100644
--- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
+++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c
@@ -1944,7 +1944,7 @@ static int gpmi_nand_init(struct gpmi_nand_data *this)
 	ret = nand_boot_init(this);
 	if (ret)
 		goto err_nand_cleanup;
-	ret = chip->scan_bbt(mtd);
+	ret = nand_create_bbt(chip);
 	if (ret)
 		goto err_nand_cleanup;
 
diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 0476e13d47b1..4fa5e20d9690 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -4924,11 +4924,6 @@ static void nand_shutdown(struct mtd_info *mtd)
 	nand_get_device(mtd, FL_PM_SUSPENDED);
 }
 
-static int nand_default_bbt(struct mtd_info *mtd)
-{
-	return nand_create_bbt(mtd_to_nand(mtd));
-}
-
 /* Set default functions */
 static void nand_set_defaults(struct nand_chip *chip)
 {
@@ -4970,8 +4965,6 @@ static void nand_set_defaults(struct nand_chip *chip)
 		chip->write_byte = busw ? nand_write_byte16 : nand_write_byte;
 	if (!chip->read_buf || chip->read_buf == nand_read_buf)
 		chip->read_buf = busw ? nand_read_buf16 : nand_read_buf;
-	if (!chip->scan_bbt)
-		chip->scan_bbt = nand_default_bbt;
 
 	if (!chip->controller) {
 		chip->controller = &chip->hwcontrol;
@@ -6673,7 +6666,7 @@ int nand_scan_tail(struct mtd_info *mtd)
 		return 0;
 
 	/* Build bad block table */
-	ret = chip->scan_bbt(mtd);
+	ret = nand_create_bbt(chip);
 	if (ret)
 		goto err_nand_manuf_cleanup;
 
diff --git a/drivers/mtd/nand/raw/nandsim.c b/drivers/mtd/nand/raw/nandsim.c
index f8edacde49ab..8a3b36cfe5ea 100644
--- a/drivers/mtd/nand/raw/nandsim.c
+++ b/drivers/mtd/nand/raw/nandsim.c
@@ -2337,7 +2337,7 @@ static int __init ns_init_module(void)
 	if ((retval = init_nandsim(nsmtd)) != 0)
 		goto err_exit;
 
-	if ((retval = chip->scan_bbt(nsmtd)) != 0)
+	if ((retval = nand_create_bbt(chip)) != 0)
 		goto err_exit;
 
 	if ((retval = parse_badblocks(nand, nsmtd)) != 0)
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 186f9fb1e7eb..ac0007ddadf2 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1199,7 +1199,6 @@ int nand_op_parser_exec_op(struct nand_chip *chip,
  * @buf_align:		minimum buffer alignment required by a platform
  * @hwcontrol:		platform-specific hardware control structure
  * @erase:		[REPLACEABLE] erase function
- * @scan_bbt:		[REPLACEABLE] function to scan bad block table
  * @chip_delay:		[BOARDSPECIFIC] chip dependent delay for transferring
  *			data from array to read regs (tR).
  * @state:		[INTERN] the current state of the NAND device
@@ -1292,7 +1291,6 @@ struct nand_chip {
 		       const struct nand_operation *op,
 		       bool check_only);
 	int (*erase)(struct mtd_info *mtd, int page);
-	int (*scan_bbt)(struct mtd_info *mtd);
 	int (*set_features)(struct mtd_info *mtd, struct nand_chip *chip,
 			    int feature_addr, uint8_t *subfeature_para);
 	int (*get_features)(struct mtd_info *mtd, struct nand_chip *chip,
-- 
cgit v1.2.3


From f0f01838f76420988b50b23f315704c9a5cd2fa9 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Thu, 5 Jul 2018 12:27:32 +0200
Subject: mtd: rawnand: orion_nand: Kill orion_nand_data.dev_ready()

None of the boards seem to overload the ->dev_ready() hook, just drop
this field from orion_nand_data.

Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/raw/orion_nand.c            | 3 ---
 include/linux/platform_data/mtd-orion_nand.h | 1 -
 2 files changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/orion_nand.c b/drivers/mtd/nand/raw/orion_nand.c
index 7825fd3ce66b..1a4828442675 100644
--- a/drivers/mtd/nand/raw/orion_nand.c
+++ b/drivers/mtd/nand/raw/orion_nand.c
@@ -153,9 +153,6 @@ static int __init orion_nand_probe(struct platform_device *pdev)
 	if (board->width == 16)
 		nc->options |= NAND_BUSWIDTH_16;
 
-	if (board->dev_ready)
-		nc->dev_ready = board->dev_ready;
-
 	platform_set_drvdata(pdev, info);
 
 	/* Not all platforms can gate the clock, so it is not
diff --git a/include/linux/platform_data/mtd-orion_nand.h b/include/linux/platform_data/mtd-orion_nand.h
index a7ce77c7c1a8..34828eb85982 100644
--- a/include/linux/platform_data/mtd-orion_nand.h
+++ b/include/linux/platform_data/mtd-orion_nand.h
@@ -12,7 +12,6 @@
  */
 struct orion_nand_data {
 	struct mtd_partition *parts;
-	int (*dev_ready)(struct mtd_info *mtd);
 	u32 nr_parts;
 	u8 ale;		/* address line number connected to ALE */
 	u8 cle;		/* address line number connected to CLE */
-- 
cgit v1.2.3


From dc2d8856a758627d4f126d17bc113eedd72b2d60 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Thu, 5 Jul 2018 12:27:33 +0200
Subject: mtd: rawnand: plat_nand: Kill pdata->ctrl.{hwcontrol, read_byte}()

None of the board files are overloading those hooks, so let's drop them
from struct platform_nand_ctrl.

Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Acked-by: Robert Jarzmik <robert.jarzmik@free.fr>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 arch/arm/mach-pxa/balloon3.c     | 1 -
 arch/arm/mach-pxa/em-x270.c      | 1 -
 drivers/mtd/nand/raw/plat_nand.c | 2 --
 include/linux/mtd/rawnand.h      | 4 ----
 4 files changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-pxa/balloon3.c b/arch/arm/mach-pxa/balloon3.c
index f4f8f23bda8c..af46d2182533 100644
--- a/arch/arm/mach-pxa/balloon3.c
+++ b/arch/arm/mach-pxa/balloon3.c
@@ -688,7 +688,6 @@ struct platform_nand_data balloon3_nand_pdata = {
 		.chip_delay	= 50,
 	},
 	.ctrl = {
-		.hwcontrol	= 0,
 		.dev_ready	= balloon3_nand_dev_ready,
 		.select_chip	= balloon3_nand_select_chip,
 		.cmd_ctrl	= balloon3_nand_cmd_ctl,
diff --git a/arch/arm/mach-pxa/em-x270.c b/arch/arm/mach-pxa/em-x270.c
index 49022ad338e9..29be04c6cc48 100644
--- a/arch/arm/mach-pxa/em-x270.c
+++ b/arch/arm/mach-pxa/em-x270.c
@@ -346,7 +346,6 @@ struct platform_nand_data em_x270_nand_platdata = {
 		.chip_delay = 20,
 	},
 	.ctrl = {
-		.hwcontrol = 0,
 		.dev_ready = em_x270_nand_device_ready,
 		.select_chip = 0,
 		.cmd_ctrl = em_x270_nand_cmd_ctl,
diff --git a/drivers/mtd/nand/raw/plat_nand.c b/drivers/mtd/nand/raw/plat_nand.c
index 925a1323604d..222626df4b96 100644
--- a/drivers/mtd/nand/raw/plat_nand.c
+++ b/drivers/mtd/nand/raw/plat_nand.c
@@ -67,12 +67,10 @@ static int plat_nand_probe(struct platform_device *pdev)
 	data->chip.select_chip = pdata->ctrl.select_chip;
 	data->chip.write_buf = pdata->ctrl.write_buf;
 	data->chip.read_buf = pdata->ctrl.read_buf;
-	data->chip.read_byte = pdata->ctrl.read_byte;
 	data->chip.chip_delay = pdata->chip.chip_delay;
 	data->chip.options |= pdata->chip.options;
 	data->chip.bbt_options |= pdata->chip.bbt_options;
 
-	data->chip.ecc.hwctl = pdata->ctrl.hwcontrol;
 	data->chip.ecc.mode = NAND_ECC_SOFT;
 	data->chip.ecc.algo = NAND_ECC_HAMMING;
 
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index ac0007ddadf2..11c2426fc363 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1572,14 +1572,12 @@ struct platform_device;
  * struct platform_nand_ctrl - controller level device structure
  * @probe:		platform specific function to probe/setup hardware
  * @remove:		platform specific function to remove/teardown hardware
- * @hwcontrol:		platform specific hardware control structure
  * @dev_ready:		platform specific function to read ready/busy pin
  * @select_chip:	platform specific chip select function
  * @cmd_ctrl:		platform specific function for controlling
  *			ALE/CLE/nCE. Also used to write command and address
  * @write_buf:		platform specific function for write buffer
  * @read_buf:		platform specific function for read buffer
- * @read_byte:		platform specific function to read one byte from chip
  * @priv:		private data to transport driver specific settings
  *
  * All fields are optional and depend on the hardware driver requirements
@@ -1587,13 +1585,11 @@ struct platform_device;
 struct platform_nand_ctrl {
 	int (*probe)(struct platform_device *pdev);
 	void (*remove)(struct platform_device *pdev);
-	void (*hwcontrol)(struct mtd_info *mtd, int cmd);
 	int (*dev_ready)(struct mtd_info *mtd);
 	void (*select_chip)(struct mtd_info *mtd, int chip);
 	void (*cmd_ctrl)(struct mtd_info *mtd, int dat, unsigned int ctrl);
 	void (*write_buf)(struct mtd_info *mtd, const uint8_t *buf, int len);
 	void (*read_buf)(struct mtd_info *mtd, uint8_t *buf, int len);
-	unsigned char (*read_byte)(struct mtd_info *mtd);
 	void *priv;
 };
 
-- 
cgit v1.2.3


From dc2865ac3527d76fe04ee1f1a3ffc101a60faba0 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Mon, 9 Jul 2018 22:09:40 +0200
Subject: MIPS: txx9: Move the ndfc.h header to
 include/linux/platform_data/txx9

This way we will be able to compile the ndfmc driver when
COMPILE_TEST=y.

Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Acked-by: Paul Burton <paul.burton@mips.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 arch/mips/include/asm/txx9/ndfmc.h       | 30 ------------------------------
 arch/mips/txx9/generic/setup.c           |  2 +-
 arch/mips/txx9/generic/setup_tx4938.c    |  2 +-
 arch/mips/txx9/generic/setup_tx4939.c    |  2 +-
 drivers/mtd/nand/raw/txx9ndfmc.c         |  2 +-
 include/linux/platform_data/txx9/ndfmc.h | 30 ++++++++++++++++++++++++++++++
 6 files changed, 34 insertions(+), 34 deletions(-)
 delete mode 100644 arch/mips/include/asm/txx9/ndfmc.h
 create mode 100644 include/linux/platform_data/txx9/ndfmc.h

(limited to 'include/linux')

diff --git a/arch/mips/include/asm/txx9/ndfmc.h b/arch/mips/include/asm/txx9/ndfmc.h
deleted file mode 100644
index fa67f3df78fc..000000000000
--- a/arch/mips/include/asm/txx9/ndfmc.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * (C) Copyright TOSHIBA CORPORATION 2007
- */
-#ifndef __ASM_TXX9_NDFMC_H
-#define __ASM_TXX9_NDFMC_H
-
-#define NDFMC_PLAT_FLAG_USE_BSPRT	0x01
-#define NDFMC_PLAT_FLAG_NO_RSTR		0x02
-#define NDFMC_PLAT_FLAG_HOLDADD		0x04
-#define NDFMC_PLAT_FLAG_DUMMYWRITE	0x08
-
-struct txx9ndfmc_platform_data {
-	unsigned int shift;
-	unsigned int gbus_clock;
-	unsigned int hold;		/* hold time in nanosecond */
-	unsigned int spw;		/* strobe pulse width in nanosecond */
-	unsigned int flags;
-	unsigned char ch_mask;		/* available channel bitmask */
-	unsigned char wp_mask;		/* write-protect bitmask */
-	unsigned char wide_mask;	/* 16bit-nand bitmask */
-};
-
-void txx9_ndfmc_init(unsigned long baseaddr,
-		     const struct txx9ndfmc_platform_data *plat_data);
-
-#endif /* __ASM_TXX9_NDFMC_H */
diff --git a/arch/mips/txx9/generic/setup.c b/arch/mips/txx9/generic/setup.c
index 1791a44ee570..aa47932abd28 100644
--- a/arch/mips/txx9/generic/setup.c
+++ b/arch/mips/txx9/generic/setup.c
@@ -20,6 +20,7 @@
 #include <linux/err.h>
 #include <linux/gpio/driver.h>
 #include <linux/platform_device.h>
+#include <linux/platform_data/txx9/ndfmc.h>
 #include <linux/serial_core.h>
 #include <linux/mtd/physmap.h>
 #include <linux/leds.h>
@@ -35,7 +36,6 @@
 #include <asm/txx9/generic.h>
 #include <asm/txx9/pci.h>
 #include <asm/txx9tmr.h>
-#include <asm/txx9/ndfmc.h>
 #include <asm/txx9/dmac.h>
 #ifdef CONFIG_CPU_TX49XX
 #include <asm/txx9/tx4938.h>
diff --git a/arch/mips/txx9/generic/setup_tx4938.c b/arch/mips/txx9/generic/setup_tx4938.c
index 85d1795652da..17395d5d15ca 100644
--- a/arch/mips/txx9/generic/setup_tx4938.c
+++ b/arch/mips/txx9/generic/setup_tx4938.c
@@ -17,13 +17,13 @@
 #include <linux/ptrace.h>
 #include <linux/mtd/physmap.h>
 #include <linux/platform_device.h>
+#include <linux/platform_data/txx9/ndfmc.h>
 #include <asm/reboot.h>
 #include <asm/traps.h>
 #include <asm/txx9irq.h>
 #include <asm/txx9tmr.h>
 #include <asm/txx9pio.h>
 #include <asm/txx9/generic.h>
-#include <asm/txx9/ndfmc.h>
 #include <asm/txx9/dmac.h>
 #include <asm/txx9/tx4938.h>
 
diff --git a/arch/mips/txx9/generic/setup_tx4939.c b/arch/mips/txx9/generic/setup_tx4939.c
index 274928987a21..360c388f4c82 100644
--- a/arch/mips/txx9/generic/setup_tx4939.c
+++ b/arch/mips/txx9/generic/setup_tx4939.c
@@ -21,13 +21,13 @@
 #include <linux/ptrace.h>
 #include <linux/mtd/physmap.h>
 #include <linux/platform_device.h>
+#include <linux/platform_data/txx9/ndfmc.h>
 #include <asm/bootinfo.h>
 #include <asm/reboot.h>
 #include <asm/traps.h>
 #include <asm/txx9irq.h>
 #include <asm/txx9tmr.h>
 #include <asm/txx9/generic.h>
-#include <asm/txx9/ndfmc.h>
 #include <asm/txx9/dmac.h>
 #include <asm/txx9/tx4939.h>
 
diff --git a/drivers/mtd/nand/raw/txx9ndfmc.c b/drivers/mtd/nand/raw/txx9ndfmc.c
index b567d212fe7d..04d57474ef97 100644
--- a/drivers/mtd/nand/raw/txx9ndfmc.c
+++ b/drivers/mtd/nand/raw/txx9ndfmc.c
@@ -20,7 +20,7 @@
 #include <linux/mtd/nand_ecc.h>
 #include <linux/mtd/partitions.h>
 #include <linux/io.h>
-#include <asm/txx9/ndfmc.h>
+#include <linux/platform_data/txx9/ndfmc.h>
 
 /* TXX9 NDFMC Registers */
 #define TXX9_NDFDTR	0x00
diff --git a/include/linux/platform_data/txx9/ndfmc.h b/include/linux/platform_data/txx9/ndfmc.h
new file mode 100644
index 000000000000..fc172627d54e
--- /dev/null
+++ b/include/linux/platform_data/txx9/ndfmc.h
@@ -0,0 +1,30 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * (C) Copyright TOSHIBA CORPORATION 2007
+ */
+#ifndef __TXX9_NDFMC_H
+#define __TXX9_NDFMC_H
+
+#define NDFMC_PLAT_FLAG_USE_BSPRT	0x01
+#define NDFMC_PLAT_FLAG_NO_RSTR		0x02
+#define NDFMC_PLAT_FLAG_HOLDADD		0x04
+#define NDFMC_PLAT_FLAG_DUMMYWRITE	0x08
+
+struct txx9ndfmc_platform_data {
+	unsigned int shift;
+	unsigned int gbus_clock;
+	unsigned int hold;		/* hold time in nanosecond */
+	unsigned int spw;		/* strobe pulse width in nanosecond */
+	unsigned int flags;
+	unsigned char ch_mask;		/* available channel bitmask */
+	unsigned char wp_mask;		/* write-protect bitmask */
+	unsigned char wide_mask;	/* 16bit-nand bitmask */
+};
+
+void txx9_ndfmc_init(unsigned long baseaddr,
+		     const struct txx9ndfmc_platform_data *plat_data);
+
+#endif /* __TXX9_NDFMC_H */
-- 
cgit v1.2.3


From e65e3a50702f4e7a01c0b36ff08d6ed8dde7ee7b Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Mon, 9 Jul 2018 22:09:42 +0200
Subject: MIPS: jz4740: Move jz4740_nand.h header to
 include/linux/platform_data/jz4740

This way we will be able to compile the jz4740_nand driver when
COMPILE_TEST=y.

Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Acked-by: Paul Burton <paul.burton@mips.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 arch/mips/include/asm/mach-jz4740/jz4740_nand.h  | 34 ------------------------
 arch/mips/jz4740/board-qi_lb60.c                 |  3 ++-
 drivers/mtd/nand/raw/jz4740_nand.c               |  2 +-
 include/linux/platform_data/jz4740/jz4740_nand.h | 34 ++++++++++++++++++++++++
 4 files changed, 37 insertions(+), 36 deletions(-)
 delete mode 100644 arch/mips/include/asm/mach-jz4740/jz4740_nand.h
 create mode 100644 include/linux/platform_data/jz4740/jz4740_nand.h

(limited to 'include/linux')

diff --git a/arch/mips/include/asm/mach-jz4740/jz4740_nand.h b/arch/mips/include/asm/mach-jz4740/jz4740_nand.h
deleted file mode 100644
index f381d465e768..000000000000
--- a/arch/mips/include/asm/mach-jz4740/jz4740_nand.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- *  Copyright (C) 2009-2010, Lars-Peter Clausen <lars@metafoo.de>
- *  JZ4740 SoC NAND controller driver
- *
- *  This program is free software; you can redistribute	 it and/or modify it
- *  under  the terms of	 the GNU General  Public License as published by the
- *  Free Software Foundation;  either version 2 of the	License, or (at your
- *  option) any later version.
- *
- *  You should have received a copy of the  GNU General Public License along
- *  with this program; if not, write  to the Free Software Foundation, Inc.,
- *  675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#ifndef __ASM_MACH_JZ4740_JZ4740_NAND_H__
-#define __ASM_MACH_JZ4740_JZ4740_NAND_H__
-
-#include <linux/mtd/rawnand.h>
-#include <linux/mtd/partitions.h>
-
-#define JZ_NAND_NUM_BANKS 4
-
-struct jz_nand_platform_data {
-	int			num_partitions;
-	struct mtd_partition	*partitions;
-
-	unsigned char banks[JZ_NAND_NUM_BANKS];
-
-	void (*ident_callback)(struct platform_device *, struct mtd_info *,
-				struct mtd_partition **, int *num_partitions);
-};
-
-#endif
diff --git a/arch/mips/jz4740/board-qi_lb60.c b/arch/mips/jz4740/board-qi_lb60.c
index 60f0767507c6..af0c8ace0141 100644
--- a/arch/mips/jz4740/board-qi_lb60.c
+++ b/arch/mips/jz4740/board-qi_lb60.c
@@ -29,10 +29,11 @@
 #include <linux/power/gpio-charger.h>
 #include <linux/pwm.h>
 
+#include <linux/platform_data/jz4740/jz4740_nand.h>
+
 #include <asm/mach-jz4740/gpio.h>
 #include <asm/mach-jz4740/jz4740_fb.h>
 #include <asm/mach-jz4740/jz4740_mmc.h>
-#include <asm/mach-jz4740/jz4740_nand.h>
 
 #include <linux/regulator/fixed.h>
 #include <linux/regulator/machine.h>
diff --git a/drivers/mtd/nand/raw/jz4740_nand.c b/drivers/mtd/nand/raw/jz4740_nand.c
index 613b00a9604b..a0254461812d 100644
--- a/drivers/mtd/nand/raw/jz4740_nand.c
+++ b/drivers/mtd/nand/raw/jz4740_nand.c
@@ -25,7 +25,7 @@
 
 #include <linux/gpio.h>
 
-#include <asm/mach-jz4740/jz4740_nand.h>
+#include <linux/platform_data/jz4740/jz4740_nand.h>
 
 #define JZ_REG_NAND_CTRL	0x50
 #define JZ_REG_NAND_ECC_CTRL	0x100
diff --git a/include/linux/platform_data/jz4740/jz4740_nand.h b/include/linux/platform_data/jz4740/jz4740_nand.h
new file mode 100644
index 000000000000..bc571f6d5ced
--- /dev/null
+++ b/include/linux/platform_data/jz4740/jz4740_nand.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (C) 2009-2010, Lars-Peter Clausen <lars@metafoo.de>
+ *  JZ4740 SoC NAND controller driver
+ *
+ *  This program is free software; you can redistribute	 it and/or modify it
+ *  under  the terms of	 the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the	License, or (at your
+ *  option) any later version.
+ *
+ *  You should have received a copy of the  GNU General Public License along
+ *  with this program; if not, write  to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __JZ4740_NAND_H__
+#define __JZ4740_NAND_H__
+
+#include <linux/mtd/rawnand.h>
+#include <linux/mtd/partitions.h>
+
+#define JZ_NAND_NUM_BANKS 4
+
+struct jz_nand_platform_data {
+	int			num_partitions;
+	struct mtd_partition	*partitions;
+
+	unsigned char banks[JZ_NAND_NUM_BANKS];
+
+	void (*ident_callback)(struct platform_device *, struct mtd_info *,
+				struct mtd_partition **, int *num_partitions);
+};
+
+#endif
-- 
cgit v1.2.3


From be2ab5b4d5c0bf041a34ec2e1397d50afbfb095e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 11 Jul 2018 13:45:12 +0200
Subject: netfilter: nf_tables: take module reference when starting a batch

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h | 1 +
 net/netfilter/nf_tables_api.c       | 1 +
 net/netfilter/nfnetlink.c           | 9 +++++++++
 3 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 3ecc3050be0e..4a520d3304a2 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -29,6 +29,7 @@ struct nfnetlink_subsystem {
 	__u8 subsys_id;			/* nfnetlink subsystem ID */
 	__u8 cb_count;			/* number of callbacks */
 	const struct nfnl_callback *cb;	/* callback for individual types */
+	struct module *owner;
 	int (*commit)(struct net *net, struct sk_buff *skb);
 	int (*abort)(struct net *net, struct sk_buff *skb);
 	void (*cleanup)(struct net *net);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 594b395442d6..c16c481fc52a 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6603,6 +6603,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.abort		= nf_tables_abort,
 	.cleanup	= nf_tables_cleanup,
 	.valid_genid	= nf_tables_valid_genid,
+	.owner		= THIS_MODULE,
 };
 
 int nft_chain_validate_dependency(const struct nft_chain *chain,
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 94f9bcaa0799..dd1d7bc23b03 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -337,7 +337,14 @@ replay:
 		return kfree_skb(skb);
 	}
 
+	if (!try_module_get(ss->owner)) {
+		nfnl_unlock(subsys_id);
+		netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
+		return kfree_skb(skb);
+	}
+
 	if (!ss->valid_genid(net, genid)) {
+		module_put(ss->owner);
 		nfnl_unlock(subsys_id);
 		netlink_ack(oskb, nlh, -ERESTART, NULL);
 		return kfree_skb(skb);
@@ -472,6 +479,7 @@ done:
 		nfnl_err_reset(&err_list);
 		nfnl_unlock(subsys_id);
 		kfree_skb(skb);
+		module_put(ss->owner);
 		goto replay;
 	} else if (status == NFNL_BATCH_DONE) {
 		err = ss->commit(net, oskb);
@@ -491,6 +499,7 @@ done:
 	nfnl_err_deliver(&err_list, oskb);
 	nfnl_unlock(subsys_id);
 	kfree_skb(skb);
+	module_put(ss->owner);
 }
 
 static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
-- 
cgit v1.2.3


From e2861fa71641c6414831d628a1f4f793b6562580 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg59@google.com>
Date: Fri, 8 Jun 2018 14:57:42 -0700
Subject: evm: Don't deadlock if a crypto algorithm is unavailable

When EVM attempts to appraise a file signed with a crypto algorithm the
kernel doesn't have support for, it will cause the kernel to trigger a
module load. If the EVM policy includes appraisal of kernel modules this
will in turn call back into EVM - since EVM is holding a lock until the
crypto initialisation is complete, this triggers a deadlock. Add a
CRYPTO_NOLOAD flag and skip module loading if it's set, and add that flag
in the EVM case in order to fail gracefully with an error message
instead of deadlocking.

Signed-off-by: Matthew Garrett <mjg59@google.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
---
 crypto/api.c                        | 2 +-
 include/linux/crypto.h              | 5 +++++
 security/integrity/evm/evm_crypto.c | 3 ++-
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/api.c b/crypto/api.c
index 0ee632bba064..7aca9f86c5f3 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -229,7 +229,7 @@ static struct crypto_alg *crypto_larval_lookup(const char *name, u32 type,
 	mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD);
 
 	alg = crypto_alg_lookup(name, type, mask);
-	if (!alg) {
+	if (!alg && !(mask & CRYPTO_NOLOAD)) {
 		request_module("crypto-%s", name);
 
 		if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask &
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 6eb06101089f..e8839d3a7559 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -112,6 +112,11 @@
  */
 #define CRYPTO_ALG_OPTIONAL_KEY		0x00004000
 
+/*
+ * Don't trigger module loading
+ */
+#define CRYPTO_NOLOAD			0x00008000
+
 /*
  * Transform masks and values (for crt_flags).
  */
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index b60524310855..c20e3142b541 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -97,7 +97,8 @@ static struct shash_desc *init_desc(char type)
 		mutex_lock(&mutex);
 		if (*tfm)
 			goto out;
-		*tfm = crypto_alloc_shash(algo, 0, CRYPTO_ALG_ASYNC);
+		*tfm = crypto_alloc_shash(algo, 0,
+					  CRYPTO_ALG_ASYNC | CRYPTO_NOLOAD);
 		if (IS_ERR(*tfm)) {
 			rc = PTR_ERR(*tfm);
 			pr_err("Can not allocate %s (reason: %ld)\n", algo, rc);
-- 
cgit v1.2.3


From 6eb864c1d9dd1ef32b88e03c3f49d8be0dab7dcf Mon Sep 17 00:00:00 2001
From: Mikhail Kurinnoi <viewizard@viewizard.com>
Date: Wed, 27 Jun 2018 16:33:42 +0300
Subject: integrity: prevent deadlock during digsig verification.

This patch aimed to prevent deadlock during digsig verification.The point
of issue - user space utility modprobe and/or it's dependencies (ld-*.so,
libz.so.*, libc-*.so and /lib/modules/ files) that could be used for
kernel modules load during digsig verification and could be signed by
digsig in the same time.

First at all, look at crypto_alloc_tfm() work algorithm:
crypto_alloc_tfm() will first attempt to locate an already loaded
algorithm. If that fails and the kernel supports dynamically loadable
modules, it will then attempt to load a module of the same name or alias.
If that fails it will send a query to any loaded crypto manager to
construct an algorithm on the fly.

We have situation, when public_key_verify_signature() in case of RSA
algorithm use alg_name to store internal information in order to construct
an algorithm on the fly, but crypto_larval_lookup() will try to use
alg_name in order to load kernel module with same name.

1) we can't do anything with crypto module work, since it designed to work
exactly in this way;
2) we can't globally filter module requests for modprobe, since it
designed to work with any requests.

In this patch, I propose add an exception for "crypto-pkcs1pad(rsa,*)"
module requests only in case of enabled integrity asymmetric keys support.
Since we don't have any real "crypto-pkcs1pad(rsa,*)" kernel modules for
sure, we are safe to fail such module request from crypto_larval_lookup().
In this way we prevent modprobe execution during digsig verification and
avoid possible deadlock if modprobe and/or it's dependencies also signed
with digsig.

Requested "crypto-pkcs1pad(rsa,*)" kernel module name formed by:
1) "pkcs1pad(rsa,%s)" in public_key_verify_signature();
2) "crypto-%s" / "crypto-%s-all" in crypto_larval_lookup().
"crypto-pkcs1pad(rsa," part of request is a constant and unique and could
be used as filter.

Signed-off-by: Mikhail Kurinnoi <viewizard@viewizard.com>
Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>

 include/linux/integrity.h              | 13 +++++++++++++
 security/integrity/digsig_asymmetric.c | 23 +++++++++++++++++++++++
 security/security.c                    |  7 ++++++-
 3 files changed, 42 insertions(+), 1 deletion(-)
---
 include/linux/integrity.h              | 13 +++++++++++++
 security/integrity/digsig_asymmetric.c | 23 +++++++++++++++++++++++
 security/security.c                    |  7 ++++++-
 3 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/integrity.h b/include/linux/integrity.h
index 858d3f4a2241..54c853ec2fd1 100644
--- a/include/linux/integrity.h
+++ b/include/linux/integrity.h
@@ -44,4 +44,17 @@ static inline void integrity_load_keys(void)
 }
 #endif /* CONFIG_INTEGRITY */
 
+#ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS
+
+extern int integrity_kernel_module_request(char *kmod_name);
+
+#else
+
+static inline int integrity_kernel_module_request(char *kmod_name)
+{
+	return 0;
+}
+
+#endif /* CONFIG_INTEGRITY_ASYMMETRIC_KEYS */
+
 #endif /* _LINUX_INTEGRITY_H */
diff --git a/security/integrity/digsig_asymmetric.c b/security/integrity/digsig_asymmetric.c
index ab6a029062a1..6dc075144508 100644
--- a/security/integrity/digsig_asymmetric.c
+++ b/security/integrity/digsig_asymmetric.c
@@ -115,3 +115,26 @@ int asymmetric_verify(struct key *keyring, const char *sig,
 	pr_debug("%s() = %d\n", __func__, ret);
 	return ret;
 }
+
+/**
+ * integrity_kernel_module_request - prevent crypto-pkcs1pad(rsa,*) requests
+ * @kmod_name: kernel module name
+ *
+ * We have situation, when public_key_verify_signature() in case of RSA
+ * algorithm use alg_name to store internal information in order to
+ * construct an algorithm on the fly, but crypto_larval_lookup() will try
+ * to use alg_name in order to load kernel module with same name.
+ * Since we don't have any real "crypto-pkcs1pad(rsa,*)" kernel modules,
+ * we are safe to fail such module request from crypto_larval_lookup().
+ *
+ * In this way we prevent modprobe execution during digsig verification
+ * and avoid possible deadlock if modprobe and/or it's dependencies
+ * also signed with digsig.
+ */
+int integrity_kernel_module_request(char *kmod_name)
+{
+	if (strncmp(kmod_name, "crypto-pkcs1pad(rsa,", 20) == 0)
+		return -EINVAL;
+
+	return 0;
+}
diff --git a/security/security.c b/security/security.c
index b49ee810371b..dbca03d3629b 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1032,7 +1032,12 @@ int security_kernel_create_files_as(struct cred *new, struct inode *inode)
 
 int security_kernel_module_request(char *kmod_name)
 {
-	return call_int_hook(kernel_module_request, 0, kmod_name);
+	int ret;
+
+	ret = call_int_hook(kernel_module_request, 0, kmod_name);
+	if (ret)
+		return ret;
+	return integrity_kernel_module_request(kmod_name);
 }
 
 int security_kernel_read_file(struct file *file, enum kernel_read_file_id id)
-- 
cgit v1.2.3


From d29ab6e1fa21ebc2a8a771015dd9e0e5d4e28dc1 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 13 Jul 2018 12:41:10 -0700
Subject: bpf: bpf_prog_array_alloc() should return a generic non-rcu pointer

Currently the return type of the bpf_prog_array_alloc() is
struct bpf_prog_array __rcu *, which is not quite correct.
Obviously, the returned pointer is a generic pointer, which
is valid for an indefinite amount of time and it's not shared
with anyone else, so there is no sense in marking it as __rcu.

This change eliminate the following sparse warnings:
kernel/bpf/core.c:1544:31: warning: incorrect type in return expression (different address spaces)
kernel/bpf/core.c:1544:31:    expected struct bpf_prog_array [noderef] <asn:4>*
kernel/bpf/core.c:1544:31:    got void *
kernel/bpf/core.c:1548:17: warning: incorrect type in return expression (different address spaces)
kernel/bpf/core.c:1548:17:    expected struct bpf_prog_array [noderef] <asn:4>*
kernel/bpf/core.c:1548:17:    got struct bpf_prog_array *<noident>
kernel/bpf/core.c:1681:15: warning: incorrect type in assignment (different address spaces)
kernel/bpf/core.c:1681:15:    expected struct bpf_prog_array *array
kernel/bpf/core.c:1681:15:    got struct bpf_prog_array [noderef] <asn:4>*

Fixes: 324bda9e6c5a ("bpf: multi program support for cgroup+bpf")
Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h | 2 +-
 kernel/bpf/core.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8827e797ff97..943fb08d8287 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -352,7 +352,7 @@ struct bpf_prog_array {
 	struct bpf_prog *progs[0];
 };
 
-struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
+struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
 int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
 int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1e5625d46414..253aa8e79c7b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1538,7 +1538,7 @@ static struct {
 	.null_prog = NULL,
 };
 
-struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
 {
 	if (prog_cnt)
 		return kzalloc(sizeof(struct bpf_prog_array) +
-- 
cgit v1.2.3


From 09728266b6f99ab57cd4f84f3eead65b7b65dbf7 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 17 Jul 2018 10:53:23 -0700
Subject: bpf: offload: rename bpf_offload_dev_match() to
 bpf_offload_prog_map_match()

A set of new API functions exported for the drivers will soon use
'bpf_offload_dev_' as a prefix.  Rename the bpf_offload_dev_match()
which is internal to the core (used by the verifier) to avoid any
confusion.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h   | 2 +-
 kernel/bpf/offload.c  | 2 +-
 kernel/bpf/verifier.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 943fb08d8287..9b010d9129f3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -648,7 +648,7 @@ int bpf_map_offload_delete_elem(struct bpf_map *map, void *key);
 int bpf_map_offload_get_next_key(struct bpf_map *map,
 				 void *key, void *next_key);
 
-bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map);
+bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index ac747d5cf7c6..6184e48703f4 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -468,7 +468,7 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
 	return 0;
 }
 
-bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
+bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
 {
 	struct bpf_offloaded_map *offmap;
 	struct bpf_prog_offload *offload;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9e2bf834f13a..15d69b278277 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5054,7 +5054,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 	}
 
 	if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
-	    !bpf_offload_dev_match(prog, map)) {
+	    !bpf_offload_prog_map_match(prog, map)) {
 		verbose(env, "offload device mismatch between prog and map\n");
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From 9fd7c5559165f4c679b40c5e6ad442955832dfad Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 17 Jul 2018 10:53:24 -0700
Subject: bpf: offload: aggregate offloads per-device

Currently we have two lists of offloaded objects - programs and maps.
Netdevice unregister notifier scans those lists to orphan objects
associated with device being unregistered.  This puts unnecessary
(even if negligible) burden on all netdev unregister calls in BPF-
-enabled kernel.  The lists of objects may potentially get long
making the linear scan even more problematic.  There haven't been
complaints about this mechanisms so far, but it is suboptimal.

Instead of relying on notifiers, make the few BPF-capable drivers
register explicitly for BPF offloads.  The programs and maps will
now be collected per-device not on a global list, and only scanned
for removal when driver unregisters from BPF offloads.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c |  13 +++
 drivers/net/netdevsim/bpf.c                   |   7 ++
 include/linux/bpf.h                           |   3 +
 kernel/bpf/offload.c                          | 142 +++++++++++++++++---------
 4 files changed, 119 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index b95b94d008cf..dee039ada75c 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -404,6 +404,16 @@ err_release_free:
 	return -EINVAL;
 }
 
+static int nfp_bpf_ndo_init(struct nfp_app *app, struct net_device *netdev)
+{
+	return bpf_offload_dev_netdev_register(netdev);
+}
+
+static void nfp_bpf_ndo_uninit(struct nfp_app *app, struct net_device *netdev)
+{
+	bpf_offload_dev_netdev_unregister(netdev);
+}
+
 static int nfp_bpf_init(struct nfp_app *app)
 {
 	struct nfp_app_bpf *bpf;
@@ -466,6 +476,9 @@ const struct nfp_app_type app_bpf = {
 
 	.extra_cap	= nfp_bpf_extra_cap,
 
+	.ndo_init	= nfp_bpf_ndo_init,
+	.ndo_uninit	= nfp_bpf_ndo_uninit,
+
 	.vnic_alloc	= nfp_bpf_vnic_alloc,
 	.vnic_free	= nfp_bpf_vnic_free,
 
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 357f9e62f306..c4a2829e0e1f 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -582,6 +582,8 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 
 int nsim_bpf_init(struct netdevsim *ns)
 {
+	int err;
+
 	if (ns->sdev->refcnt == 1) {
 		INIT_LIST_HEAD(&ns->sdev->bpf_bound_progs);
 		INIT_LIST_HEAD(&ns->sdev->bpf_bound_maps);
@@ -592,6 +594,10 @@ int nsim_bpf_init(struct netdevsim *ns)
 			return -ENOMEM;
 	}
 
+	err = bpf_offload_dev_netdev_register(ns->netdev);
+	if (err)
+		return err;
+
 	debugfs_create_u32("bpf_offloaded_id", 0400, ns->ddir,
 			   &ns->bpf_offloaded_id);
 
@@ -625,6 +631,7 @@ void nsim_bpf_uninit(struct netdevsim *ns)
 	WARN_ON(ns->xdp.prog);
 	WARN_ON(ns->xdp_hw.prog);
 	WARN_ON(ns->bpf_offloaded);
+	bpf_offload_dev_netdev_unregister(ns->netdev);
 
 	if (ns->sdev->refcnt == 1) {
 		WARN_ON(!list_empty(&ns->sdev->bpf_bound_progs));
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9b010d9129f3..aa2e834a122b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -650,6 +650,9 @@ int bpf_map_offload_get_next_key(struct bpf_map *map,
 
 bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map);
 
+int bpf_offload_dev_netdev_register(struct net_device *netdev);
+void bpf_offload_dev_netdev_unregister(struct net_device *netdev);
+
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
 
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 6184e48703f4..cd64a26807aa 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -18,19 +18,37 @@
 #include <linux/bug.h>
 #include <linux/kdev_t.h>
 #include <linux/list.h>
+#include <linux/lockdep.h>
 #include <linux/netdevice.h>
 #include <linux/printk.h>
 #include <linux/proc_ns.h>
+#include <linux/rhashtable.h>
 #include <linux/rtnetlink.h>
 #include <linux/rwsem.h>
 
-/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
+/* Protects offdevs, members of bpf_offload_netdev and offload members
  * of all progs.
  * RTNL lock cannot be taken when holding this lock.
  */
 static DECLARE_RWSEM(bpf_devs_lock);
-static LIST_HEAD(bpf_prog_offload_devs);
-static LIST_HEAD(bpf_map_offload_devs);
+
+struct bpf_offload_netdev {
+	struct rhash_head l;
+	struct net_device *netdev;
+	struct list_head progs;
+	struct list_head maps;
+};
+
+static const struct rhashtable_params offdevs_params = {
+	.nelem_hint		= 4,
+	.key_len		= sizeof(struct net_device *),
+	.key_offset		= offsetof(struct bpf_offload_netdev, netdev),
+	.head_offset		= offsetof(struct bpf_offload_netdev, l),
+	.automatic_shrinking	= true,
+};
+
+static struct rhashtable offdevs;
+static bool offdevs_inited;
 
 static int bpf_dev_offload_check(struct net_device *netdev)
 {
@@ -41,8 +59,19 @@ static int bpf_dev_offload_check(struct net_device *netdev)
 	return 0;
 }
 
+static struct bpf_offload_netdev *
+bpf_offload_find_netdev(struct net_device *netdev)
+{
+	lockdep_assert_held(&bpf_devs_lock);
+
+	if (!offdevs_inited)
+		return NULL;
+	return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+}
+
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 {
+	struct bpf_offload_netdev *ondev;
 	struct bpf_prog_offload *offload;
 	int err;
 
@@ -66,12 +95,13 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 		goto err_maybe_put;
 
 	down_write(&bpf_devs_lock);
-	if (offload->netdev->reg_state != NETREG_REGISTERED) {
+	ondev = bpf_offload_find_netdev(offload->netdev);
+	if (!ondev) {
 		err = -EINVAL;
 		goto err_unlock;
 	}
 	prog->aux->offload = offload;
-	list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
+	list_add_tail(&offload->offloads, &ondev->progs);
 	dev_put(offload->netdev);
 	up_write(&bpf_devs_lock);
 
@@ -294,6 +324,7 @@ static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
 struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
 {
 	struct net *net = current->nsproxy->net_ns;
+	struct bpf_offload_netdev *ondev;
 	struct bpf_offloaded_map *offmap;
 	int err;
 
@@ -316,11 +347,17 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto err_unlock;
 
+	ondev = bpf_offload_find_netdev(offmap->netdev);
+	if (!ondev) {
+		err = -EINVAL;
+		goto err_unlock;
+	}
+
 	err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
 	if (err)
 		goto err_unlock;
 
-	list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
+	list_add_tail(&offmap->offloads, &ondev->maps);
 	up_write(&bpf_devs_lock);
 	rtnl_unlock();
 
@@ -489,56 +526,69 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
 	return ret;
 }
 
-static void bpf_offload_orphan_all_progs(struct net_device *netdev)
+int bpf_offload_dev_netdev_register(struct net_device *netdev)
 {
-	struct bpf_prog_offload *offload, *tmp;
+	struct bpf_offload_netdev *ondev;
+	int err;
 
-	list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
-		if (offload->netdev == netdev)
-			__bpf_prog_offload_destroy(offload->prog);
-}
+	down_write(&bpf_devs_lock);
+	if (!offdevs_inited) {
+		err = rhashtable_init(&offdevs, &offdevs_params);
+		if (err)
+			return err;
+		offdevs_inited = true;
+	}
+	up_write(&bpf_devs_lock);
 
-static void bpf_offload_orphan_all_maps(struct net_device *netdev)
-{
-	struct bpf_offloaded_map *offmap, *tmp;
+	ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
+	if (!ondev)
+		return -ENOMEM;
+
+	ondev->netdev = netdev;
+	INIT_LIST_HEAD(&ondev->progs);
+	INIT_LIST_HEAD(&ondev->maps);
+
+	down_write(&bpf_devs_lock);
+	err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
+	if (err) {
+		netdev_warn(netdev, "failed to register for BPF offload\n");
+		goto err_unlock_free;
+	}
 
-	list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
-		if (offmap->netdev == netdev)
-			__bpf_map_offload_destroy(offmap);
+	up_write(&bpf_devs_lock);
+	return 0;
+
+err_unlock_free:
+	up_write(&bpf_devs_lock);
+	kfree(ondev);
+	return err;
 }
+EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
 
-static int bpf_offload_notification(struct notifier_block *notifier,
-				    ulong event, void *ptr)
+void bpf_offload_dev_netdev_unregister(struct net_device *netdev)
 {
-	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+	struct bpf_offloaded_map *offmap, *mtmp;
+	struct bpf_prog_offload *offload, *ptmp;
+	struct bpf_offload_netdev *ondev;
 
 	ASSERT_RTNL();
 
-	switch (event) {
-	case NETDEV_UNREGISTER:
-		/* ignore namespace changes */
-		if (netdev->reg_state != NETREG_UNREGISTERING)
-			break;
-
-		down_write(&bpf_devs_lock);
-		bpf_offload_orphan_all_progs(netdev);
-		bpf_offload_orphan_all_maps(netdev);
-		up_write(&bpf_devs_lock);
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_OK;
-}
+	down_write(&bpf_devs_lock);
+	ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+	if (WARN_ON(!ondev))
+		goto unlock;
 
-static struct notifier_block bpf_offload_notifier = {
-	.notifier_call = bpf_offload_notification,
-};
+	WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
 
-static int __init bpf_offload_init(void)
-{
-	register_netdevice_notifier(&bpf_offload_notifier);
-	return 0;
-}
+	list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
+		__bpf_prog_offload_destroy(offload->prog);
+	list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
+		__bpf_map_offload_destroy(offmap);
 
-subsys_initcall(bpf_offload_init);
+	WARN_ON(!list_empty(&ondev->progs));
+	WARN_ON(!list_empty(&ondev->maps));
+	kfree(ondev);
+unlock:
+	up_write(&bpf_devs_lock);
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
-- 
cgit v1.2.3


From 602144c224604f1cbff02ee2d1cf46825269ecbd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 17 Jul 2018 10:53:25 -0700
Subject: bpf: offload: keep the offload state per-ASIC

Create a higher-level entity to represent a device/ASIC to allow
programs and maps to be shared between device ports.  The extra
work is required to make sure we don't destroy BPF objects as
soon as the netdev for which they were loaded gets destroyed,
as other ports may still be using them.  When netdev goes away
all of its BPF objects will be moved to other netdevs of the
device, and only destroyed when last netdev is unregistered.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c | 14 ++++-
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  4 ++
 drivers/net/netdevsim/bpf.c                   | 17 +++++-
 drivers/net/netdevsim/netdevsim.h             |  3 +
 include/linux/bpf.h                           |  9 ++-
 kernel/bpf/offload.c                          | 81 +++++++++++++++++++++------
 6 files changed, 104 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index dee039ada75c..458f49235d06 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -406,12 +406,16 @@ err_release_free:
 
 static int nfp_bpf_ndo_init(struct nfp_app *app, struct net_device *netdev)
 {
-	return bpf_offload_dev_netdev_register(netdev);
+	struct nfp_app_bpf *bpf = app->priv;
+
+	return bpf_offload_dev_netdev_register(bpf->bpf_dev, netdev);
 }
 
 static void nfp_bpf_ndo_uninit(struct nfp_app *app, struct net_device *netdev)
 {
-	bpf_offload_dev_netdev_unregister(netdev);
+	struct nfp_app_bpf *bpf = app->priv;
+
+	bpf_offload_dev_netdev_unregister(bpf->bpf_dev, netdev);
 }
 
 static int nfp_bpf_init(struct nfp_app *app)
@@ -437,6 +441,11 @@ static int nfp_bpf_init(struct nfp_app *app)
 	if (err)
 		goto err_free_neutral_maps;
 
+	bpf->bpf_dev = bpf_offload_dev_create();
+	err = PTR_ERR_OR_ZERO(bpf->bpf_dev);
+	if (err)
+		goto err_free_neutral_maps;
+
 	return 0;
 
 err_free_neutral_maps:
@@ -455,6 +464,7 @@ static void nfp_bpf_clean(struct nfp_app *app)
 {
 	struct nfp_app_bpf *bpf = app->priv;
 
+	bpf_offload_dev_destroy(bpf->bpf_dev);
 	WARN_ON(!skb_queue_empty(&bpf->cmsg_replies));
 	WARN_ON(!list_empty(&bpf->map_list));
 	WARN_ON(bpf->maps_in_use || bpf->map_elems_in_use);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 9845c1a2d4c2..bec935468f90 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -110,6 +110,8 @@ enum pkt_vec {
  * struct nfp_app_bpf - bpf app priv structure
  * @app:		backpointer to the app
  *
+ * @bpf_dev:		BPF offload device handle
+ *
  * @tag_allocator:	bitmap of control message tags in use
  * @tag_alloc_next:	next tag bit to allocate
  * @tag_alloc_last:	next tag bit to be freed
@@ -150,6 +152,8 @@ enum pkt_vec {
 struct nfp_app_bpf {
 	struct nfp_app *app;
 
+	struct bpf_offload_dev *bpf_dev;
+
 	DECLARE_BITMAP(tag_allocator, U16_MAX + 1);
 	u16 tag_alloc_next;
 	u16 tag_alloc_last;
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index c4a2829e0e1f..9eab29f67a0e 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -592,11 +592,16 @@ int nsim_bpf_init(struct netdevsim *ns)
 			debugfs_create_dir("bpf_bound_progs", ns->sdev->ddir);
 		if (IS_ERR_OR_NULL(ns->sdev->ddir_bpf_bound_progs))
 			return -ENOMEM;
+
+		ns->sdev->bpf_dev = bpf_offload_dev_create();
+		err = PTR_ERR_OR_ZERO(ns->sdev->bpf_dev);
+		if (err)
+			return err;
 	}
 
-	err = bpf_offload_dev_netdev_register(ns->netdev);
+	err = bpf_offload_dev_netdev_register(ns->sdev->bpf_dev, ns->netdev);
 	if (err)
-		return err;
+		goto err_destroy_bdev;
 
 	debugfs_create_u32("bpf_offloaded_id", 0400, ns->ddir,
 			   &ns->bpf_offloaded_id);
@@ -624,6 +629,11 @@ int nsim_bpf_init(struct netdevsim *ns)
 			    &ns->bpf_map_accept);
 
 	return 0;
+
+err_destroy_bdev:
+	if (ns->sdev->refcnt == 1)
+		bpf_offload_dev_destroy(ns->sdev->bpf_dev);
+	return err;
 }
 
 void nsim_bpf_uninit(struct netdevsim *ns)
@@ -631,10 +641,11 @@ void nsim_bpf_uninit(struct netdevsim *ns)
 	WARN_ON(ns->xdp.prog);
 	WARN_ON(ns->xdp_hw.prog);
 	WARN_ON(ns->bpf_offloaded);
-	bpf_offload_dev_netdev_unregister(ns->netdev);
+	bpf_offload_dev_netdev_unregister(ns->sdev->bpf_dev, ns->netdev);
 
 	if (ns->sdev->refcnt == 1) {
 		WARN_ON(!list_empty(&ns->sdev->bpf_bound_progs));
 		WARN_ON(!list_empty(&ns->sdev->bpf_bound_maps));
+		bpf_offload_dev_destroy(ns->sdev->bpf_dev);
 	}
 }
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index 98f26fa1e671..02be199eb005 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -27,6 +27,7 @@
 #define NSIM_EA(extack, msg)	NL_SET_ERR_MSG_MOD((extack), msg)
 
 struct bpf_prog;
+struct bpf_offload_dev;
 struct dentry;
 struct nsim_vf_config;
 
@@ -36,6 +37,8 @@ struct netdevsim_shared_dev {
 
 	struct dentry *ddir;
 
+	struct bpf_offload_dev *bpf_dev;
+
 	struct dentry *ddir_bpf_bound_progs;
 	u32 prog_id_gen;
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index aa2e834a122b..913465e45062 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -85,6 +85,7 @@ struct bpf_map {
 	char name[BPF_OBJ_NAME_LEN];
 };
 
+struct bpf_offload_dev;
 struct bpf_offloaded_map;
 
 struct bpf_map_dev_ops {
@@ -650,8 +651,12 @@ int bpf_map_offload_get_next_key(struct bpf_map *map,
 
 bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map);
 
-int bpf_offload_dev_netdev_register(struct net_device *netdev);
-void bpf_offload_dev_netdev_unregister(struct net_device *netdev);
+struct bpf_offload_dev *bpf_offload_dev_create(void);
+void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev);
+int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
+				    struct net_device *netdev);
+void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
+				       struct net_device *netdev);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index cd64a26807aa..925575f64ff1 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -32,11 +32,17 @@
  */
 static DECLARE_RWSEM(bpf_devs_lock);
 
+struct bpf_offload_dev {
+	struct list_head netdevs;
+};
+
 struct bpf_offload_netdev {
 	struct rhash_head l;
 	struct net_device *netdev;
+	struct bpf_offload_dev *offdev;
 	struct list_head progs;
 	struct list_head maps;
+	struct list_head offdev_netdevs;
 };
 
 static const struct rhashtable_params offdevs_params = {
@@ -526,25 +532,18 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
 	return ret;
 }
 
-int bpf_offload_dev_netdev_register(struct net_device *netdev)
+int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
+				    struct net_device *netdev)
 {
 	struct bpf_offload_netdev *ondev;
 	int err;
 
-	down_write(&bpf_devs_lock);
-	if (!offdevs_inited) {
-		err = rhashtable_init(&offdevs, &offdevs_params);
-		if (err)
-			return err;
-		offdevs_inited = true;
-	}
-	up_write(&bpf_devs_lock);
-
 	ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
 	if (!ondev)
 		return -ENOMEM;
 
 	ondev->netdev = netdev;
+	ondev->offdev = offdev;
 	INIT_LIST_HEAD(&ondev->progs);
 	INIT_LIST_HEAD(&ondev->maps);
 
@@ -555,6 +554,7 @@ int bpf_offload_dev_netdev_register(struct net_device *netdev)
 		goto err_unlock_free;
 	}
 
+	list_add(&ondev->offdev_netdevs, &offdev->netdevs);
 	up_write(&bpf_devs_lock);
 	return 0;
 
@@ -565,11 +565,12 @@ err_unlock_free:
 }
 EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
 
-void bpf_offload_dev_netdev_unregister(struct net_device *netdev)
+void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
+				       struct net_device *netdev)
 {
+	struct bpf_offload_netdev *ondev, *altdev;
 	struct bpf_offloaded_map *offmap, *mtmp;
 	struct bpf_prog_offload *offload, *ptmp;
-	struct bpf_offload_netdev *ondev;
 
 	ASSERT_RTNL();
 
@@ -579,11 +580,26 @@ void bpf_offload_dev_netdev_unregister(struct net_device *netdev)
 		goto unlock;
 
 	WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
-
-	list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
-		__bpf_prog_offload_destroy(offload->prog);
-	list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
-		__bpf_map_offload_destroy(offmap);
+	list_del(&ondev->offdev_netdevs);
+
+	/* Try to move the objects to another netdev of the device */
+	altdev = list_first_entry_or_null(&offdev->netdevs,
+					  struct bpf_offload_netdev,
+					  offdev_netdevs);
+	if (altdev) {
+		list_for_each_entry(offload, &ondev->progs, offloads)
+			offload->netdev = altdev->netdev;
+		list_splice_init(&ondev->progs, &altdev->progs);
+
+		list_for_each_entry(offmap, &ondev->maps, offloads)
+			offmap->netdev = altdev->netdev;
+		list_splice_init(&ondev->maps, &altdev->maps);
+	} else {
+		list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
+			__bpf_prog_offload_destroy(offload->prog);
+		list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
+			__bpf_map_offload_destroy(offmap);
+	}
 
 	WARN_ON(!list_empty(&ondev->progs));
 	WARN_ON(!list_empty(&ondev->maps));
@@ -592,3 +608,34 @@ unlock:
 	up_write(&bpf_devs_lock);
 }
 EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
+
+struct bpf_offload_dev *bpf_offload_dev_create(void)
+{
+	struct bpf_offload_dev *offdev;
+	int err;
+
+	down_write(&bpf_devs_lock);
+	if (!offdevs_inited) {
+		err = rhashtable_init(&offdevs, &offdevs_params);
+		if (err)
+			return ERR_PTR(err);
+		offdevs_inited = true;
+	}
+	up_write(&bpf_devs_lock);
+
+	offdev = kzalloc(sizeof(*offdev), GFP_KERNEL);
+	if (!offdev)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&offdev->netdevs);
+
+	return offdev;
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_create);
+
+void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev)
+{
+	WARN_ON(!list_empty(&offdev->netdevs));
+	kfree(offdev);
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy);
-- 
cgit v1.2.3


From fd4f227dea0f24d89f52f7c4eb3207f84ddcbcbd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 17 Jul 2018 10:53:26 -0700
Subject: bpf: offload: allow program and map sharing per-ASIC

Allow programs and maps to be re-used across different netdevs,
as long as they belong to the same struct bpf_offload_dev.
Update the bpf_offload_prog_map_match() helper for the verifier
and export a new helper for the drivers to use when checking
programs at attachment time.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h  |  1 +
 kernel/bpf/offload.c | 42 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 36 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 913465e45062..5b5ad95cf339 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -657,6 +657,7 @@ int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
 				    struct net_device *netdev);
 void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
 				       struct net_device *netdev);
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 925575f64ff1..177a52436394 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -511,22 +511,50 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
 	return 0;
 }
 
-bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
+static bool __bpf_offload_dev_match(struct bpf_prog *prog,
+				    struct net_device *netdev)
 {
-	struct bpf_offloaded_map *offmap;
+	struct bpf_offload_netdev *ondev1, *ondev2;
 	struct bpf_prog_offload *offload;
-	bool ret;
 
 	if (!bpf_prog_is_dev_bound(prog->aux))
 		return false;
-	if (!bpf_map_is_dev_bound(map))
-		return bpf_map_offload_neutral(map);
 
-	down_read(&bpf_devs_lock);
 	offload = prog->aux->offload;
+	if (!offload)
+		return false;
+	if (offload->netdev == netdev)
+		return true;
+
+	ondev1 = bpf_offload_find_netdev(offload->netdev);
+	ondev2 = bpf_offload_find_netdev(netdev);
+
+	return ondev1 && ondev2 && ondev1->offdev == ondev2->offdev;
+}
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev)
+{
+	bool ret;
+
+	down_read(&bpf_devs_lock);
+	ret = __bpf_offload_dev_match(prog, netdev);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_match);
+
+bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
+{
+	struct bpf_offloaded_map *offmap;
+	bool ret;
+
+	if (!bpf_map_is_dev_bound(map))
+		return bpf_map_offload_neutral(map);
 	offmap = map_to_offmap(map);
 
-	ret = offload && offload->netdev == offmap->netdev;
+	down_read(&bpf_devs_lock);
+	ret = __bpf_offload_dev_match(prog, offmap->netdev);
 	up_read(&bpf_devs_lock);
 
 	return ret;
-- 
cgit v1.2.3


From 3f289dcb4b265416a57ca79cf4a324060bb09060 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 18 Jul 2018 04:47:36 -0700
Subject: block: make bdev_ops->rw_page() take a REQ_OP instead of bool

c11f0c0b5bb9 ("block/mm: make bdev_ops->rw_page() take a bool for
read/write") replaced @op with boolean @is_write, which limited the
amount of information going into ->rw_page() and more importantly
page_endio(), which removed the need to expose block internals to mm.

Unfortunately, we want to track discards separately and @is_write
isn't enough information.  This patch updates bdev_ops->rw_page() to
take REQ_OP instead but leaves page_endio() to take bool @is_write.
This allows the block part of operations to have enough information
while not leaking it to mm.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Mike Christie <mchristi@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/brd.c           | 14 +++++++-------
 drivers/block/zram/zram_drv.c | 16 ++++++++--------
 drivers/nvdimm/btt.c          | 12 ++++++------
 drivers/nvdimm/pmem.c         | 13 ++++++-------
 fs/block_dev.c                |  6 ++++--
 fs/mpage.c                    |  4 ++--
 include/linux/blkdev.h        |  2 +-
 7 files changed, 34 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index bb976598ee43..df8103dd40ac 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -254,20 +254,20 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
  * Process a single bvec of a bio.
  */
 static int brd_do_bvec(struct brd_device *brd, struct page *page,
-			unsigned int len, unsigned int off, bool is_write,
+			unsigned int len, unsigned int off, unsigned int op,
 			sector_t sector)
 {
 	void *mem;
 	int err = 0;
 
-	if (is_write) {
+	if (op_is_write(op)) {
 		err = copy_to_brd_setup(brd, sector, len);
 		if (err)
 			goto out;
 	}
 
 	mem = kmap_atomic(page);
-	if (!is_write) {
+	if (!op_is_write(op)) {
 		copy_from_brd(mem + off, brd, sector, len);
 		flush_dcache_page(page);
 	} else {
@@ -296,7 +296,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
 		int err;
 
 		err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
-					op_is_write(bio_op(bio)), sector);
+				  bio_op(bio), sector);
 		if (err)
 			goto io_error;
 		sector += len >> SECTOR_SHIFT;
@@ -310,15 +310,15 @@ io_error:
 }
 
 static int brd_rw_page(struct block_device *bdev, sector_t sector,
-		       struct page *page, bool is_write)
+		       struct page *page, unsigned int op)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
 	int err;
 
 	if (PageTransHuge(page))
 		return -ENOTSUPP;
-	err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
-	page_endio(page, is_write, err);
+	err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
+	page_endio(page, op_is_write(op), err);
 	return err;
 }
 
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 7436b2d27fa3..78c29044684a 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1274,17 +1274,17 @@ static void zram_bio_discard(struct zram *zram, u32 index,
  * Returns 1 if IO request was successfully submitted.
  */
 static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
-			int offset, bool is_write, struct bio *bio)
+			int offset, unsigned int op, struct bio *bio)
 {
 	unsigned long start_time = jiffies;
-	int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
+	int rw_acct = op_is_write(op) ? REQ_OP_WRITE : REQ_OP_READ;
 	struct request_queue *q = zram->disk->queue;
 	int ret;
 
 	generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT,
 			&zram->disk->part0);
 
-	if (!is_write) {
+	if (!op_is_write(op)) {
 		atomic64_inc(&zram->stats.num_reads);
 		ret = zram_bvec_read(zram, bvec, index, offset, bio);
 		flush_dcache_page(bvec->bv_page);
@@ -1300,7 +1300,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 	zram_slot_unlock(zram, index);
 
 	if (unlikely(ret < 0)) {
-		if (!is_write)
+		if (!op_is_write(op))
 			atomic64_inc(&zram->stats.failed_reads);
 		else
 			atomic64_inc(&zram->stats.failed_writes);
@@ -1338,7 +1338,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
 			bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
 							unwritten);
 			if (zram_bvec_rw(zram, &bv, index, offset,
-					op_is_write(bio_op(bio)), bio) < 0)
+					 bio_op(bio), bio) < 0)
 				goto out;
 
 			bv.bv_offset += bv.bv_len;
@@ -1390,7 +1390,7 @@ static void zram_slot_free_notify(struct block_device *bdev,
 }
 
 static int zram_rw_page(struct block_device *bdev, sector_t sector,
-		       struct page *page, bool is_write)
+		       struct page *page, unsigned int op)
 {
 	int offset, ret;
 	u32 index;
@@ -1414,7 +1414,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	bv.bv_len = PAGE_SIZE;
 	bv.bv_offset = 0;
 
-	ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
+	ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
 out:
 	/*
 	 * If I/O fails, just return error(ie, non-zero) without
@@ -1429,7 +1429,7 @@ out:
 
 	switch (ret) {
 	case 0:
-		page_endio(page, is_write, 0);
+		page_endio(page, op_is_write(op), 0);
 		break;
 	case 1:
 		ret = 0;
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 85de8053aa34..0360c015f658 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1423,11 +1423,11 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
 
 static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
 			struct page *page, unsigned int len, unsigned int off,
-			bool is_write, sector_t sector)
+			unsigned int op, sector_t sector)
 {
 	int ret;
 
-	if (!is_write) {
+	if (!op_is_write(op)) {
 		ret = btt_read_pg(btt, bip, page, off, sector, len);
 		flush_dcache_page(page);
 	} else {
@@ -1464,7 +1464,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
 		}
 
 		err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
-				  op_is_write(bio_op(bio)), iter.bi_sector);
+				  bio_op(bio), iter.bi_sector);
 		if (err) {
 			dev_err(&btt->nd_btt->dev,
 					"io error in %s sector %lld, len %d,\n",
@@ -1483,16 +1483,16 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
 }
 
 static int btt_rw_page(struct block_device *bdev, sector_t sector,
-		struct page *page, bool is_write)
+		struct page *page, unsigned int op)
 {
 	struct btt *btt = bdev->bd_disk->private_data;
 	int rc;
 	unsigned int len;
 
 	len = hpage_nr_pages(page) * PAGE_SIZE;
-	rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector);
+	rc = btt_do_bvec(btt, NULL, page, len, 0, op, sector);
 	if (rc == 0)
-		page_endio(page, is_write, 0);
+		page_endio(page, op_is_write(op), 0);
 
 	return rc;
 }
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 8b1fd7f1a224..dd17acd8fe68 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -120,7 +120,7 @@ static blk_status_t read_pmem(struct page *page, unsigned int off,
 }
 
 static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
-			unsigned int len, unsigned int off, bool is_write,
+			unsigned int len, unsigned int off, unsigned int op,
 			sector_t sector)
 {
 	blk_status_t rc = BLK_STS_OK;
@@ -131,7 +131,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
 		bad_pmem = true;
 
-	if (!is_write) {
+	if (!op_is_write(op)) {
 		if (unlikely(bad_pmem))
 			rc = BLK_STS_IOERR;
 		else {
@@ -180,8 +180,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 	do_acct = nd_iostat_start(bio, &start);
 	bio_for_each_segment(bvec, bio, iter) {
 		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
-				bvec.bv_offset, op_is_write(bio_op(bio)),
-				iter.bi_sector);
+				bvec.bv_offset, bio_op(bio), iter.bi_sector);
 		if (rc) {
 			bio->bi_status = rc;
 			break;
@@ -198,13 +197,13 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 }
 
 static int pmem_rw_page(struct block_device *bdev, sector_t sector,
-		       struct page *page, bool is_write)
+		       struct page *page, unsigned int op)
 {
 	struct pmem_device *pmem = bdev->bd_queue->queuedata;
 	blk_status_t rc;
 
 	rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
-			  0, is_write, sector);
+			  0, op, sector);
 
 	/*
 	 * The ->rw_page interface is subtle and tricky.  The core
@@ -213,7 +212,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 	 * caused by double completion.
 	 */
 	if (rc == 0)
-		page_endio(page, is_write, 0);
+		page_endio(page, op_is_write(op), 0);
 
 	return blk_status_to_errno(rc);
 }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0dd87aaeb39a..496fb51a1e1a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -665,7 +665,8 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
 	result = blk_queue_enter(bdev->bd_queue, 0);
 	if (result)
 		return result;
-	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
+	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+			      REQ_OP_READ);
 	blk_queue_exit(bdev->bd_queue);
 	return result;
 }
@@ -703,7 +704,8 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 		return result;
 
 	set_page_writeback(page);
-	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
+	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+			      REQ_OP_WRITE);
 	if (result) {
 		end_page_writeback(page);
 	} else {
diff --git a/fs/mpage.c b/fs/mpage.c
index b7e7f570733a..b73638db9866 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -51,8 +51,8 @@ static void mpage_end_io(struct bio *bio)
 
 	bio_for_each_segment_all(bv, bio, i) {
 		struct page *page = bv->bv_page;
-		page_endio(page, op_is_write(bio_op(bio)),
-				blk_status_to_errno(bio->bi_status));
+		page_endio(page, bio_op(bio),
+			   blk_status_to_errno(bio->bi_status));
 	}
 
 	bio_put(bio);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1939ed95f936..331a6cb8805f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1943,7 +1943,7 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
 struct block_device_operations {
 	int (*open) (struct block_device *, fmode_t);
 	void (*release) (struct gendisk *, fmode_t);
-	int (*rw_page)(struct block_device *, sector_t, struct page *, bool);
+	int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	unsigned int (*check_events) (struct gendisk *disk,
-- 
cgit v1.2.3


From 59767fbd49d794b4499d30b314df6c0d4aca584b Mon Sep 17 00:00:00 2001
From: Michael Callahan <michaelcallahan@fb.com>
Date: Wed, 18 Jul 2018 04:47:37 -0700
Subject: block: Add part_stat_read_accum to read across field entries.

Add a part_stat_read_accum macro to genhd.h to read and sum across
field entries.  For example to sum up the number read and write
sectors completed.  In addition to being ar reasonable cleanup by
itself this will make it easier to add new stat fields in the future.

tj: Refreshed on top of v4.17.

Signed-off-by: Michael Callahan <michaelcallahan@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/drbd/drbd_receiver.c | 3 +--
 drivers/block/drbd/drbd_worker.c   | 4 +---
 drivers/md/md.c                    | 3 +--
 include/linux/genhd.h              | 4 ++++
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index a36a30795c43..75f6b47169e6 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2674,8 +2674,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
 	if (c_min_rate == 0)
 		return false;
 
-	curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
-		      (int)part_stat_read(&disk->part0, sectors[1]) -
+	curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
 			atomic_read(&device->rs_sect_ev);
 
 	if (atomic_read(&device->ap_actlog_cnt)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 5e793dd7adfb..b8f77e83d456 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1690,9 +1690,7 @@ void drbd_rs_controller_reset(struct drbd_device *device)
 	atomic_set(&device->rs_sect_in, 0);
 	atomic_set(&device->rs_sect_ev, 0);
 	device->rs_in_flight = 0;
-	device->rs_last_events =
-		(int)part_stat_read(&disk->part0, sectors[0]) +
-		(int)part_stat_read(&disk->part0, sectors[1]);
+	device->rs_last_events = (int)part_stat_read_accum(&disk->part0, sectors);
 
 	/* Updating the RCU protected object in place is necessary since
 	   this function gets called from atomic context.
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 994aed2f9dff..dabe36723d60 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8046,8 +8046,7 @@ static int is_mddev_idle(struct mddev *mddev, int init)
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev) {
 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
-		curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
-			      (int)part_stat_read(&disk->part0, sectors[1]) -
+		curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
 			      atomic_read(&disk->sync_io);
 		/* sync IO will cause sync_io to increase before the disk_stats
 		 * as sync_io is counted when a request starts, and
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 6cb8a5789668..19f36fa10995 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -353,6 +353,10 @@ static inline void free_part_stats(struct hd_struct *part)
 
 #endif /* CONFIG_SMP */
 
+#define part_stat_read_accum(part, field)				\
+	(part_stat_read(part, field[0]) +				\
+	 part_stat_read(part, field[1]))
+
 #define part_stat_add(cpu, part, field, addnd)	do {			\
 	__part_stat_add((cpu), (part), field, addnd);			\
 	if ((part)->partno)						\
-- 
cgit v1.2.3


From dbae2c551377b6533a00c11fc7ede370100ab404 Mon Sep 17 00:00:00 2001
From: Michael Callahan <michaelcallahan@fb.com>
Date: Wed, 18 Jul 2018 04:47:38 -0700
Subject: block: Define and use STAT_READ and STAT_WRITE

Add defines for STAT_READ and STAT_WRITE for indexing the partition
stat entries. This clarifies some fs/ code which has hardcoded 1 for
STAT_WRITE and will make it easier to extend the stats with additional
fields.

tj: Refreshed on top of v4.17.

Signed-off-by: Michael Callahan <michaelcallahan@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c             | 16 ++++++++--------
 block/partition-generic.c | 16 ++++++++--------
 fs/ext4/super.c           |  5 +++--
 fs/ext4/sysfs.c           |  6 ++++--
 fs/f2fs/f2fs.h            |  2 +-
 fs/f2fs/super.c           |  3 ++-
 include/linux/blk_types.h |  7 +++++++
 include/linux/genhd.h     | 13 +++++++------
 8 files changed, 40 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index f1543a45e73b..0711a800d0d4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1337,14 +1337,14 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   "%u %lu %lu %lu %u %u %u %u\n",
 			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
 			   disk_name(gp, hd->partno, buf),
-			   part_stat_read(hd, ios[READ]),
-			   part_stat_read(hd, merges[READ]),
-			   part_stat_read(hd, sectors[READ]),
-			   jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
-			   part_stat_read(hd, ios[WRITE]),
-			   part_stat_read(hd, merges[WRITE]),
-			   part_stat_read(hd, sectors[WRITE]),
-			   jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
+			   part_stat_read(hd, ios[STAT_READ]),
+			   part_stat_read(hd, merges[STAT_READ]),
+			   part_stat_read(hd, sectors[STAT_READ]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])),
+			   part_stat_read(hd, ios[STAT_WRITE]),
+			   part_stat_read(hd, merges[STAT_WRITE]),
+			   part_stat_read(hd, sectors[STAT_WRITE]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])),
 			   inflight[0],
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
 			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 3dcfd4ec0e11..0ddb06722162 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -132,14 +132,14 @@ ssize_t part_stat_show(struct device *dev,
 		"%8lu %8lu %8llu %8u "
 		"%8u %8u %8u"
 		"\n",
-		part_stat_read(p, ios[READ]),
-		part_stat_read(p, merges[READ]),
-		(unsigned long long)part_stat_read(p, sectors[READ]),
-		jiffies_to_msecs(part_stat_read(p, ticks[READ])),
-		part_stat_read(p, ios[WRITE]),
-		part_stat_read(p, merges[WRITE]),
-		(unsigned long long)part_stat_read(p, sectors[WRITE]),
-		jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
+		part_stat_read(p, ios[STAT_READ]),
+		part_stat_read(p, merges[STAT_READ]),
+		(unsigned long long)part_stat_read(p, sectors[STAT_READ]),
+		jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])),
+		part_stat_read(p, ios[STAT_WRITE]),
+		part_stat_read(p, merges[STAT_WRITE]),
+		(unsigned long long)part_stat_read(p, sectors[STAT_WRITE]),
+		jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])),
 		inflight[0],
 		jiffies_to_msecs(part_stat_read(p, io_ticks)),
 		jiffies_to_msecs(part_stat_read(p, time_in_queue)));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ba2396a7bd04..4b8aef989552 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3514,7 +3514,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_sb_block = sb_block;
 	if (sb->s_bdev->bd_part)
 		sbi->s_sectors_written_start =
-			part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+			part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
 
 	/* Cleanup superblock name */
 	strreplace(sb->s_id, '/', '!');
@@ -4824,7 +4824,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	if (sb->s_bdev->bd_part)
 		es->s_kbytes_written =
 			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
-			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			    ((part_stat_read(sb->s_bdev->bd_part,
+					     sectors[STAT_WRITE]) -
 			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
 	else
 		es->s_kbytes_written =
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index f34da0bb8f17..2be9ad790017 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -56,7 +56,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
 	if (!sb->s_bdev->bd_part)
 		return snprintf(buf, PAGE_SIZE, "0\n");
 	return snprintf(buf, PAGE_SIZE, "%lu\n",
-			(part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			(part_stat_read(sb->s_bdev->bd_part,
+					sectors[STAT_WRITE]) -
 			 sbi->s_sectors_written_start) >> 1);
 }
 
@@ -68,7 +69,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
 		return snprintf(buf, PAGE_SIZE, "0\n");
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
 			(unsigned long long)(sbi->s_kbytes_written +
-			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			((part_stat_read(sb->s_bdev->bd_part,
+					 sectors[STAT_WRITE]) -
 			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
 }
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4d8b1de83143..6799c3fc44e3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1304,7 +1304,7 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
  * and the return value is in kbytes. s is of struct f2fs_sb_info.
  */
 #define BD_PART_WRITTEN(s)						 \
-(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) -		 \
+(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) -   \
 		(s)->sectors_written_start) >> 1)
 
 static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3995e926ba3a..17bcff789c08 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2882,7 +2882,8 @@ try_onemore:
 	/* For write statistics */
 	if (sb->s_bdev->bd_part)
 		sbi->sectors_written_start =
-			(u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+			(u64)part_stat_read(sb->s_bdev->bd_part,
+					    sectors[STAT_WRITE]);
 
 	/* Read accumulated write IO statistics if exists */
 	seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index e13449a379a1..d2b44de56bc1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -357,6 +357,13 @@ enum req_flag_bits {
 #define REQ_NOMERGE_FLAGS \
 	(REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)
 
+enum stat_group {
+	STAT_READ,
+	STAT_WRITE,
+
+	NR_STAT_GROUPS
+};
+
 #define bio_op(bio) \
 	((bio)->bi_opf & REQ_OP_MASK)
 #define req_op(req) \
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 19f36fa10995..a75445446974 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/percpu-refcount.h>
 #include <linux/uuid.h>
+#include <linux/blk_types.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -82,10 +83,10 @@ struct partition {
 } __attribute__((packed));
 
 struct disk_stats {
-	unsigned long sectors[2];	/* READs and WRITEs */
-	unsigned long ios[2];
-	unsigned long merges[2];
-	unsigned long ticks[2];
+	unsigned long sectors[NR_STAT_GROUPS];
+	unsigned long ios[NR_STAT_GROUPS];
+	unsigned long merges[NR_STAT_GROUPS];
+	unsigned long ticks[NR_STAT_GROUPS];
 	unsigned long io_ticks;
 	unsigned long time_in_queue;
 };
@@ -354,8 +355,8 @@ static inline void free_part_stats(struct hd_struct *part)
 #endif /* CONFIG_SMP */
 
 #define part_stat_read_accum(part, field)				\
-	(part_stat_read(part, field[0]) +				\
-	 part_stat_read(part, field[1]))
+	(part_stat_read(part, field[STAT_READ]) +			\
+	 part_stat_read(part, field[STAT_WRITE]))
 
 #define part_stat_add(cpu, part, field, addnd)	do {			\
 	__part_stat_add((cpu), (part), field, addnd);			\
-- 
cgit v1.2.3


From ddcf35d397976421a4ec1d0d00fbcc027a8cb034 Mon Sep 17 00:00:00 2001
From: Michael Callahan <michaelcallahan@fb.com>
Date: Wed, 18 Jul 2018 04:47:39 -0700
Subject: block: Add and use op_stat_group() for indexing disk_stat fields.

Add and use a new op_stat_group() function for indexing partition stat
fields rather than indexing them by rq_data_dir() or bio_data_dir().
This function works similarly to op_is_sync() in that it takes the
request::cmd_flags or bio::bi_opf flags and determines which stats
should et updated.

In addition, the second parameter to generic_start_io_acct() and
generic_end_io_acct() is now a REQ_OP rather than simply a read or
write bit and it uses op_stat_group() on the parameter to determine
the stat group.

Note that the partition in_flight counts are not part of the per-cpu
statistics and as such are not indexed via this function.  It's now
indexed by op_is_write().

tj: Refreshed on top of v4.17.  Updated to pass around REQ_OP.

Signed-off-by: Michael Callahan <michaelcallahan@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Joshua Morris <josh.h.morris@us.ibm.com>
Cc: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Matias Bjorling <mb@lightnvm.io>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Alasdair Kergon <agk@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c                   | 16 +++++++++-------
 block/blk-core.c              | 12 ++++++------
 drivers/block/drbd/drbd_req.c |  4 ++--
 drivers/block/rsxx/dev.c      |  6 +++---
 drivers/block/zram/zram_drv.c |  5 ++---
 drivers/lightnvm/pblk-cache.c |  5 +++--
 drivers/lightnvm/pblk-read.c  |  5 +++--
 drivers/md/bcache/request.c   | 13 +++++--------
 drivers/md/dm.c               |  6 ++++--
 drivers/md/md.c               |  5 +++--
 drivers/nvdimm/nd.h           |  7 +++----
 include/linux/bio.h           |  4 ++--
 include/linux/blk_types.h     |  5 +++++
 13 files changed, 50 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index f3536bfc8298..8ecc95615941 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1728,29 +1728,31 @@ void bio_check_pages_dirty(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
-void generic_start_io_acct(struct request_queue *q, int rw,
+void generic_start_io_acct(struct request_queue *q, int op,
 			   unsigned long sectors, struct hd_struct *part)
 {
+	const int sgrp = op_stat_group(op);
 	int cpu = part_stat_lock();
 
 	part_round_stats(q, cpu, part);
-	part_stat_inc(cpu, part, ios[rw]);
-	part_stat_add(cpu, part, sectors[rw], sectors);
-	part_inc_in_flight(q, part, rw);
+	part_stat_inc(cpu, part, ios[sgrp]);
+	part_stat_add(cpu, part, sectors[sgrp], sectors);
+	part_inc_in_flight(q, part, op_is_write(op));
 
 	part_stat_unlock();
 }
 EXPORT_SYMBOL(generic_start_io_acct);
 
-void generic_end_io_acct(struct request_queue *q, int rw,
+void generic_end_io_acct(struct request_queue *q, int req_op,
 			 struct hd_struct *part, unsigned long start_time)
 {
 	unsigned long duration = jiffies - start_time;
+	const int sgrp = op_stat_group(req_op);
 	int cpu = part_stat_lock();
 
-	part_stat_add(cpu, part, ticks[rw], duration);
+	part_stat_add(cpu, part, ticks[sgrp], duration);
 	part_round_stats(q, cpu, part);
-	part_dec_in_flight(q, part, rw);
+	part_dec_in_flight(q, part, op_is_write(req_op));
 
 	part_stat_unlock();
 }
diff --git a/block/blk-core.c b/block/blk-core.c
index c4b57d8806fe..03a4ea93a5f3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2702,13 +2702,13 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
 void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
 	if (blk_do_io_stat(req)) {
-		const int rw = rq_data_dir(req);
+		const int sgrp = op_stat_group(req_op(req));
 		struct hd_struct *part;
 		int cpu;
 
 		cpu = part_stat_lock();
 		part = req->part;
-		part_stat_add(cpu, part, sectors[rw], bytes >> 9);
+		part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
 		part_stat_unlock();
 	}
 }
@@ -2722,7 +2722,7 @@ void blk_account_io_done(struct request *req, u64 now)
 	 */
 	if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
 		unsigned long duration;
-		const int rw = rq_data_dir(req);
+		const int sgrp = op_stat_group(req_op(req));
 		struct hd_struct *part;
 		int cpu;
 
@@ -2730,10 +2730,10 @@ void blk_account_io_done(struct request *req, u64 now)
 		cpu = part_stat_lock();
 		part = req->part;
 
-		part_stat_inc(cpu, part, ios[rw]);
-		part_stat_add(cpu, part, ticks[rw], duration);
+		part_stat_inc(cpu, part, ios[sgrp]);
+		part_stat_add(cpu, part, ticks[sgrp], duration);
 		part_round_stats(req->q, cpu, part);
-		part_dec_in_flight(req->q, part, rw);
+		part_dec_in_flight(req->q, part, rq_data_dir(req));
 
 		hd_struct_put(part);
 		part_stat_unlock();
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index d146fedc38bb..19cac36e9737 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -38,7 +38,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request
 {
 	struct request_queue *q = device->rq_queue;
 
-	generic_start_io_acct(q, bio_data_dir(req->master_bio),
+	generic_start_io_acct(q, bio_op(req->master_bio),
 				req->i.size >> 9, &device->vdisk->part0);
 }
 
@@ -47,7 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r
 {
 	struct request_queue *q = device->rq_queue;
 
-	generic_end_io_acct(q, bio_data_dir(req->master_bio),
+	generic_end_io_acct(q, bio_op(req->master_bio),
 			    &device->vdisk->part0, req->start_jif);
 }
 
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index dddb3f2490b6..1a92f9e65937 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -112,7 +112,7 @@ static const struct block_device_operations rsxx_fops = {
 
 static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio)
 {
-	generic_start_io_acct(card->queue, bio_data_dir(bio), bio_sectors(bio),
+	generic_start_io_acct(card->queue, bio_op(bio), bio_sectors(bio),
 			     &card->gendisk->part0);
 }
 
@@ -120,8 +120,8 @@ static void disk_stats_complete(struct rsxx_cardinfo *card,
 				struct bio *bio,
 				unsigned long start_time)
 {
-	generic_end_io_acct(card->queue, bio_data_dir(bio),
-				&card->gendisk->part0, start_time);
+	generic_end_io_acct(card->queue, bio_op(bio),
+			    &card->gendisk->part0, start_time);
 }
 
 static void bio_dma_done_cb(struct rsxx_cardinfo *card,
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 78c29044684a..2907a8156aaf 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1277,11 +1277,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 			int offset, unsigned int op, struct bio *bio)
 {
 	unsigned long start_time = jiffies;
-	int rw_acct = op_is_write(op) ? REQ_OP_WRITE : REQ_OP_READ;
 	struct request_queue *q = zram->disk->queue;
 	int ret;
 
-	generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT,
+	generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT,
 			&zram->disk->part0);
 
 	if (!op_is_write(op)) {
@@ -1293,7 +1292,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 		ret = zram_bvec_write(zram, bvec, index, offset, bio);
 	}
 
-	generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time);
+	generic_end_io_acct(q, op, &zram->disk->part0, start_time);
 
 	zram_slot_lock(zram, index);
 	zram_accessed(zram, index);
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 77d811962818..f565a56b898a 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -27,7 +27,8 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
 	int nr_entries = pblk_get_secs(bio);
 	int i, ret;
 
-	generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);
+	generic_start_io_acct(q, REQ_OP_WRITE, bio_sectors(bio),
+			      &pblk->disk->part0);
 
 	/* Update the write buffer head (mem) with the entries that we can
 	 * write. The write in itself cannot fail, so there is no need to
@@ -75,7 +76,7 @@ retry:
 	pblk_rl_inserted(&pblk->rl, nr_entries);
 
 out:
-	generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);
+	generic_end_io_acct(q, REQ_OP_WRITE, &pblk->disk->part0, start_time);
 	pblk_write_should_kick(pblk);
 	return ret;
 }
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 26d414ae25b6..5a46d7f9302f 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -199,7 +199,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
 	struct bio *int_bio = rqd->bio;
 	unsigned long start_time = r_ctx->start_time;
 
-	generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time);
+	generic_end_io_acct(dev->q, REQ_OP_READ, &pblk->disk->part0, start_time);
 
 	if (rqd->error)
 		pblk_log_read_err(pblk, rqd);
@@ -461,7 +461,8 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
 		return NVM_IO_ERR;
 	}
 
-	generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0);
+	generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio),
+			      &pblk->disk->part0);
 
 	bitmap_zero(read_bitmap, nr_secs);
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ae67f5fa8047..97707b0c54ce 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -667,8 +667,7 @@ static void backing_request_endio(struct bio *bio)
 static void bio_complete(struct search *s)
 {
 	if (s->orig_bio) {
-		generic_end_io_acct(s->d->disk->queue,
-				    bio_data_dir(s->orig_bio),
+		generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio),
 				    &s->d->disk->part0, s->start_time);
 
 		trace_bcache_request_end(s->d, s->orig_bio);
@@ -1062,8 +1061,7 @@ static void detached_dev_end_io(struct bio *bio)
 	bio->bi_end_io = ddip->bi_end_io;
 	bio->bi_private = ddip->bi_private;
 
-	generic_end_io_acct(ddip->d->disk->queue,
-			    bio_data_dir(bio),
+	generic_end_io_acct(ddip->d->disk->queue, bio_op(bio),
 			    &ddip->d->disk->part0, ddip->start_time);
 
 	if (bio->bi_status) {
@@ -1120,7 +1118,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 	}
 
 	atomic_set(&dc->backing_idle, 0);
-	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+	generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
 
 	bio_set_dev(bio, dc->bdev);
 	bio->bi_iter.bi_sector += dc->sb.data_offset;
@@ -1229,7 +1227,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
 	struct search *s;
 	struct closure *cl;
 	struct bcache_device *d = bio->bi_disk->private_data;
-	int rw = bio_data_dir(bio);
 
 	if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
 		bio->bi_status = BLK_STS_IOERR;
@@ -1237,7 +1234,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
 		return BLK_QC_T_NONE;
 	}
 
-	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+	generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
 
 	s = search_alloc(bio, d);
 	cl = &s->cl;
@@ -1254,7 +1251,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
 				      flash_dev_nodata,
 				      bcache_wq);
 		return BLK_QC_T_NONE;
-	} else if (rw) {
+	} else if (bio_data_dir(bio)) {
 		bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
 					&KEY(d->id, bio->bi_iter.bi_sector, 0),
 					&KEY(d->id, bio_end_sector(bio), 0));
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b0dd7027848b..20f7e4ef5342 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -609,7 +609,8 @@ static void start_io_acct(struct dm_io *io)
 
 	io->start_time = jiffies;
 
-	generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0);
+	generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
+			      &dm_disk(md)->part0);
 
 	atomic_set(&dm_disk(md)->part0.in_flight[rw],
 		   atomic_inc_return(&md->pending[rw]));
@@ -628,7 +629,8 @@ static void end_io_acct(struct dm_io *io)
 	int pending;
 	int rw = bio_data_dir(bio);
 
-	generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
+	generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
+			    io->start_time);
 
 	if (unlikely(dm_stats_used(&md->stats)))
 		dm_stats_account_io(&md->stats, bio_data_dir(bio),
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dabe36723d60..f6e58dbca0d4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -335,6 +335,7 @@ EXPORT_SYMBOL(md_handle_request);
 static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int rw = bio_data_dir(bio);
+	const int sgrp = op_stat_group(bio_op(bio));
 	struct mddev *mddev = q->queuedata;
 	unsigned int sectors;
 	int cpu;
@@ -363,8 +364,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 	md_handle_request(mddev, bio);
 
 	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
 	part_stat_unlock();
 
 	return BLK_QC_T_NONE;
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 32e0364b48b9..6ee7fd7e4bbd 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -396,16 +396,15 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
 		return false;
 
 	*start = jiffies;
-	generic_start_io_acct(disk->queue, bio_data_dir(bio),
-			      bio_sectors(bio), &disk->part0);
+	generic_start_io_acct(disk->queue, bio_op(bio), bio_sectors(bio),
+			      &disk->part0);
 	return true;
 }
 static inline void nd_iostat_end(struct bio *bio, unsigned long start)
 {
 	struct gendisk *disk = bio->bi_disk;
 
-	generic_end_io_acct(disk->queue, bio_data_dir(bio), &disk->part0,
-				start);
+	generic_end_io_acct(disk->queue, bio_op(bio), &disk->part0, start);
 }
 static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector,
 		unsigned int len)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index a00dfff51aa5..ab221c517f4e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -496,9 +496,9 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 
-void generic_start_io_acct(struct request_queue *q, int rw,
+void generic_start_io_acct(struct request_queue *q, int op,
 				unsigned long sectors, struct hd_struct *part);
-void generic_end_io_acct(struct request_queue *q, int rw,
+void generic_end_io_acct(struct request_queue *q, int op,
 				struct hd_struct *part,
 				unsigned long start_time);
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d2b44de56bc1..2960a96d833c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -401,6 +401,11 @@ static inline bool op_is_sync(unsigned int op)
 		(op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH));
 }
 
+static inline int op_stat_group(unsigned int op)
+{
+	return op_is_write(op);
+}
+
 typedef unsigned int blk_qc_t;
 #define BLK_QC_T_NONE		-1U
 #define BLK_QC_T_SHIFT		16
-- 
cgit v1.2.3


From bdca3c87fb7ad1cc61d231d37eb0d8f90d001e0c Mon Sep 17 00:00:00 2001
From: Michael Callahan <michaelcallahan@fb.com>
Date: Wed, 18 Jul 2018 04:47:40 -0700
Subject: block: Track DISCARD statistics and output them in stat and diskstat

Add tracking of REQ_OP_DISCARD ios to the partition statistics and
append them to the various stat files in /sys as well as
/proc/diskstats.  These are tracked with the same four stats as reads
and writes:

Number of discard ios completed.
Number of discard ios merged
Number of discard sectors completed
Milliseconds spent on discard requests

This is done via adding a new STAT_DISCARD define to genhd.h and then
using it to index that stat field for discard requests.

tj: Refreshed on top of v4.17 and other previous updates.

Signed-off-by: Michael Callahan <michaelcallahan@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andy Newell <newella@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/ABI/testing/procfs-diskstats | 10 ++++++++++
 Documentation/block/stat.txt               | 28 ++++++++++++++++------------
 Documentation/iostats.txt                  | 15 +++++++++++++++
 block/genhd.c                              | 13 ++++++++++---
 block/partition-generic.c                  |  9 +++++++--
 include/linux/blk_types.h                  |  8 ++++++++
 include/linux/genhd.h                      |  3 ++-
 7 files changed, 68 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats
index f91a973a37fe..abac31d216de 100644
--- a/Documentation/ABI/testing/procfs-diskstats
+++ b/Documentation/ABI/testing/procfs-diskstats
@@ -5,6 +5,7 @@ Description:
 		The /proc/diskstats file displays the I/O statistics
 		of block devices. Each line contains the following 14
 		fields:
+
 		 1 - major number
 		 2 - minor mumber
 		 3 - device name
@@ -19,4 +20,13 @@ Description:
 		12 - I/Os currently in progress
 		13 - time spent doing I/Os (ms)
 		14 - weighted time spent doing I/Os (ms)
+
+		Kernel 4.18+ appends four more fields for discard
+		tracking putting the total at 18:
+
+		15 - discards completed successfully
+		16 - discards merged
+		17 - sectors discarded
+		18 - time spent discarding
+
 		For more details refer to Documentation/iostats.txt
diff --git a/Documentation/block/stat.txt b/Documentation/block/stat.txt
index 0dbc946de2ea..0aace9cc536c 100644
--- a/Documentation/block/stat.txt
+++ b/Documentation/block/stat.txt
@@ -31,28 +31,32 @@ write ticks     milliseconds  total wait time for write requests
 in_flight       requests      number of I/Os currently in flight
 io_ticks        milliseconds  total time this block device has been active
 time_in_queue   milliseconds  total wait time for all requests
+discard I/Os    requests      number of discard I/Os processed
+discard merges  requests      number of discard I/Os merged with in-queue I/O
+discard sectors sectors       number of sectors discarded
+discard ticks   milliseconds  total wait time for discard requests
 
-read I/Os, write I/Os
-=====================
+read I/Os, write I/Os, discard I/0s
+===================================
 
 These values increment when an I/O request completes.
 
-read merges, write merges
-=========================
+read merges, write merges, discard merges
+=========================================
 
 These values increment when an I/O request is merged with an
 already-queued I/O request.
 
-read sectors, write sectors
-===========================
+read sectors, write sectors, discard_sectors
+============================================
 
-These values count the number of sectors read from or written to this
-block device.  The "sectors" in question are the standard UNIX 512-byte
-sectors, not any device- or filesystem-specific block size.  The
-counters are incremented when the I/O completes.
+These values count the number of sectors read from, written to, or
+discarded from this block device.  The "sectors" in question are the
+standard UNIX 512-byte sectors, not any device- or filesystem-specific
+block size.  The counters are incremented when the I/O completes.
 
-read ticks, write ticks
-=======================
+read ticks, write ticks, discard ticks
+======================================
 
 These values count the number of milliseconds that I/O requests have
 waited on this block device.  If there are multiple I/O requests waiting,
diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt
index 04d394a2e06c..49df45f90e8a 100644
--- a/Documentation/iostats.txt
+++ b/Documentation/iostats.txt
@@ -31,6 +31,9 @@ Here are examples of these different formats::
       3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
       3    1   hda1 35486 38030 38030 38030
 
+   4.18+ diskstats:
+      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0
+
 On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
 a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
 
@@ -101,6 +104,18 @@ Field 11 -- weighted # of milliseconds spent doing I/Os
     last update of this field.  This can provide an easy measure of both
     I/O completion time and the backlog that may be accumulating.
 
+Field 12 -- # of discards completed
+    This is the total number of discards completed successfully.
+
+Field 13 -- # of discards merged
+    See the description of field 2
+
+Field 14 -- # of sectors discarded
+    This is the total number of sectors discarded successfully.
+
+Field 15 -- # of milliseconds spent discarding
+    This is the total number of milliseconds spent by all discards (as
+    measured from __make_request() to end_that_request_last()).
 
 To avoid introducing performance bottlenecks, no locks are held while
 modifying these counters.  This implies that minor inaccuracies may be
diff --git a/block/genhd.c b/block/genhd.c
index 0711a800d0d4..8cc719a37b32 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1333,8 +1333,11 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 		part_round_stats(gp->queue, cpu, hd);
 		part_stat_unlock();
 		part_in_flight(gp->queue, hd, inflight);
-		seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
-			   "%u %lu %lu %lu %u %u %u %u\n",
+		seq_printf(seqf, "%4d %7d %s "
+			   "%lu %lu %lu %u "
+			   "%lu %lu %lu %u "
+			   "%u %u %u "
+			   "%lu %lu %lu %u\n",
 			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
 			   disk_name(gp, hd->partno, buf),
 			   part_stat_read(hd, ios[STAT_READ]),
@@ -1347,7 +1350,11 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])),
 			   inflight[0],
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
-			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
+			   jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
+			   part_stat_read(hd, ios[STAT_DISCARD]),
+			   part_stat_read(hd, merges[STAT_DISCARD]),
+			   part_stat_read(hd, sectors[STAT_DISCARD]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD]))
 			);
 	}
 	disk_part_iter_exit(&piter);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 0ddb06722162..5a8975a1201c 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -130,7 +130,8 @@ ssize_t part_stat_show(struct device *dev,
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
-		"%8u %8u %8u"
+		"%8u %8u %8u "
+		"%8lu %8lu %8llu %8u"
 		"\n",
 		part_stat_read(p, ios[STAT_READ]),
 		part_stat_read(p, merges[STAT_READ]),
@@ -142,7 +143,11 @@ ssize_t part_stat_show(struct device *dev,
 		jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])),
 		inflight[0],
 		jiffies_to_msecs(part_stat_read(p, io_ticks)),
-		jiffies_to_msecs(part_stat_read(p, time_in_queue)));
+		jiffies_to_msecs(part_stat_read(p, time_in_queue)),
+		part_stat_read(p, ios[STAT_DISCARD]),
+		part_stat_read(p, merges[STAT_DISCARD]),
+		(unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]),
+		jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD])));
 }
 
 ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 2960a96d833c..f6dfb30737d8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -360,6 +360,7 @@ enum req_flag_bits {
 enum stat_group {
 	STAT_READ,
 	STAT_WRITE,
+	STAT_DISCARD,
 
 	NR_STAT_GROUPS
 };
@@ -401,8 +402,15 @@ static inline bool op_is_sync(unsigned int op)
 		(op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH));
 }
 
+static inline bool op_is_discard(unsigned int op)
+{
+	return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
+}
+
 static inline int op_stat_group(unsigned int op)
 {
+	if (op_is_discard(op))
+		return STAT_DISCARD;
 	return op_is_write(op);
 }
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index a75445446974..57864422a2c8 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -356,7 +356,8 @@ static inline void free_part_stats(struct hd_struct *part)
 
 #define part_stat_read_accum(part, field)				\
 	(part_stat_read(part, field[STAT_READ]) +			\
-	 part_stat_read(part, field[STAT_WRITE]))
+	 part_stat_read(part, field[STAT_WRITE]) +			\
+	 part_stat_read(part, field[STAT_DISCARD]))
 
 #define part_stat_add(cpu, part, field, addnd)	do {			\
 	__part_stat_add((cpu), (part), field, addnd);			\
-- 
cgit v1.2.3


From 636620b66d5d4012c4a9c86206013964d3986c4f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 18 Jul 2018 04:47:41 -0700
Subject: blkcg: Track DISCARD statistics and output them in cgroup io.stat

Add tracking of REQ_OP_DISCARD ios to the per-cgroup io.stat.  Two
fields, dbytes and dios, to respectively count the total bytes and
number of discards are added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andy Newell <newella@fb.com>
Cc: Michael Callahan <michaelcallahan@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/admin-guide/cgroup-v2.rst | 10 ++++++----
 block/blk-cgroup.c                      | 13 +++++++++----
 include/linux/blk-cgroup.h              |  5 ++++-
 3 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 569ce27b85e5..3afe10fa82bc 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1317,17 +1317,19 @@ IO Interface Files
 	Lines are keyed by $MAJ:$MIN device numbers and not ordered.
 	The following nested keys are defined.
 
-	  ======	===================
+	  ======	=====================
 	  rbytes	Bytes read
 	  wbytes	Bytes written
 	  rios		Number of read IOs
 	  wios		Number of write IOs
-	  ======	===================
+	  dbytes	Bytes discarded
+	  dios		Number of discard IOs
+	  ======	=====================
 
 	An example read output follows:
 
-	  8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
-	  8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
+	  8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
+	  8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021
 
   io.weight
 	A read-write flat-keyed file which exists on non-root cgroups.
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7e2c19ce1a08..1942357d7165 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -567,6 +567,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 		[BLKG_RWSTAT_WRITE]	= "Write",
 		[BLKG_RWSTAT_SYNC]	= "Sync",
 		[BLKG_RWSTAT_ASYNC]	= "Async",
+		[BLKG_RWSTAT_DISCARD]	= "Discard",
 	};
 	const char *dname = blkg_dev_name(pd->blkg);
 	u64 v;
@@ -580,7 +581,8 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 			   (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
 
 	v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
-		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
+		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
+		atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
 	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
 	return v;
 }
@@ -959,7 +961,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 		const char *dname;
 		char *buf;
 		struct blkg_rwstat rwstat;
-		u64 rbytes, wbytes, rios, wios;
+		u64 rbytes, wbytes, rios, wios, dbytes, dios;
 		size_t size = seq_get_buf(sf, &buf), off = 0;
 		int i;
 		bool has_stats = false;
@@ -982,19 +984,22 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 					offsetof(struct blkcg_gq, stat_bytes));
 		rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
 		wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+		dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
 
 		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
 					offsetof(struct blkcg_gq, stat_ios));
 		rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
 		wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+		dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
 
 		spin_unlock_irq(blkg->q->queue_lock);
 
 		if (rbytes || wbytes || rios || wios) {
 			has_stats = true;
 			off += scnprintf(buf+off, size-off,
-					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu",
-					 rbytes, wbytes, rios, wios);
+					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
+					 rbytes, wbytes, rios, wios,
+					 dbytes, dios);
 		}
 
 		if (!blkcg_debug_stats)
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index de57de4831d5..3bed5e02a873 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -35,6 +35,7 @@ enum blkg_rwstat_type {
 	BLKG_RWSTAT_WRITE,
 	BLKG_RWSTAT_SYNC,
 	BLKG_RWSTAT_ASYNC,
+	BLKG_RWSTAT_DISCARD,
 
 	BLKG_RWSTAT_NR,
 	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
@@ -649,7 +650,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
 {
 	struct percpu_counter *cnt;
 
-	if (op_is_write(op))
+	if (op_is_discard(op))
+		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
+	else if (op_is_write(op))
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
 	else
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
-- 
cgit v1.2.3


From bcf7eac3d97f49d8400ba52c71bee5934bf20093 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 17 Jul 2018 00:47:48 +0900
Subject: regmap: add SCCB support

This adds Serial Camera Control Bus (SCCB) support for regmap API that
is intended to be used by some of Omnivision sensor drivers.

The ov772x and ov9650 drivers are going to use this SCCB regmap API.

The ov772x driver was previously only worked with the i2c controller
drivers that support I2C_FUNC_PROTOCOL_MANGLING, because the ov772x
device doesn't support repeated starts.  After commit 0b964d183cbf
("media: ov772x: allow i2c controllers without
I2C_FUNC_PROTOCOL_MANGLING"), reading ov772x register is replaced with
issuing two separated i2c messages in order to avoid repeated start.
Using this SCCB regmap hides the implementation detail.

The ov9650 driver also issues two separated i2c messages to read the
registers as the device doesn't support repeated start.  So it can
make use of this SCCB regmap.

Cc: Mark Brown <broonie@kernel.org>
Cc: Peter Rosin <peda@axentia.se>
Cc: Sebastian Reichel <sebastian.reichel@collabora.co.uk>
Cc: Wolfram Sang <wsa@the-dreams.de>
Cc: Sylwester Nawrocki <s.nawrocki@samsung.com>
Cc: Jacopo Mondi <jacopo+renesas@jmondi.org>
Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Cc: Hans Verkuil <hans.verkuil@cisco.com>
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/Kconfig       |   4 ++
 drivers/base/regmap/Makefile      |   1 +
 drivers/base/regmap/regmap-sccb.c | 128 ++++++++++++++++++++++++++++++++++++++
 include/linux/regmap.h            |  35 +++++++++++
 4 files changed, 168 insertions(+)
 create mode 100644 drivers/base/regmap/regmap-sccb.c

(limited to 'include/linux')

diff --git a/drivers/base/regmap/Kconfig b/drivers/base/regmap/Kconfig
index aff34c0c2a3e..6ad5ef48b61e 100644
--- a/drivers/base/regmap/Kconfig
+++ b/drivers/base/regmap/Kconfig
@@ -45,3 +45,7 @@ config REGMAP_IRQ
 config REGMAP_SOUNDWIRE
 	tristate
 	depends on SOUNDWIRE_BUS
+
+config REGMAP_SCCB
+	tristate
+	depends on I2C
diff --git a/drivers/base/regmap/Makefile b/drivers/base/regmap/Makefile
index 5ed0023fabda..f5b4e8851d00 100644
--- a/drivers/base/regmap/Makefile
+++ b/drivers/base/regmap/Makefile
@@ -15,3 +15,4 @@ obj-$(CONFIG_REGMAP_MMIO) += regmap-mmio.o
 obj-$(CONFIG_REGMAP_IRQ) += regmap-irq.o
 obj-$(CONFIG_REGMAP_W1) += regmap-w1.o
 obj-$(CONFIG_REGMAP_SOUNDWIRE) += regmap-sdw.o
+obj-$(CONFIG_REGMAP_SCCB) += regmap-sccb.o
diff --git a/drivers/base/regmap/regmap-sccb.c b/drivers/base/regmap/regmap-sccb.c
new file mode 100644
index 000000000000..b6eb876ea819
--- /dev/null
+++ b/drivers/base/regmap/regmap-sccb.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+// Register map access API - SCCB support
+
+#include <linux/regmap.h>
+#include <linux/i2c.h>
+#include <linux/module.h>
+
+#include "internal.h"
+
+/**
+ * sccb_is_available - Check if the adapter supports SCCB protocol
+ * @adap: I2C adapter
+ *
+ * Return true if the I2C adapter is capable of using SCCB helper functions,
+ * false otherwise.
+ */
+static bool sccb_is_available(struct i2c_adapter *adap)
+{
+	u32 needed_funcs = I2C_FUNC_SMBUS_BYTE | I2C_FUNC_SMBUS_WRITE_BYTE_DATA;
+
+	/*
+	 * If we ever want support for hardware doing SCCB natively, we will
+	 * introduce a sccb_xfer() callback to struct i2c_algorithm and check
+	 * for it here.
+	 */
+
+	return (i2c_get_functionality(adap) & needed_funcs) == needed_funcs;
+}
+
+/**
+ * regmap_sccb_read - Read data from SCCB slave device
+ * @context: Device that will be interacted wit
+ * @reg: Register to be read from
+ * @val: Pointer to store read value
+ *
+ * This executes the 2-phase write transmission cycle that is followed by a
+ * 2-phase read transmission cycle, returning negative errno else zero on
+ * success.
+ */
+static int regmap_sccb_read(void *context, unsigned int reg, unsigned int *val)
+{
+	struct device *dev = context;
+	struct i2c_client *i2c = to_i2c_client(dev);
+	int ret;
+	union i2c_smbus_data data;
+
+	i2c_lock_bus(i2c->adapter, I2C_LOCK_SEGMENT);
+
+	ret = __i2c_smbus_xfer(i2c->adapter, i2c->addr, i2c->flags,
+			       I2C_SMBUS_WRITE, reg, I2C_SMBUS_BYTE, NULL);
+	if (ret < 0)
+		goto out;
+
+	ret = __i2c_smbus_xfer(i2c->adapter, i2c->addr, i2c->flags,
+			       I2C_SMBUS_READ, 0, I2C_SMBUS_BYTE, &data);
+	if (ret < 0)
+		goto out;
+
+	*val = data.byte;
+out:
+	i2c_unlock_bus(i2c->adapter, I2C_LOCK_SEGMENT);
+
+	return ret;
+}
+
+/**
+ * regmap_sccb_write - Write data to SCCB slave device
+ * @context: Device that will be interacted wit
+ * @reg: Register to write to
+ * @val: Value to be written
+ *
+ * This executes the SCCB 3-phase write transmission cycle, returning negative
+ * errno else zero on success.
+ */
+static int regmap_sccb_write(void *context, unsigned int reg, unsigned int val)
+{
+	struct device *dev = context;
+	struct i2c_client *i2c = to_i2c_client(dev);
+
+	return i2c_smbus_write_byte_data(i2c, reg, val);
+}
+
+static struct regmap_bus regmap_sccb_bus = {
+	.reg_write = regmap_sccb_write,
+	.reg_read = regmap_sccb_read,
+};
+
+static const struct regmap_bus *regmap_get_sccb_bus(struct i2c_client *i2c,
+					const struct regmap_config *config)
+{
+	if (config->val_bits == 8 && config->reg_bits == 8 &&
+			sccb_is_available(i2c->adapter))
+		return &regmap_sccb_bus;
+
+	return ERR_PTR(-ENOTSUPP);
+}
+
+struct regmap *__regmap_init_sccb(struct i2c_client *i2c,
+				  const struct regmap_config *config,
+				  struct lock_class_key *lock_key,
+				  const char *lock_name)
+{
+	const struct regmap_bus *bus = regmap_get_sccb_bus(i2c, config);
+
+	if (IS_ERR(bus))
+		return ERR_CAST(bus);
+
+	return __regmap_init(&i2c->dev, bus, &i2c->dev, config,
+			     lock_key, lock_name);
+}
+EXPORT_SYMBOL_GPL(__regmap_init_sccb);
+
+struct regmap *__devm_regmap_init_sccb(struct i2c_client *i2c,
+				       const struct regmap_config *config,
+				       struct lock_class_key *lock_key,
+				       const char *lock_name)
+{
+	const struct regmap_bus *bus = regmap_get_sccb_bus(i2c, config);
+
+	if (IS_ERR(bus))
+		return ERR_CAST(bus);
+
+	return __devm_regmap_init(&i2c->dev, bus, &i2c->dev, config,
+				  lock_key, lock_name);
+}
+EXPORT_SYMBOL_GPL(__devm_regmap_init_sccb);
+
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 4f38068ffb71..52bf358e2c73 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -514,6 +514,10 @@ struct regmap *__regmap_init_i2c(struct i2c_client *i2c,
 				 const struct regmap_config *config,
 				 struct lock_class_key *lock_key,
 				 const char *lock_name);
+struct regmap *__regmap_init_sccb(struct i2c_client *i2c,
+				  const struct regmap_config *config,
+				  struct lock_class_key *lock_key,
+				  const char *lock_name);
 struct regmap *__regmap_init_slimbus(struct slim_device *slimbus,
 				 const struct regmap_config *config,
 				 struct lock_class_key *lock_key,
@@ -558,6 +562,10 @@ struct regmap *__devm_regmap_init_i2c(struct i2c_client *i2c,
 				      const struct regmap_config *config,
 				      struct lock_class_key *lock_key,
 				      const char *lock_name);
+struct regmap *__devm_regmap_init_sccb(struct i2c_client *i2c,
+				       const struct regmap_config *config,
+				       struct lock_class_key *lock_key,
+				       const char *lock_name);
 struct regmap *__devm_regmap_init_spi(struct spi_device *dev,
 				      const struct regmap_config *config,
 				      struct lock_class_key *lock_key,
@@ -645,6 +653,19 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
 	__regmap_lockdep_wrapper(__regmap_init_i2c, #config,		\
 				i2c, config)
 
+/**
+ * regmap_init_sccb() - Initialise register map
+ *
+ * @i2c: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
+#define regmap_init_sccb(i2c, config)					\
+	__regmap_lockdep_wrapper(__regmap_init_sccb, #config,		\
+				i2c, config)
+
 /**
  * regmap_init_slimbus() - Initialise register map
  *
@@ -797,6 +818,20 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 	__regmap_lockdep_wrapper(__devm_regmap_init_i2c, #config,	\
 				i2c, config)
 
+/**
+ * devm_regmap_init_sccb() - Initialise managed register map
+ *
+ * @i2c: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
+#define devm_regmap_init_sccb(i2c, config)				\
+	__regmap_lockdep_wrapper(__devm_regmap_init_sccb, #config,	\
+				i2c, config)
+
 /**
  * devm_regmap_init_spi() - Initialise register map
  *
-- 
cgit v1.2.3


From 7c201ead1bfd19935f689fca69100c335028c047 Mon Sep 17 00:00:00 2001
From: David Lechner <david@lechnology.com>
Date: Mon, 16 Jul 2018 22:20:49 -0500
Subject: spi: spi-bitbang: change flags from u8 to u16

This changes the data type of the flags field in struct spi_bitbang from
u8 to u16. This matches the size of the mode field of struct spi_device
where these flags are also used.

This is done in preparation of adding a new SPI mode flag that will be
used with this field that would otherwise not fit in 8 bits.

Signed-off-by: David Lechner <david@lechnology.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi_bitbang.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h
index 51d8c060e513..c1e61641a7f1 100644
--- a/include/linux/spi/spi_bitbang.h
+++ b/include/linux/spi/spi_bitbang.h
@@ -8,7 +8,7 @@ struct spi_bitbang {
 	struct mutex		lock;
 	u8			busy;
 	u8			use_dma;
-	u8			flags;		/* extra spi->mode support */
+	u16			flags;		/* extra spi->mode support */
 
 	struct spi_master	*master;
 
-- 
cgit v1.2.3


From a48d189ef53146a8df132a327a637c4182f50a16 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 17 Jul 2018 11:52:57 +0200
Subject: net: Move skb decrypted field, avoid explicity copy

Commit 784abe24c903 ("net: Add decrypted field to skb")
introduced a 'decrypted' field that is explicitly copied on skb
copy and clone.

Move it between headers_start[0] and headers_end[0], so that we
don't need to copy it explicitly as it's copied by the memcpy()
in __copy_skb_header().

While at it, drop the assignment in __skb_clone(), it was
already redundant.

This doesn't change the size of sk_buff or cacheline boundaries.

The 15-bits hole before tc_index becomes a 14-bits hole, and
will be again a 15-bits hole when this change is merged with
commit 8b7008620b84 ("net: Don't copy pfmemalloc flag in
__copy_skb_header()").

v2: as reported by kbuild test robot (oops, I forgot to build
    with CONFIG_TLS_DEVICE it seems), we can't use
    CHECK_SKB_FIELD() on a bit-field member. Just drop the
    check for the moment being, perhaps we could think of some
    magic to also check bit-field members one day.

Fixes: 784abe24c903 ("net: Add decrypted field to skb")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 9 ++++-----
 net/core/skbuff.c      | 6 ------
 2 files changed, 4 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3ceb8dcc54da..14bc9ebe30f2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -630,7 +630,6 @@ typedef unsigned char *sk_buff_data_t;
  *	@hash: the packet hash
  *	@queue_mapping: Queue mapping for multiqueue devices
  *	@xmit_more: More SKBs are pending for this queue
- *	@decrypted: Decrypted SKB
  *	@ndisc_nodetype: router type (from link layer)
  *	@ooo_okay: allow the mapping of a socket to a queue to be changed
  *	@l4_hash: indicate hash is a canonical 4-tuple hash over transport
@@ -641,6 +640,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
  *	@csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL
  *	@dst_pending_confirm: need to confirm neighbour
+ *	@decrypted: Decrypted SKB
   *	@napi_id: id of the NAPI struct this skb came from
  *	@secmark: security marking
  *	@mark: Generic packet mark
@@ -737,11 +737,7 @@ struct sk_buff {
 				peeked:1,
 				head_frag:1,
 				xmit_more:1,
-#ifdef CONFIG_TLS_DEVICE
-				decrypted:1;
-#else
 				__unused:1;
-#endif
 
 	/* fields enclosed in headers_start/headers_end are copied
 	 * using a single memcpy() in __copy_skb_header()
@@ -797,6 +793,9 @@ struct sk_buff {
 	__u8			tc_redirected:1;
 	__u8			tc_from_ingress:1;
 #endif
+#ifdef CONFIG_TLS_DEVICE
+	__u8			decrypted:1;
+#endif
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index cfd6c6f35f9c..c4e24ac27464 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -805,9 +805,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	 * It is not yet because we do not want to have a 16 bit hole
 	 */
 	new->queue_mapping = old->queue_mapping;
-#ifdef CONFIG_TLS_DEVICE
-	new->decrypted = old->decrypted;
-#endif
 
 	memcpy(&new->headers_start, &old->headers_start,
 	       offsetof(struct sk_buff, headers_end) -
@@ -868,9 +865,6 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	C(head_frag);
 	C(data);
 	C(truesize);
-#ifdef CONFIG_TLS_DEVICE
-	C(decrypted);
-#endif
 	refcount_set(&n->users, 1);
 
 	atomic_inc(&(skb_shinfo(skb)->dataref));
-- 
cgit v1.2.3


From 8963106eabdc56911e9b65258eb5e9a6b7b3dfda Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Date: Thu, 19 Jul 2018 10:32:12 +0200
Subject: PCI: endpoint: Add MSI-X interfaces

Add PCI_EPC_IRQ_MSIX type.

Add MSI-X callbacks signatures to the ops structure.

Add sysfs interface for set/get MSI-X capability maximum number.

Update documentation accordingly.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 .../PCI/endpoint/function/binding/pci-test.txt     |  2 +
 drivers/pci/endpoint/pci-ep-cfs.c                  | 24 +++++++++
 drivers/pci/endpoint/pci-epc-core.c                | 57 ++++++++++++++++++++++
 include/linux/pci-epc.h                            |  9 ++++
 include/linux/pci-epf.h                            |  1 +
 5 files changed, 93 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/PCI/endpoint/function/binding/pci-test.txt b/Documentation/PCI/endpoint/function/binding/pci-test.txt
index 3b68b955fb50..cd76ba47394b 100644
--- a/Documentation/PCI/endpoint/function/binding/pci-test.txt
+++ b/Documentation/PCI/endpoint/function/binding/pci-test.txt
@@ -15,3 +15,5 @@ subsys_id	 : don't care
 interrupt_pin	 : Should be 1 - INTA, 2 - INTB, 3 - INTC, 4 -INTD
 msi_interrupts	 : Should be 1 to 32 depending on the number of MSI interrupts
 		   to test
+msix_interrupts	 : Should be 1 to 2048 depending on the number of MSI-X
+		   interrupts to test
diff --git a/drivers/pci/endpoint/pci-ep-cfs.c b/drivers/pci/endpoint/pci-ep-cfs.c
index 018ea3433cb5..d1288a0bd530 100644
--- a/drivers/pci/endpoint/pci-ep-cfs.c
+++ b/drivers/pci/endpoint/pci-ep-cfs.c
@@ -286,6 +286,28 @@ static ssize_t pci_epf_msi_interrupts_show(struct config_item *item,
 		       to_pci_epf_group(item)->epf->msi_interrupts);
 }
 
+static ssize_t pci_epf_msix_interrupts_store(struct config_item *item,
+					     const char *page, size_t len)
+{
+	u16 val;
+	int ret;
+
+	ret = kstrtou16(page, 0, &val);
+	if (ret)
+		return ret;
+
+	to_pci_epf_group(item)->epf->msix_interrupts = val;
+
+	return len;
+}
+
+static ssize_t pci_epf_msix_interrupts_show(struct config_item *item,
+					    char *page)
+{
+	return sprintf(page, "%d\n",
+		       to_pci_epf_group(item)->epf->msix_interrupts);
+}
+
 PCI_EPF_HEADER_R(vendorid)
 PCI_EPF_HEADER_W_u16(vendorid)
 
@@ -327,6 +349,7 @@ CONFIGFS_ATTR(pci_epf_, subsys_vendor_id);
 CONFIGFS_ATTR(pci_epf_, subsys_id);
 CONFIGFS_ATTR(pci_epf_, interrupt_pin);
 CONFIGFS_ATTR(pci_epf_, msi_interrupts);
+CONFIGFS_ATTR(pci_epf_, msix_interrupts);
 
 static struct configfs_attribute *pci_epf_attrs[] = {
 	&pci_epf_attr_vendorid,
@@ -340,6 +363,7 @@ static struct configfs_attribute *pci_epf_attrs[] = {
 	&pci_epf_attr_subsys_id,
 	&pci_epf_attr_interrupt_pin,
 	&pci_epf_attr_msi_interrupts,
+	&pci_epf_attr_msix_interrupts,
 	NULL,
 };
 
diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index b0ee42739c3c..7d77bd0e5d4a 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -217,6 +217,63 @@ int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts)
 }
 EXPORT_SYMBOL_GPL(pci_epc_set_msi);
 
+/**
+ * pci_epc_get_msix() - get the number of MSI-X interrupt numbers allocated
+ * @epc: the EPC device to which MSI-X interrupts was requested
+ * @func_no: the endpoint function number in the EPC device
+ *
+ * Invoke to get the number of MSI-X interrupts allocated by the RC
+ */
+int pci_epc_get_msix(struct pci_epc *epc, u8 func_no)
+{
+	int interrupt;
+	unsigned long flags;
+
+	if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions)
+		return 0;
+
+	if (!epc->ops->get_msix)
+		return 0;
+
+	spin_lock_irqsave(&epc->lock, flags);
+	interrupt = epc->ops->get_msix(epc, func_no);
+	spin_unlock_irqrestore(&epc->lock, flags);
+
+	if (interrupt < 0)
+		return 0;
+
+	return interrupt + 1;
+}
+EXPORT_SYMBOL_GPL(pci_epc_get_msix);
+
+/**
+ * pci_epc_set_msix() - set the number of MSI-X interrupt numbers required
+ * @epc: the EPC device on which MSI-X has to be configured
+ * @func_no: the endpoint function number in the EPC device
+ * @interrupts: number of MSI-X interrupts required by the EPF
+ *
+ * Invoke to set the required number of MSI-X interrupts.
+ */
+int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts)
+{
+	int ret;
+	unsigned long flags;
+
+	if (IS_ERR_OR_NULL(epc) || func_no >= epc->max_functions ||
+	    interrupts < 1 || interrupts > 2048)
+		return -EINVAL;
+
+	if (!epc->ops->set_msix)
+		return 0;
+
+	spin_lock_irqsave(&epc->lock, flags);
+	ret = epc->ops->set_msix(epc, func_no, interrupts - 1);
+	spin_unlock_irqrestore(&epc->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pci_epc_set_msix);
+
 /**
  * pci_epc_unmap_addr() - unmap CPU address from PCI address
  * @epc: the EPC device on which address is allocated
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 243eaa5a66ff..89f079f582df 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -17,6 +17,7 @@ enum pci_epc_irq_type {
 	PCI_EPC_IRQ_UNKNOWN,
 	PCI_EPC_IRQ_LEGACY,
 	PCI_EPC_IRQ_MSI,
+	PCI_EPC_IRQ_MSIX,
 };
 
 /**
@@ -30,6 +31,10 @@ enum pci_epc_irq_type {
  *	     capability register
  * @get_msi: ops to get the number of MSI interrupts allocated by the RC from
  *	     the MSI capability register
+ * @set_msix: ops to set the requested number of MSI-X interrupts in the
+ *	     MSI-X capability register
+ * @get_msix: ops to get the number of MSI-X interrupts allocated by the RC
+ *	     from the MSI-X capability register
  * @raise_irq: ops to raise a legacy or MSI interrupt
  * @start: ops to start the PCI link
  * @stop: ops to stop the PCI link
@@ -48,6 +53,8 @@ struct pci_epc_ops {
 			      phys_addr_t addr);
 	int	(*set_msi)(struct pci_epc *epc, u8 func_no, u8 interrupts);
 	int	(*get_msi)(struct pci_epc *epc, u8 func_no);
+	int	(*set_msix)(struct pci_epc *epc, u8 func_no, u16 interrupts);
+	int	(*get_msix)(struct pci_epc *epc, u8 func_no);
 	int	(*raise_irq)(struct pci_epc *epc, u8 func_no,
 			     enum pci_epc_irq_type type, u8 interrupt_num);
 	int	(*start)(struct pci_epc *epc);
@@ -144,6 +151,8 @@ void pci_epc_unmap_addr(struct pci_epc *epc, u8 func_no,
 			phys_addr_t phys_addr);
 int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts);
 int pci_epc_get_msi(struct pci_epc *epc, u8 func_no);
+int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts);
+int pci_epc_get_msix(struct pci_epc *epc, u8 func_no);
 int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no,
 		      enum pci_epc_irq_type type, u8 interrupt_num);
 int pci_epc_start(struct pci_epc *epc);
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 4e7764935fa8..ec02f58758c8 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -119,6 +119,7 @@ struct pci_epf {
 	struct pci_epf_header	*header;
 	struct pci_epf_bar	bar[6];
 	u8			msi_interrupts;
+	u16			msix_interrupts;
 	u8			func_no;
 
 	struct pci_epc		*epc;
-- 
cgit v1.2.3


From d3c70a98d7d63cae02d50ebfafea04264a767401 Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Date: Thu, 19 Jul 2018 10:32:13 +0200
Subject: PCI: Update xxx_pcie_ep_raise_irq() and pci_epc_raise_irq()
 signatures

Change {cdns, dra7xx, artpec6, dw, rockchip}_pcie_ep_raise_irq() and
pci_epc_raise_irq() signature, namely the interrupt_num variable type
from u8 to u16 to accommodate 2048 maximum MSI-X interrupts.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Alan Douglas <adouglas@cadence.com>
Acked-by: Shawn Lin <shawn.lin@rock-chips.com>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: Joao Pinto <jpinto@synopsys.com>
Acked-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 drivers/pci/controller/dwc/pci-dra7xx.c           | 2 +-
 drivers/pci/controller/dwc/pcie-artpec6.c         | 2 +-
 drivers/pci/controller/dwc/pcie-designware-ep.c   | 2 +-
 drivers/pci/controller/dwc/pcie-designware-plat.c | 2 +-
 drivers/pci/controller/dwc/pcie-designware.h      | 2 +-
 drivers/pci/controller/pcie-cadence-ep.c          | 3 ++-
 drivers/pci/controller/pcie-rockchip-ep.c         | 2 +-
 drivers/pci/endpoint/pci-epc-core.c               | 8 ++++----
 include/linux/pci-epc.h                           | 6 +++---
 9 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c
index 345aab56ce8b..ce9224a36f62 100644
--- a/drivers/pci/controller/dwc/pci-dra7xx.c
+++ b/drivers/pci/controller/dwc/pci-dra7xx.c
@@ -370,7 +370,7 @@ static void dra7xx_pcie_raise_msi_irq(struct dra7xx_pcie *dra7xx,
 }
 
 static int dra7xx_pcie_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				 enum pci_epc_irq_type type, u8 interrupt_num)
+				 enum pci_epc_irq_type type, u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 	struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pci);
diff --git a/drivers/pci/controller/dwc/pcie-artpec6.c b/drivers/pci/controller/dwc/pcie-artpec6.c
index 128b182648b3..dba83abfe764 100644
--- a/drivers/pci/controller/dwc/pcie-artpec6.c
+++ b/drivers/pci/controller/dwc/pcie-artpec6.c
@@ -427,7 +427,7 @@ static void artpec6_pcie_ep_init(struct dw_pcie_ep *ep)
 }
 
 static int artpec6_pcie_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				  enum pci_epc_irq_type type, u8 interrupt_num)
+				  enum pci_epc_irq_type type, u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c
index 04092a7aba89..69d039de2af6 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -242,7 +242,7 @@ static int dw_pcie_ep_set_msi(struct pci_epc *epc, u8 func_no, u8 encode_int)
 }
 
 static int dw_pcie_ep_raise_irq(struct pci_epc *epc, u8 func_no,
-				enum pci_epc_irq_type type, u8 interrupt_num)
+				enum pci_epc_irq_type type, u16 interrupt_num)
 {
 	struct dw_pcie_ep *ep = epc_get_drvdata(epc);
 
diff --git a/drivers/pci/controller/dwc/pcie-designware-plat.c b/drivers/pci/controller/dwc/pcie-designware-plat.c
index a37dc92a03c7..cca69ac63319 100644
--- a/drivers/pci/controller/dwc/pcie-designware-plat.c
+++ b/drivers/pci/controller/dwc/pcie-designware-plat.c
@@ -81,7 +81,7 @@ static void dw_plat_pcie_ep_init(struct dw_pcie_ep *ep)
 
 static int dw_plat_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
 				     enum pci_epc_irq_type type,
-				     u8 interrupt_num)
+				     u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h
index bee4e2535a61..9d581c077329 100644
--- a/drivers/pci/controller/dwc/pcie-designware.h
+++ b/drivers/pci/controller/dwc/pcie-designware.h
@@ -191,7 +191,7 @@ enum dw_pcie_as_type {
 struct dw_pcie_ep_ops {
 	void	(*ep_init)(struct dw_pcie_ep *ep);
 	int	(*raise_irq)(struct dw_pcie_ep *ep, u8 func_no,
-			     enum pci_epc_irq_type type, u8 interrupt_num);
+			     enum pci_epc_irq_type type, u16 interrupt_num);
 };
 
 struct dw_pcie_ep {
diff --git a/drivers/pci/controller/pcie-cadence-ep.c b/drivers/pci/controller/pcie-cadence-ep.c
index e3fe4124e3af..208d11f27d5d 100644
--- a/drivers/pci/controller/pcie-cadence-ep.c
+++ b/drivers/pci/controller/pcie-cadence-ep.c
@@ -363,7 +363,8 @@ static int cdns_pcie_ep_send_msi_irq(struct cdns_pcie_ep *ep, u8 fn,
 }
 
 static int cdns_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn,
-				  enum pci_epc_irq_type type, u8 interrupt_num)
+				  enum pci_epc_irq_type type,
+				  u16 interrupt_num)
 {
 	struct cdns_pcie_ep *ep = epc_get_drvdata(epc);
 
diff --git a/drivers/pci/controller/pcie-rockchip-ep.c b/drivers/pci/controller/pcie-rockchip-ep.c
index 6beba8ed7b84..b8163c56a142 100644
--- a/drivers/pci/controller/pcie-rockchip-ep.c
+++ b/drivers/pci/controller/pcie-rockchip-ep.c
@@ -472,7 +472,7 @@ static int rockchip_pcie_ep_send_msi_irq(struct rockchip_pcie_ep *ep, u8 fn,
 
 static int rockchip_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn,
 				      enum pci_epc_irq_type type,
-				      u8 interrupt_num)
+				      u16 interrupt_num)
 {
 	struct rockchip_pcie_ep *ep = epc_get_drvdata(epc);
 
diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 7d77bd0e5d4a..c72e656e7e29 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -131,13 +131,13 @@ EXPORT_SYMBOL_GPL(pci_epc_start);
  * pci_epc_raise_irq() - interrupt the host system
  * @epc: the EPC device which has to interrupt the host
  * @func_no: the endpoint function number in the EPC device
- * @type: specify the type of interrupt; legacy or MSI
- * @interrupt_num: the MSI interrupt number
+ * @type: specify the type of interrupt; legacy, MSI or MSI-X
+ * @interrupt_num: the MSI or MSI-X interrupt number
  *
- * Invoke to raise an MSI or legacy interrupt
+ * Invoke to raise an legacy, MSI or MSI-X interrupt
  */
 int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no,
-		      enum pci_epc_irq_type type, u8 interrupt_num)
+		      enum pci_epc_irq_type type, u16 interrupt_num)
 {
 	int ret;
 	unsigned long flags;
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 89f079f582df..bb2395b56f13 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -35,7 +35,7 @@ enum pci_epc_irq_type {
  *	     MSI-X capability register
  * @get_msix: ops to get the number of MSI-X interrupts allocated by the RC
  *	     from the MSI-X capability register
- * @raise_irq: ops to raise a legacy or MSI interrupt
+ * @raise_irq: ops to raise a legacy, MSI or MSI-X interrupt
  * @start: ops to start the PCI link
  * @stop: ops to stop the PCI link
  * @owner: the module owner containing the ops
@@ -56,7 +56,7 @@ struct pci_epc_ops {
 	int	(*set_msix)(struct pci_epc *epc, u8 func_no, u16 interrupts);
 	int	(*get_msix)(struct pci_epc *epc, u8 func_no);
 	int	(*raise_irq)(struct pci_epc *epc, u8 func_no,
-			     enum pci_epc_irq_type type, u8 interrupt_num);
+			     enum pci_epc_irq_type type, u16 interrupt_num);
 	int	(*start)(struct pci_epc *epc);
 	void	(*stop)(struct pci_epc *epc);
 	struct module *owner;
@@ -154,7 +154,7 @@ int pci_epc_get_msi(struct pci_epc *epc, u8 func_no);
 int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts);
 int pci_epc_get_msix(struct pci_epc *epc, u8 func_no);
 int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no,
-		      enum pci_epc_irq_type type, u8 interrupt_num);
+		      enum pci_epc_irq_type type, u16 interrupt_num);
 int pci_epc_start(struct pci_epc *epc);
 void pci_epc_stop(struct pci_epc *epc);
 struct pci_epc *pci_epc_get(const char *epc_name);
-- 
cgit v1.2.3


From c2e00e31087e58f6c49b90b4702fc3df4fad6a83 Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Date: Thu, 19 Jul 2018 10:32:19 +0200
Subject: pci-epf-test/pci_endpoint_test: Add MSI-X support

Add MSI-X support and update driver documentation accordingly.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 Documentation/PCI/endpoint/pci-endpoint.txt       |  4 ++--
 Documentation/PCI/endpoint/pci-test-function.txt  |  4 +++-
 Documentation/PCI/endpoint/pci-test-howto.txt     | 22 ++++++++++++++---
 Documentation/ioctl/ioctl-number.txt              |  1 +
 Documentation/misc-devices/pci-endpoint-test.txt  |  3 +++
 drivers/misc/pci_endpoint_test.c                  | 29 ++++++++++++++++-------
 drivers/pci/controller/dwc/pcie-designware-plat.c |  1 +
 drivers/pci/endpoint/functions/pci-epf-test.c     | 29 +++++++++++++++++++++--
 include/linux/pci-epc.h                           |  1 +
 include/uapi/linux/pcitest.h                      |  1 +
 10 files changed, 79 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/PCI/endpoint/pci-endpoint.txt b/Documentation/PCI/endpoint/pci-endpoint.txt
index 9b1d66829290..e86a96b66a6a 100644
--- a/Documentation/PCI/endpoint/pci-endpoint.txt
+++ b/Documentation/PCI/endpoint/pci-endpoint.txt
@@ -44,7 +44,7 @@ by the PCI controller driver.
 	 * clear_bar: ops to reset the BAR
 	 * alloc_addr_space: ops to allocate in PCI controller address space
 	 * free_addr_space: ops to free the allocated address space
-	 * raise_irq: ops to raise a legacy or MSI interrupt
+	 * raise_irq: ops to raise a legacy, MSI or MSI-X interrupt
 	 * start: ops to start the PCI link
 	 * stop: ops to stop the PCI link
 
@@ -96,7 +96,7 @@ by the PCI endpoint function driver.
 *) pci_epc_raise_irq()
 
    The PCI endpoint function driver should use pci_epc_raise_irq() to raise
-   Legacy Interrupt or MSI Interrupt.
+   Legacy Interrupt, MSI or MSI-X Interrupt.
 
 *) pci_epc_mem_alloc_addr()
 
diff --git a/Documentation/PCI/endpoint/pci-test-function.txt b/Documentation/PCI/endpoint/pci-test-function.txt
index bf4b5cf6fee6..5916f1f592bb 100644
--- a/Documentation/PCI/endpoint/pci-test-function.txt
+++ b/Documentation/PCI/endpoint/pci-test-function.txt
@@ -36,7 +36,7 @@ that the endpoint device must perform.
 Bitfield Description:
   Bit 0		: raise legacy IRQ
   Bit 1		: raise MSI IRQ
-  Bit 2		: raise MSI-X IRQ (reserved for future implementation)
+  Bit 2		: raise MSI-X IRQ
   Bit 3		: read command (read data from RC buffer)
   Bit 4		: write command (write data to RC buffer)
   Bit 5		: copy command (copy data from one RC buffer to another
@@ -75,6 +75,7 @@ for the READ/WRITE/COPY and raise IRQ (Legacy/MSI) commands.
 Possible types:
  - Legacy	: 0
  - MSI		: 1
+ - MSI-X	: 2
 
 *) PCI_ENDPOINT_TEST_IRQ_NUMBER
 
@@ -83,3 +84,4 @@ This register contains the triggered ID interrupt.
 Admissible values:
  - Legacy	: 0
  - MSI		: [1 .. 32]
+ - MSI-X	: [1 .. 2048]
diff --git a/Documentation/PCI/endpoint/pci-test-howto.txt b/Documentation/PCI/endpoint/pci-test-howto.txt
index 75f48c3bb191..65f1a137e35c 100644
--- a/Documentation/PCI/endpoint/pci-test-howto.txt
+++ b/Documentation/PCI/endpoint/pci-test-howto.txt
@@ -45,9 +45,9 @@ The PCI endpoint framework populates the directory with the following
 configurable fields.
 
 	# ls functions/pci_epf_test/func1
-	  baseclass_code	interrupt_pin	revid		subsys_vendor_id
-	  cache_line_size	msi_interrupts	subclass_code	vendorid
-	  deviceid          	progif_code	subsys_id
+	  baseclass_code	interrupt_pin	progif_code	subsys_id
+	  cache_line_size	msi_interrupts	revid		subsys_vendorid
+	  deviceid          	msix_interrupts	subclass_code	vendorid
 
 The PCI endpoint function driver populates these entries with default values
 when the device is bound to the driver. The pci-epf-test driver populates
@@ -67,6 +67,7 @@ device, the following commands can be used.
 	# echo 0x104c > functions/pci_epf_test/func1/vendorid
 	# echo 0xb500 > functions/pci_epf_test/func1/deviceid
 	# echo 16 > functions/pci_epf_test/func1/msi_interrupts
+	# echo 8 > functions/pci_epf_test/func1/msix_interrupts
 
 1.5 Binding pci-epf-test Device to EP Controller
 
@@ -153,6 +154,21 @@ following commands.
 	MSI30:          NOT OKAY
 	MSI31:          NOT OKAY
 	MSI32:          NOT OKAY
+	MSIX1:          OKAY
+	MSIX2:          OKAY
+	MSIX3:          OKAY
+	MSIX4:          OKAY
+	MSIX5:          OKAY
+	MSIX6:          OKAY
+	MSIX7:          OKAY
+	MSIX8:          OKAY
+	MSIX9:          NOT OKAY
+	MSIX10:         NOT OKAY
+	MSIX11:         NOT OKAY
+	MSIX12:         NOT OKAY
+	MSIX13:         NOT OKAY
+	[...]
+	MSIX2048:       NOT OKAY
 
 	Read Tests
 
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 480c8609dc58..65259d459fd1 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -166,6 +166,7 @@ Code  Seq#(hex)	Include File		Comments
 'P'	all	linux/soundcard.h	conflict!
 'P'	60-6F	sound/sscape_ioctl.h	conflict!
 'P'	00-0F	drivers/usb/class/usblp.c	conflict!
+'P'	01-07	drivers/misc/pci_endpoint_test.c	conflict!
 'Q'	all	linux/soundcard.h
 'R'	00-1F	linux/random.h		conflict!
 'R'	01	linux/rfkill.h		conflict!
diff --git a/Documentation/misc-devices/pci-endpoint-test.txt b/Documentation/misc-devices/pci-endpoint-test.txt
index 4ebc3594b32c..fdfa0f66d3d0 100644
--- a/Documentation/misc-devices/pci-endpoint-test.txt
+++ b/Documentation/misc-devices/pci-endpoint-test.txt
@@ -10,6 +10,7 @@ The PCI driver for the test device performs the following tests
 	*) verifying addresses programmed in BAR
 	*) raise legacy IRQ
 	*) raise MSI IRQ
+	*) raise MSI-X IRQ
 	*) read data
 	*) write data
 	*) copy data
@@ -25,6 +26,8 @@ ioctl
  PCITEST_LEGACY_IRQ: Tests legacy IRQ
  PCITEST_MSI: Tests message signalled interrupts. The MSI number
 	      to be tested should be passed as argument.
+ PCITEST_MSIX: Tests message signalled interrupts. The MSI-X number
+	      to be tested should be passed as argument.
  PCITEST_WRITE: Perform write tests. The size of the buffer should be passed
 		as argument.
  PCITEST_READ: Perform read tests. The size of the buffer should be passed
diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index 349794cbe1f3..f4fef108caff 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -39,13 +39,14 @@
 
 #define IRQ_TYPE_LEGACY				0
 #define IRQ_TYPE_MSI				1
+#define IRQ_TYPE_MSIX				2
 
 #define PCI_ENDPOINT_TEST_MAGIC			0x0
 
 #define PCI_ENDPOINT_TEST_COMMAND		0x4
 #define COMMAND_RAISE_LEGACY_IRQ		BIT(0)
 #define COMMAND_RAISE_MSI_IRQ			BIT(1)
-/* BIT(2) is reserved for raising MSI-X IRQ command */
+#define COMMAND_RAISE_MSIX_IRQ			BIT(2)
 #define COMMAND_READ				BIT(3)
 #define COMMAND_WRITE				BIT(4)
 #define COMMAND_COPY				BIT(5)
@@ -84,7 +85,7 @@ MODULE_PARM_DESC(no_msi, "Disable MSI interrupt in pci_endpoint_test");
 
 static int irq_type = IRQ_TYPE_MSI;
 module_param(irq_type, int, 0444);
-MODULE_PARM_DESC(irq_type, "IRQ mode selection in pci_endpoint_test (0 - Legacy, 1 - MSI)");
+MODULE_PARM_DESC(irq_type, "IRQ mode selection in pci_endpoint_test (0 - Legacy, 1 - MSI, 2 - MSI-X)");
 
 enum pci_barno {
 	BAR_0,
@@ -202,16 +203,18 @@ static bool pci_endpoint_test_legacy_irq(struct pci_endpoint_test *test)
 }
 
 static bool pci_endpoint_test_msi_irq(struct pci_endpoint_test *test,
-				      u8 msi_num)
+				       u16 msi_num, bool msix)
 {
 	u32 val;
 	struct pci_dev *pdev = test->pdev;
 
 	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_TYPE,
-				 IRQ_TYPE_MSI);
+				 msix == false ? IRQ_TYPE_MSI :
+				 IRQ_TYPE_MSIX);
 	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_NUMBER, msi_num);
 	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND,
-				 COMMAND_RAISE_MSI_IRQ);
+				 msix == false ? COMMAND_RAISE_MSI_IRQ :
+				 COMMAND_RAISE_MSIX_IRQ);
 	val = wait_for_completion_timeout(&test->irq_raised,
 					  msecs_to_jiffies(1000));
 	if (!val)
@@ -456,7 +459,8 @@ static long pci_endpoint_test_ioctl(struct file *file, unsigned int cmd,
 		ret = pci_endpoint_test_legacy_irq(test);
 		break;
 	case PCITEST_MSI:
-		ret = pci_endpoint_test_msi_irq(test, arg);
+	case PCITEST_MSIX:
+		ret = pci_endpoint_test_msi_irq(test, arg, cmd == PCITEST_MSIX);
 		break;
 	case PCITEST_WRITE:
 		ret = pci_endpoint_test_write(test, arg);
@@ -542,6 +546,12 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 			dev_err(dev, "Failed to get MSI interrupts\n");
 		test->num_irqs = irq;
 		break;
+	case IRQ_TYPE_MSIX:
+		irq = pci_alloc_irq_vectors(pdev, 1, 2048, PCI_IRQ_MSIX);
+		if (irq < 0)
+			dev_err(dev, "Failed to get MSI-X interrupts\n");
+		test->num_irqs = irq;
+		break;
 	default:
 		dev_err(dev, "Invalid IRQ type selected\n");
 	}
@@ -558,8 +568,9 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev,
 				       pci_endpoint_test_irqhandler,
 				       IRQF_SHARED, DRV_MODULE_NAME, test);
 		if (err)
-			dev_err(dev, "failed to request IRQ %d for MSI %d\n",
-				pci_irq_vector(pdev, i), i + 1);
+			dev_err(dev, "Failed to request IRQ %d for MSI%s %d\n",
+				pci_irq_vector(pdev, i),
+				irq_type == IRQ_TYPE_MSIX ? "-X" : "", i + 1);
 	}
 
 	for (bar = BAR_0; bar <= BAR_5; bar++) {
@@ -625,6 +636,7 @@ err_iounmap:
 
 err_disable_msi:
 	pci_disable_msi(pdev);
+	pci_disable_msix(pdev);
 	pci_release_regions(pdev);
 
 err_disable_pdev:
@@ -656,6 +668,7 @@ static void pci_endpoint_test_remove(struct pci_dev *pdev)
 	for (i = 0; i < test->num_irqs; i++)
 		devm_free_irq(&pdev->dev, pci_irq_vector(pdev, i), test);
 	pci_disable_msi(pdev);
+	pci_disable_msix(pdev);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
 }
diff --git a/drivers/pci/controller/dwc/pcie-designware-plat.c b/drivers/pci/controller/dwc/pcie-designware-plat.c
index 3f8a3aa3a91e..c12bf794d69c 100644
--- a/drivers/pci/controller/dwc/pcie-designware-plat.c
+++ b/drivers/pci/controller/dwc/pcie-designware-plat.c
@@ -77,6 +77,7 @@ static void dw_plat_pcie_ep_init(struct dw_pcie_ep *ep)
 		dw_pcie_ep_reset_bar(pci, bar);
 
 	epc->features |= EPC_FEATURE_NO_LINKUP_NOTIFIER;
+	epc->features |= EPC_FEATURE_MSIX_AVAILABLE;
 }
 
 static int dw_plat_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index db4b23672004..3e86fa3c7da3 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -20,10 +20,11 @@
 
 #define IRQ_TYPE_LEGACY			0
 #define IRQ_TYPE_MSI			1
+#define IRQ_TYPE_MSIX			2
 
 #define COMMAND_RAISE_LEGACY_IRQ	BIT(0)
 #define COMMAND_RAISE_MSI_IRQ		BIT(1)
-/* BIT(2) is reserved for raising MSI-X IRQ command */
+#define COMMAND_RAISE_MSIX_IRQ		BIT(2)
 #define COMMAND_READ			BIT(3)
 #define COMMAND_WRITE			BIT(4)
 #define COMMAND_COPY			BIT(5)
@@ -47,6 +48,7 @@ struct pci_epf_test {
 	struct pci_epf		*epf;
 	enum pci_barno		test_reg_bar;
 	bool			linkup_notifier;
+	bool			msix_available;
 	struct delayed_work	cmd_handler;
 };
 
@@ -266,6 +268,9 @@ static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test, u8 irq_type,
 	case IRQ_TYPE_MSI:
 		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSI, irq);
 		break;
+	case IRQ_TYPE_MSIX:
+		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSIX, irq);
+		break;
 	default:
 		dev_err(dev, "Failed to raise IRQ, unknown type\n");
 		break;
@@ -292,7 +297,7 @@ static void pci_epf_test_cmd_handler(struct work_struct *work)
 	reg->command = 0;
 	reg->status = 0;
 
-	if (reg->irq_type > IRQ_TYPE_MSI) {
+	if (reg->irq_type > IRQ_TYPE_MSIX) {
 		dev_err(dev, "Failed to detect IRQ type\n");
 		goto reset_handler;
 	}
@@ -346,6 +351,16 @@ static void pci_epf_test_cmd_handler(struct work_struct *work)
 		goto reset_handler;
 	}
 
+	if (command & COMMAND_RAISE_MSIX_IRQ) {
+		count = pci_epc_get_msix(epc, epf->func_no);
+		if (reg->irq_number > count || count <= 0)
+			goto reset_handler;
+		reg->status = STATUS_IRQ_RAISED;
+		pci_epc_raise_irq(epc, epf->func_no, PCI_EPC_IRQ_MSIX,
+				  reg->irq_number);
+		goto reset_handler;
+	}
+
 reset_handler:
 	queue_delayed_work(kpcitest_workqueue, &epf_test->cmd_handler,
 			   msecs_to_jiffies(1));
@@ -459,6 +474,8 @@ static int pci_epf_test_bind(struct pci_epf *epf)
 	else
 		epf_test->linkup_notifier = true;
 
+	epf_test->msix_available = epc->features & EPC_FEATURE_MSIX_AVAILABLE;
+
 	epf_test->test_reg_bar = EPC_FEATURE_GET_BAR(epc->features);
 
 	ret = pci_epc_write_header(epc, epf->func_no, header);
@@ -481,6 +498,14 @@ static int pci_epf_test_bind(struct pci_epf *epf)
 		return ret;
 	}
 
+	if (epf_test->msix_available) {
+		ret = pci_epc_set_msix(epc, epf->func_no, epf->msix_interrupts);
+		if (ret) {
+			dev_err(dev, "MSI-X configuration failed\n");
+			return ret;
+		}
+	}
+
 	if (!epf_test->linkup_notifier)
 		queue_work(kpcitest_workqueue, &epf_test->cmd_handler.work);
 
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index bb2395b56f13..37dab8116901 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -102,6 +102,7 @@ struct pci_epc {
 
 #define EPC_FEATURE_NO_LINKUP_NOTIFIER		BIT(0)
 #define EPC_FEATURE_BAR_MASK			(BIT(1) | BIT(2) | BIT(3))
+#define EPC_FEATURE_MSIX_AVAILABLE		BIT(4)
 #define EPC_FEATURE_SET_BAR(features, bar)	\
 		(features |= (EPC_FEATURE_BAR_MASK & (bar << 1)))
 #define EPC_FEATURE_GET_BAR(features)		\
diff --git a/include/uapi/linux/pcitest.h b/include/uapi/linux/pcitest.h
index 953cf036cb26..d746fb159dcd 100644
--- a/include/uapi/linux/pcitest.h
+++ b/include/uapi/linux/pcitest.h
@@ -16,5 +16,6 @@
 #define PCITEST_WRITE		_IOW('P', 0x4, unsigned long)
 #define PCITEST_READ		_IOW('P', 0x5, unsigned long)
 #define PCITEST_COPY		_IOW('P', 0x6, unsigned long)
+#define PCITEST_MSIX		_IOW('P', 0x7, int)
 
 #endif /* __UAPI_LINUX_PCITEST_H */
-- 
cgit v1.2.3


From d70420bcd447e5400a0116b20b9b6088bb32448e Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 18 Jul 2018 22:33:02 +0200
Subject: mtd: adapt misleading comment in mtd_oob_ops structure

A comment in the kernel doc of the mtd_oob_ops structure tells that it
is not possible to write more than one page with OOB. This is actually
true for only a few MTD devices like 'onenand' but it is definitely not
a general limitation. While this would benefit to be handled elsewhere
either by the MTD layer or by the limited drivers, let's update this
comment to reflect the reality.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 include/linux/mtd/mtd.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index a86c4fa93115..cd0be91bdefa 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -67,9 +67,11 @@ struct mtd_erase_region_info {
  * @datbuf:	data buffer - if NULL only oob data are read/written
  * @oobbuf:	oob data buffer
  *
- * Note, it is allowed to read more than one OOB area at one go, but not write.
- * The interface assumes that the OOB write requests program only one page's
- * OOB area.
+ * Note, some MTD drivers do not allow you to write more than one OOB area at
+ * one go. If you try to do that on such an MTD device, -EINVAL will be
+ * returned. If you want to make your implementation portable on all kind of MTD
+ * devices you should split the write request into several sub-requests when the
+ * request crosses a page boundary.
  */
 struct mtd_oob_ops {
 	unsigned int	mode;
-- 
cgit v1.2.3


From 0d6030ac041f6835974deb88a1a9c299b4adc3ad Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Wed, 18 Jul 2018 10:42:17 +0200
Subject: mtd: rawnand: Expose _notsupp() helpers for raw page accessors

Some implementations simply can't disable their ECC engine. Expose
helpers returning -ENOTSUPP so that the caller knows that raw accesses
are not supported instead of silently falling back to non-raw
accessors.

Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c | 33 +++++++++++++++++++++++++++++++++
 include/linux/mtd/rawnand.h      |  4 ++++
 2 files changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 4fa5e20d9690..c4b74630f4c5 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -2966,6 +2966,23 @@ int nand_check_erased_ecc_chunk(void *data, int datalen,
 }
 EXPORT_SYMBOL(nand_check_erased_ecc_chunk);
 
+/**
+ * nand_read_page_raw_notsupp - dummy read raw page function
+ * @mtd: mtd info structure
+ * @chip: nand chip info structure
+ * @buf: buffer to store read data
+ * @oob_required: caller requires OOB data read to chip->oob_poi
+ * @page: page number to read
+ *
+ * Returns -ENOTSUPP unconditionally.
+ */
+int nand_read_page_raw_notsupp(struct mtd_info *mtd, struct nand_chip *chip,
+			       u8 *buf, int oob_required, int page)
+{
+	return -ENOTSUPP;
+}
+EXPORT_SYMBOL(nand_read_page_raw_notsupp);
+
 /**
  * nand_read_page_raw - [INTERN] read raw page data without ecc
  * @mtd: mtd info structure
@@ -3960,6 +3977,22 @@ static int nand_read_oob(struct mtd_info *mtd, loff_t from,
 	return ret;
 }
 
+/**
+ * nand_write_page_raw_notsupp - dummy raw page write function
+ * @mtd: mtd info structure
+ * @chip: nand chip info structure
+ * @buf: data buffer
+ * @oob_required: must write chip->oob_poi to OOB
+ * @page: page number to write
+ *
+ * Returns -ENOTSUPP unconditionally.
+ */
+int nand_write_page_raw_notsupp(struct mtd_info *mtd, struct nand_chip *chip,
+				const u8 *buf, int oob_required, int page)
+{
+	return -ENOTSUPP;
+}
+EXPORT_SYMBOL(nand_write_page_raw_notsupp);
 
 /**
  * nand_write_page_raw - [INTERN] raw page write function
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 11c2426fc363..f60fad29eae6 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1681,10 +1681,14 @@ int nand_get_set_features_notsupp(struct mtd_info *mtd, struct nand_chip *chip,
 /* Default read_page_raw implementation */
 int nand_read_page_raw(struct mtd_info *mtd, struct nand_chip *chip,
 		       uint8_t *buf, int oob_required, int page);
+int nand_read_page_raw_notsupp(struct mtd_info *mtd, struct nand_chip *chip,
+			       u8 *buf, int oob_required, int page);
 
 /* Default write_page_raw implementation */
 int nand_write_page_raw(struct mtd_info *mtd, struct nand_chip *chip,
 			const uint8_t *buf, int oob_required, int page);
+int nand_write_page_raw_notsupp(struct mtd_info *mtd, struct nand_chip *chip,
+				const u8 *buf, int oob_required, int page);
 
 /* Reset and initialize a NAND device */
 int nand_reset(struct nand_chip *chip, int chipnr);
-- 
cgit v1.2.3


From 60ed982a4e78ff938824a750dbac8a10e5b472ef Mon Sep 17 00:00:00 2001
From: Rajat Jain <rajatja@google.com>
Date: Thu, 21 Jun 2018 16:48:26 -0700
Subject: PCI/AER: Move internal declarations to drivers/pci/pci.h

Since pci_aer_init() and pci_no_aer() are used only internally, move their
declarations to the PCI internal header file.  Also, no one cares about
return value of pci_aer_init(), so make it void.

Signed-off-by: Rajat Jain <rajatja@google.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.h      | 8 ++++++++
 drivers/pci/pcie/aer.c | 4 ++--
 include/linux/pci.h    | 4 ----
 3 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 4f723442f602..52bc5b350dfb 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -480,4 +480,12 @@ static inline int devm_of_pci_get_host_bridge_resources(struct device *dev,
 }
 #endif
 
+#ifdef CONFIG_PCIEAER
+void pci_no_aer(void);
+void pci_aer_init(struct pci_dev *dev);
+#else
+static inline void pci_no_aer(void) { }
+static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; }
+#endif
+
 #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index e6d5255d718c..0c6fe22eaf75 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -382,10 +382,10 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
 	return 0;
 }
 
-int pci_aer_init(struct pci_dev *dev)
+void pci_aer_init(struct pci_dev *dev)
 {
 	dev->aer_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
-	return pci_cleanup_aer_error_status_regs(dev);
+	pci_cleanup_aer_error_status_regs(dev);
 }
 
 #define AER_AGENT_RECEIVER		0
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 340029b2fb38..b4ffea05c999 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1468,13 +1468,9 @@ static inline bool pcie_aspm_support_enabled(void) { return false; }
 #endif
 
 #ifdef CONFIG_PCIEAER
-void pci_no_aer(void);
 bool pci_aer_available(void);
-int pci_aer_init(struct pci_dev *dev);
 #else
-static inline void pci_no_aer(void) { }
 static inline bool pci_aer_available(void) { return false; }
-static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; }
 #endif
 
 #ifdef CONFIG_PCIE_ECRC
-- 
cgit v1.2.3


From db89ccbe52c7885644ba578c7771e57620f879b1 Mon Sep 17 00:00:00 2001
From: Rajat Jain <rajatja@google.com>
Date: Sat, 30 Jun 2018 15:07:17 -0500
Subject: PCI/AER: Define aer_stats structure for AER capable devices

Define a structure to hold the AER statistics.  There are 2 groups of
statistics: dev_* counters that are to be collected for all AER capable
devices and rootport_* counters that are collected for all (AER capable)
rootports only.  Allocate and free this structure when device is added or
released (thus counters survive the lifetime of the device).

Signed-off-by: Rajat Jain <rajatja@google.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.h      |  2 ++
 drivers/pci/pcie/aer.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++--
 drivers/pci/probe.c    |  1 +
 include/linux/pci.h    |  1 +
 4 files changed, 55 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 52bc5b350dfb..1877a14e06a9 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -483,9 +483,11 @@ static inline int devm_of_pci_get_host_bridge_resources(struct device *dev,
 #ifdef CONFIG_PCIEAER
 void pci_no_aer(void);
 void pci_aer_init(struct pci_dev *dev);
+void pci_aer_exit(struct pci_dev *dev);
 #else
 static inline void pci_no_aer(void) { }
 static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; }
+static inline void pci_aer_exit(struct pci_dev *d) { }
 #endif
 
 #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 0c6fe22eaf75..fe1b9d22a331 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -32,6 +32,9 @@
 
 #define AER_ERROR_SOURCES_MAX		100
 
+#define AER_MAX_TYPEOF_COR_ERRS		16	/* as per PCI_ERR_COR_STATUS */
+#define AER_MAX_TYPEOF_UNCOR_ERRS	26	/* as per PCI_ERR_UNCOR_STATUS*/
+
 struct aer_err_source {
 	unsigned int status;
 	unsigned int id;
@@ -56,6 +59,42 @@ struct aer_rpc {
 					 */
 };
 
+/* AER stats for the device */
+struct aer_stats {
+
+	/*
+	 * Fields for all AER capable devices. They indicate the errors
+	 * "as seen by this device". Note that this may mean that if an
+	 * end point is causing problems, the AER counters may increment
+	 * at its link partner (e.g. root port) because the errors will be
+	 * "seen" by the link partner and not the the problematic end point
+	 * itself (which may report all counters as 0 as it never saw any
+	 * problems).
+	 */
+	/* Counters for different type of correctable errors */
+	u64 dev_cor_errs[AER_MAX_TYPEOF_COR_ERRS];
+	/* Counters for different type of fatal uncorrectable errors */
+	u64 dev_fatal_errs[AER_MAX_TYPEOF_UNCOR_ERRS];
+	/* Counters for different type of nonfatal uncorrectable errors */
+	u64 dev_nonfatal_errs[AER_MAX_TYPEOF_UNCOR_ERRS];
+	/* Total number of ERR_COR sent by this device */
+	u64 dev_total_cor_errs;
+	/* Total number of ERR_FATAL sent by this device */
+	u64 dev_total_fatal_errs;
+	/* Total number of ERR_NONFATAL sent by this device */
+	u64 dev_total_nonfatal_errs;
+
+	/*
+	 * Fields for Root ports & root complex event collectors only, these
+	 * indicate the total number of ERR_COR, ERR_FATAL, and ERR_NONFATAL
+	 * messages received by the root port / event collector, INCLUDING the
+	 * ones that are generated internally (by the rootport itself)
+	 */
+	u64 rootport_total_cor_errs;
+	u64 rootport_total_fatal_errs;
+	u64 rootport_total_nonfatal_errs;
+};
+
 #define AER_LOG_TLP_MASKS		(PCI_ERR_UNC_POISON_TLP|	\
 					PCI_ERR_UNC_ECRC|		\
 					PCI_ERR_UNC_UNSUP|		\
@@ -385,9 +424,19 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
 void pci_aer_init(struct pci_dev *dev)
 {
 	dev->aer_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
+
+	if (dev->aer_cap)
+		dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL);
+
 	pci_cleanup_aer_error_status_regs(dev);
 }
 
+void pci_aer_exit(struct pci_dev *dev)
+{
+	kfree(dev->aer_stats);
+	dev->aer_stats = NULL;
+}
+
 #define AER_AGENT_RECEIVER		0
 #define AER_AGENT_REQUESTER		1
 #define AER_AGENT_COMPLETER		2
@@ -438,7 +487,7 @@ static const char *aer_error_layer[] = {
 	"Transaction Layer"
 };
 
-static const char *aer_correctable_error_string[] = {
+static const char *aer_correctable_error_string[AER_MAX_TYPEOF_COR_ERRS] = {
 	"RxErr",			/* Bit Position 0	*/
 	NULL,
 	NULL,
@@ -457,7 +506,7 @@ static const char *aer_correctable_error_string[] = {
 	"HeaderOF",			/* Bit Position 15	*/
 };
 
-static const char *aer_uncorrectable_error_string[] = {
+static const char *aer_uncorrectable_error_string[AER_MAX_TYPEOF_UNCOR_ERRS] = {
 	"Undefined",			/* Bit Position 0	*/
 	NULL,
 	NULL,
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ac876e32de4b..48edd0c9e4bc 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2064,6 +2064,7 @@ static void pci_configure_device(struct pci_dev *dev)
 
 static void pci_release_capabilities(struct pci_dev *dev)
 {
+	pci_aer_exit(dev);
 	pci_vpd_release(dev);
 	pci_iov_release(dev);
 	pci_free_cap_save_buffers(dev);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b4ffea05c999..6bc0aa0fc33f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -299,6 +299,7 @@ struct pci_dev {
 	u8		hdr_type;	/* PCI header type (`multi' flag masked out) */
 #ifdef CONFIG_PCIEAER
 	u16		aer_cap;	/* AER capability offset */
+	struct aer_stats *aer_stats;	/* AER stats for this device */
 #endif
 	u8		pcie_cap;	/* PCIe capability offset */
 	u8		msi_cap;	/* MSI capability offset */
-- 
cgit v1.2.3


From 3eca993740b8eb40f514b90b1877a4dbcf0a6710 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Thu, 19 Jul 2018 16:55:34 -0400
Subject: timekeeping: Replace read_boot_clock64() with
 read_persistent_wall_and_boot_offset()

If architecture does not support exact boot time, it is challenging to
estimate boot time without having a reference to the current persistent
clock value. Yet, it cannot read the persistent clock time again, because
this may lead to math discrepancies with the caller of read_boot_clock64()
who have read the persistent clock at a different time.

This is why it is better to provide two values simultaneously: the
persistent clock value, and the boot time.

Replace read_boot_clock64() with:
read_persistent_wall_and_boot_offset(wall_time, boot_offset)

Where wall_time is returned by read_persistent_clock() And boot_offset is
wall_time - boot time, which defaults to 0.

Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: steven.sistare@oracle.com
Cc: daniel.m.jordan@oracle.com
Cc: linux@armlinux.org.uk
Cc: schwidefsky@de.ibm.com
Cc: heiko.carstens@de.ibm.com
Cc: john.stultz@linaro.org
Cc: sboyd@codeaurora.org
Cc: hpa@zytor.com
Cc: douly.fnst@cn.fujitsu.com
Cc: peterz@infradead.org
Cc: prarit@redhat.com
Cc: feng.tang@intel.com
Cc: pmladek@suse.com
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: linux-s390@vger.kernel.org
Cc: boris.ostrovsky@oracle.com
Cc: jgross@suse.com
Cc: pbonzini@redhat.com
Link: https://lkml.kernel.org/r/20180719205545.16512-16-pasha.tatashin@oracle.com
---
 include/linux/timekeeping.h |  3 ++-
 kernel/time/timekeeping.c   | 59 +++++++++++++++++++++++----------------------
 2 files changed, 32 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 86bc2026efce..686bc27acef0 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -243,7 +243,8 @@ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot);
 extern int persistent_clock_is_local;
 
 extern void read_persistent_clock64(struct timespec64 *ts);
-extern void read_boot_clock64(struct timespec64 *ts);
+void read_persistent_clock_and_boot_offset(struct timespec64 *wall_clock,
+					   struct timespec64 *boot_offset);
 extern int update_persistent_clock64(struct timespec64 now);
 
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4786df904c22..cb738f825c12 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -17,6 +17,7 @@
 #include <linux/nmi.h>
 #include <linux/sched.h>
 #include <linux/sched/loadavg.h>
+#include <linux/sched/clock.h>
 #include <linux/syscore_ops.h>
 #include <linux/clocksource.h>
 #include <linux/jiffies.h>
@@ -1496,18 +1497,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64)
 }
 
 /**
- * read_boot_clock64 -  Return time of the system start.
+ * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
+ *                                        from the boot.
  *
  * Weak dummy function for arches that do not yet support it.
- * Function to read the exact time the system has been started.
- * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported.
- *
- *  XXX - Do be sure to remove it once all arches implement it.
+ * wall_time	- current time as returned by persistent clock
+ * boot_offset	- offset that is defined as wall_time - boot_time
+ *		  default to 0.
  */
-void __weak read_boot_clock64(struct timespec64 *ts)
+void __weak __init
+read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
+				     struct timespec64 *boot_offset)
 {
-	ts->tv_sec = 0;
-	ts->tv_nsec = 0;
+	read_persistent_clock64(wall_time);
+	*boot_offset = (struct timespec64){0};
 }
 
 /* Flag for if timekeeping_resume() has injected sleeptime */
@@ -1521,28 +1524,29 @@ static bool persistent_clock_exists;
  */
 void __init timekeeping_init(void)
 {
+	struct timespec64 wall_time, boot_offset, wall_to_mono;
 	struct timekeeper *tk = &tk_core.timekeeper;
 	struct clocksource *clock;
 	unsigned long flags;
-	struct timespec64 now, boot, tmp;
-
-	read_persistent_clock64(&now);
-	if (!timespec64_valid_strict(&now)) {
-		pr_warn("WARNING: Persistent clock returned invalid value!\n"
-			"         Check your CMOS/BIOS settings.\n");
-		now.tv_sec = 0;
-		now.tv_nsec = 0;
-	} else if (now.tv_sec || now.tv_nsec)
-		persistent_clock_exists = true;
 
-	read_boot_clock64(&boot);
-	if (!timespec64_valid_strict(&boot)) {
-		pr_warn("WARNING: Boot clock returned invalid value!\n"
-			"         Check your CMOS/BIOS settings.\n");
-		boot.tv_sec = 0;
-		boot.tv_nsec = 0;
+	read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
+	if (timespec64_valid_strict(&wall_time) &&
+	    timespec64_to_ns(&wall_time) > 0) {
+		persistent_clock_exists = true;
+	} else {
+		pr_warn("Persistent clock returned invalid value");
+		wall_time = (struct timespec64){0};
 	}
 
+	if (timespec64_compare(&wall_time, &boot_offset) < 0)
+		boot_offset = (struct timespec64){0};
+
+	/*
+	 * We want set wall_to_mono, so the following is true:
+	 * wall time + wall_to_mono = boot time
+	 */
+	wall_to_mono = timespec64_sub(boot_offset, wall_time);
+
 	raw_spin_lock_irqsave(&timekeeper_lock, flags);
 	write_seqcount_begin(&tk_core.seq);
 	ntp_init();
@@ -1552,13 +1556,10 @@ void __init timekeeping_init(void)
 		clock->enable(clock);
 	tk_setup_internals(tk, clock);
 
-	tk_set_xtime(tk, &now);
+	tk_set_xtime(tk, &wall_time);
 	tk->raw_sec = 0;
-	if (boot.tv_sec == 0 && boot.tv_nsec == 0)
-		boot = tk_xtime(tk);
 
-	set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
-	tk_set_wall_to_mono(tk, tmp);
+	tk_set_wall_to_mono(tk, wall_to_mono);
 
 	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
 
-- 
cgit v1.2.3


From 5d2a4e91a541cb04d20d11602f0f9340291322ac Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Thu, 19 Jul 2018 16:55:41 -0400
Subject: sched/clock: Move sched clock initialization and merge with generic
 clock

sched_clock_postinit() initializes a generic clock on systems where no
other clock is provided. This function may be called only after
timekeeping_init().

Rename sched_clock_postinit to generic_clock_inti() and call it from
sched_clock_init(). Move the call for sched_clock_init() until after
time_init().

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: steven.sistare@oracle.com
Cc: daniel.m.jordan@oracle.com
Cc: linux@armlinux.org.uk
Cc: schwidefsky@de.ibm.com
Cc: heiko.carstens@de.ibm.com
Cc: john.stultz@linaro.org
Cc: sboyd@codeaurora.org
Cc: hpa@zytor.com
Cc: douly.fnst@cn.fujitsu.com
Cc: prarit@redhat.com
Cc: feng.tang@intel.com
Cc: pmladek@suse.com
Cc: gnomes@lxorguk.ukuu.org.uk
Cc: linux-s390@vger.kernel.org
Cc: boris.ostrovsky@oracle.com
Cc: jgross@suse.com
Cc: pbonzini@redhat.com
Link: https://lkml.kernel.org/r/20180719205545.16512-23-pasha.tatashin@oracle.com
---
 include/linux/sched_clock.h |  5 ++---
 init/main.c                 |  4 ++--
 kernel/sched/clock.c        | 27 +++++++++++++++++----------
 kernel/sched/core.c         |  1 -
 kernel/time/sched_clock.c   |  2 +-
 5 files changed, 22 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h
index 411b52e424e1..abe28d5cb3f4 100644
--- a/include/linux/sched_clock.h
+++ b/include/linux/sched_clock.h
@@ -9,17 +9,16 @@
 #define LINUX_SCHED_CLOCK
 
 #ifdef CONFIG_GENERIC_SCHED_CLOCK
-extern void sched_clock_postinit(void);
+extern void generic_sched_clock_init(void);
 
 extern void sched_clock_register(u64 (*read)(void), int bits,
 				 unsigned long rate);
 #else
-static inline void sched_clock_postinit(void) { }
+static inline void generic_sched_clock_init(void) { }
 
 static inline void sched_clock_register(u64 (*read)(void), int bits,
 					unsigned long rate)
 {
-	;
 }
 #endif
 
diff --git a/init/main.c b/init/main.c
index 3b4ada11ed52..162d931c9511 100644
--- a/init/main.c
+++ b/init/main.c
@@ -79,7 +79,7 @@
 #include <linux/pti.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
-#include <linux/sched_clock.h>
+#include <linux/sched/clock.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/context_tracking.h>
@@ -642,7 +642,7 @@ asmlinkage __visible void __init start_kernel(void)
 	softirq_init();
 	timekeeping_init();
 	time_init();
-	sched_clock_postinit();
+	sched_clock_init();
 	printk_safe_init();
 	perf_event_init();
 	profile_init();
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 10c83e73837a..0e9dbb2d9aea 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -53,6 +53,7 @@
  *
  */
 #include "sched.h"
+#include <linux/sched_clock.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -68,11 +69,6 @@ EXPORT_SYMBOL_GPL(sched_clock);
 
 __read_mostly int sched_clock_running;
 
-void sched_clock_init(void)
-{
-	sched_clock_running = 1;
-}
-
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 /*
  * We must start with !__sched_clock_stable because the unstable -> stable
@@ -199,6 +195,15 @@ void clear_sched_clock_stable(void)
 		__clear_sched_clock_stable();
 }
 
+static void __sched_clock_gtod_offset(void)
+{
+	__gtod_offset = (sched_clock() + __sched_clock_offset) - ktime_get_ns();
+}
+
+void __init sched_clock_init(void)
+{
+	sched_clock_running = 1;
+}
 /*
  * We run this as late_initcall() such that it runs after all built-in drivers,
  * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
@@ -385,8 +390,6 @@ void sched_clock_tick(void)
 
 void sched_clock_tick_stable(void)
 {
-	u64 gtod, clock;
-
 	if (!sched_clock_stable())
 		return;
 
@@ -398,9 +401,7 @@ void sched_clock_tick_stable(void)
 	 * TSC to be unstable, any computation will be computing crap.
 	 */
 	local_irq_disable();
-	gtod = ktime_get_ns();
-	clock = sched_clock();
-	__gtod_offset = (clock + __sched_clock_offset) - gtod;
+	__sched_clock_gtod_offset();
 	local_irq_enable();
 }
 
@@ -434,6 +435,12 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
+void __init sched_clock_init(void)
+{
+	sched_clock_running = 1;
+	generic_sched_clock_init();
+}
+
 u64 sched_clock_cpu(int cpu)
 {
 	if (unlikely(!sched_clock_running))
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe365c9a08e9..552406e9713b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5954,7 +5954,6 @@ void __init sched_init(void)
 	int i, j;
 	unsigned long alloc_size = 0, ptr;
 
-	sched_clock_init();
 	wait_bit_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 2d8f05aad442..cbc72c2c1fca 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -237,7 +237,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
 	pr_debug("Registered %pF as sched_clock source\n", read);
 }
 
-void __init sched_clock_postinit(void)
+void __init generic_sched_clock_init(void)
 {
 	/*
 	 * If no sched_clock() function has been provided at that point,
-- 
cgit v1.2.3


From 381634cad15b711e033a2638d558232b60f753f6 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Thu, 19 Jul 2018 18:04:11 -0500
Subject: PCI: Hide pci_reset_bridge_secondary_bus() from drivers

Rename pci_reset_bridge_secondary_bus() to pci_bridge_secondary_bus_reset()
and move the declaration from linux/pci.h to drivers/pci.h to be used
internally in PCI directory only.

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/hotplug/pciehp_hpc.c |  2 +-
 drivers/pci/pci.c                | 11 +++++------
 drivers/pci/pci.h                |  1 +
 drivers/pci/pcie/aer.c           |  2 +-
 drivers/pci/pcie/err.c           |  2 +-
 include/linux/pci.h              |  1 -
 6 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index bbaa2114f953..8dae23221344 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -746,7 +746,7 @@ int pciehp_reset_slot(struct slot *slot, int probe)
 	if (pciehp_poll_mode)
 		del_timer_sync(&ctrl->poll_timer);
 
-	rc = pci_reset_bridge_secondary_bus(ctrl->pcie->port);
+	rc = pci_bridge_secondary_bus_reset(ctrl->pcie->port);
 
 	pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, stat_mask);
 	pcie_write_cmd_nowait(ctrl, ctrl_mask, ctrl_mask);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 98d149070205..236220cb0f77 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4224,19 +4224,18 @@ void __weak pcibios_reset_secondary_bus(struct pci_dev *dev)
 }
 
 /**
- * pci_reset_bridge_secondary_bus - Reset the secondary bus on a PCI bridge.
+ * pci_bridge_secondary_bus_reset - Reset the secondary bus on a PCI bridge.
  * @dev: Bridge device
  *
  * Use the bridge control register to assert reset on the secondary bus.
  * Devices on the secondary bus are left in power-on state.
  */
-int pci_reset_bridge_secondary_bus(struct pci_dev *dev)
+int pci_bridge_secondary_bus_reset(struct pci_dev *dev)
 {
 	pcibios_reset_secondary_bus(dev);
 
 	return pci_dev_wait(dev, "bus reset", PCIE_RESET_READY_POLL_MS);
 }
-EXPORT_SYMBOL_GPL(pci_reset_bridge_secondary_bus);
 
 static int pci_parent_bus_reset(struct pci_dev *dev, int probe)
 {
@@ -4253,7 +4252,7 @@ static int pci_parent_bus_reset(struct pci_dev *dev, int probe)
 	if (probe)
 		return 0;
 
-	return pci_reset_bridge_secondary_bus(dev->bus->self);
+	return pci_bridge_secondary_bus_reset(dev->bus->self);
 }
 
 static int pci_reset_hotplug_slot(struct hotplug_slot *hotplug, int probe)
@@ -4860,7 +4859,7 @@ static int pci_bus_reset(struct pci_bus *bus, int probe)
 
 	might_sleep();
 
-	ret = pci_reset_bridge_secondary_bus(bus->self);
+	ret = pci_bridge_secondary_bus_reset(bus->self);
 
 	pci_bus_unlock(bus);
 
@@ -4924,7 +4923,7 @@ int pci_try_reset_bus(struct pci_bus *bus)
 
 	if (pci_bus_trylock(bus)) {
 		might_sleep();
-		rc = pci_reset_bridge_secondary_bus(bus->self);
+		rc = pci_bridge_secondary_bus_reset(bus->self);
 		pci_bus_unlock(bus);
 	} else
 		rc = -EAGAIN;
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index c358e7a07f3f..f784263ad587 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -33,6 +33,7 @@ int pci_mmap_fits(struct pci_dev *pdev, int resno, struct vm_area_struct *vmai,
 		  enum pci_mmap_api mmap_api);
 
 int pci_probe_reset_function(struct pci_dev *dev);
+int pci_bridge_secondary_bus_reset(struct pci_dev *dev);
 
 /**
  * struct pci_platform_pm_ops - Firmware PM callbacks
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index bde723db3d65..8c12efca9259 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1314,7 +1314,7 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
 	reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK;
 	pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32);
 
-	rc = pci_reset_bridge_secondary_bus(dev);
+	rc = pci_bridge_secondary_bus_reset(dev);
 	pci_printk(KERN_DEBUG, dev, "Root Port link has been reset\n");
 
 	/* Clear Root Error Status */
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 03075cff86f4..ae72f88d3ca2 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -177,7 +177,7 @@ static pci_ers_result_t default_reset_link(struct pci_dev *dev)
 {
 	int rc;
 
-	rc = pci_reset_bridge_secondary_bus(dev);
+	rc = pci_bridge_secondary_bus_reset(dev);
 	pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n");
 	return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 6ba818449095..03520ff23d73 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1103,7 +1103,6 @@ int pci_reset_bus(struct pci_bus *bus);
 int pci_try_reset_bus(struct pci_bus *bus);
 void pci_reset_secondary_bus(struct pci_dev *dev);
 void pcibios_reset_secondary_bus(struct pci_dev *dev);
-int pci_reset_bridge_secondary_bus(struct pci_dev *dev);
 void pci_update_resource(struct pci_dev *dev, int resno);
 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
-- 
cgit v1.2.3


From 811c5cb37df46b0cd714dbd053d19cdb97d08cff Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Thu, 19 Jul 2018 18:04:12 -0500
Subject: PCI: Unify try slot and bus reset API

Drivers are expected to call pci_try_reset_slot() or pci_try_reset_bus() by
querying if a system supports hotplug or not.  A survey showed that most
drivers don't do this and we are leaking hotplug capability to the user.

Hide pci_try_slot_reset() from drivers and embed into pci_try_bus_reset().
Change pci_try_reset_bus() parameter from struct pci_bus to struct pci_dev.

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/infiniband/hw/hfi1/pcie.c |  2 +-
 drivers/pci/pci.c                 | 21 ++++++++++++++++-----
 drivers/vfio/pci/vfio_pci.c       |  6 ++----
 include/linux/pci.h               |  3 +--
 4 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
index 4570c4dc93d9..df4f2d390be8 100644
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -905,7 +905,7 @@ static int trigger_sbr(struct hfi1_devdata *dd)
 	 * delay after a reset is required.  Per spec requirements,
 	 * the link is either working or not after that point.
 	 */
-	return pci_try_reset_bus(dev->bus);
+	return pci_try_reset_bus(dev);
 }
 
 /*
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 236220cb0f77..a31e6dbf21c3 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4817,12 +4817,12 @@ int pci_reset_slot(struct pci_slot *slot)
 EXPORT_SYMBOL_GPL(pci_reset_slot);
 
 /**
- * pci_try_reset_slot - Try to reset a PCI slot
+ * __pci_try_reset_slot - Try to reset a PCI slot
  * @slot: PCI slot to reset
  *
  * Same as above except return -EAGAIN if the slot cannot be locked
  */
-int pci_try_reset_slot(struct pci_slot *slot)
+static int __pci_try_reset_slot(struct pci_slot *slot)
 {
 	int rc;
 
@@ -4843,7 +4843,6 @@ int pci_try_reset_slot(struct pci_slot *slot)
 
 	return rc;
 }
-EXPORT_SYMBOL_GPL(pci_try_reset_slot);
 
 static int pci_bus_reset(struct pci_bus *bus, int probe)
 {
@@ -4906,12 +4905,12 @@ int pci_reset_bus(struct pci_bus *bus)
 EXPORT_SYMBOL_GPL(pci_reset_bus);
 
 /**
- * pci_try_reset_bus - Try to reset a PCI bus
+ * __pci_try_reset_bus - Try to reset a PCI bus
  * @bus: top level PCI bus to reset
  *
  * Same as above except return -EAGAIN if the bus cannot be locked
  */
-int pci_try_reset_bus(struct pci_bus *bus)
+static int __pci_try_reset_bus(struct pci_bus *bus)
 {
 	int rc;
 
@@ -4932,6 +4931,18 @@ int pci_try_reset_bus(struct pci_bus *bus)
 
 	return rc;
 }
+
+/**
+ * pci_try_reset_bus - Try to reset a PCI bus
+ * @pdev: top level PCI device to reset via slot/bus
+ *
+ * Same as above except return -EAGAIN if the bus cannot be locked
+ */
+int pci_try_reset_bus(struct pci_dev *pdev)
+{
+	return pci_probe_reset_slot(pdev->slot) ?
+	    __pci_try_reset_slot(pdev->slot) : __pci_try_reset_bus(pdev->bus);
+}
 EXPORT_SYMBOL_GPL(pci_try_reset_bus);
 
 /**
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index b423a309a6e0..71018ec9065b 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1010,8 +1010,7 @@ reset_info_exit:
 						    &info, slot);
 		if (!ret)
 			/* User has access, do the reset */
-			ret = slot ? pci_try_reset_slot(vdev->pdev->slot) :
-				     pci_try_reset_bus(vdev->pdev->bus);
+			ret = pci_try_reset_bus(vdev->pdev);
 
 hot_reset_release:
 		for (i--; i >= 0; i--)
@@ -1373,8 +1372,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
 	}
 
 	if (needs_reset)
-		ret = slot ? pci_try_reset_slot(vdev->pdev->slot) :
-			     pci_try_reset_bus(vdev->pdev->bus);
+		ret = pci_try_reset_bus(vdev->pdev);
 
 put_devs:
 	for (i = 0; i < devs.cur_index; i++) {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 03520ff23d73..19fb82559145 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1097,10 +1097,9 @@ int pci_reset_function_locked(struct pci_dev *dev);
 int pci_try_reset_function(struct pci_dev *dev);
 int pci_probe_reset_slot(struct pci_slot *slot);
 int pci_reset_slot(struct pci_slot *slot);
-int pci_try_reset_slot(struct pci_slot *slot);
 int pci_probe_reset_bus(struct pci_bus *bus);
 int pci_reset_bus(struct pci_bus *bus);
-int pci_try_reset_bus(struct pci_bus *bus);
+int pci_try_reset_bus(struct pci_dev *dev);
 void pci_reset_secondary_bus(struct pci_dev *dev);
 void pcibios_reset_secondary_bus(struct pci_dev *dev);
 void pci_update_resource(struct pci_dev *dev, int resno);
-- 
cgit v1.2.3


From fe32e2fa656c29d5d25f959f8e6168ac405d9ab4 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Thu, 19 Jul 2018 18:04:14 -0500
Subject: PCI: Deprecate pci_reset_bus() and pci_reset_slot() functions

pci_reset_bus() and pci_reset_slot() functions are not being used by any
code.  Remove them from the kernel in favor of pci_try_reset_bus() and
pci_try_reset_slot() functions.

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.c   | 53 +----------------------------------------------------
 include/linux/pci.h |  2 --
 2 files changed, 1 insertion(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index a31e6dbf21c3..8ee9f386c1ee 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4784,7 +4784,7 @@ int pci_probe_reset_slot(struct pci_slot *slot)
 EXPORT_SYMBOL_GPL(pci_probe_reset_slot);
 
 /**
- * pci_reset_slot - reset a PCI slot
+ * __pci_try_reset_slot - Try to reset a PCI slot
  * @slot: PCI slot to reset
  *
  * A PCI bus may host multiple slots, each slot may support a reset mechanism
@@ -4796,30 +4796,6 @@ EXPORT_SYMBOL_GPL(pci_probe_reset_slot);
  * through this function.  PCI config space of all devices in the slot and
  * behind the slot is saved before and restored after reset.
  *
- * Return 0 on success, non-zero on error.
- */
-int pci_reset_slot(struct pci_slot *slot)
-{
-	int rc;
-
-	rc = pci_slot_reset(slot, 1);
-	if (rc)
-		return rc;
-
-	pci_slot_save_and_disable(slot);
-
-	rc = pci_slot_reset(slot, 0);
-
-	pci_slot_restore(slot);
-
-	return rc;
-}
-EXPORT_SYMBOL_GPL(pci_reset_slot);
-
-/**
- * __pci_try_reset_slot - Try to reset a PCI slot
- * @slot: PCI slot to reset
- *
  * Same as above except return -EAGAIN if the slot cannot be locked
  */
 static int __pci_try_reset_slot(struct pci_slot *slot)
@@ -4877,33 +4853,6 @@ int pci_probe_reset_bus(struct pci_bus *bus)
 }
 EXPORT_SYMBOL_GPL(pci_probe_reset_bus);
 
-/**
- * pci_reset_bus - reset a PCI bus
- * @bus: top level PCI bus to reset
- *
- * Do a bus reset on the given bus and any subordinate buses, saving
- * and restoring state of all devices.
- *
- * Return 0 on success, non-zero on error.
- */
-int pci_reset_bus(struct pci_bus *bus)
-{
-	int rc;
-
-	rc = pci_bus_reset(bus, 1);
-	if (rc)
-		return rc;
-
-	pci_bus_save_and_disable(bus);
-
-	rc = pci_bus_reset(bus, 0);
-
-	pci_bus_restore(bus);
-
-	return rc;
-}
-EXPORT_SYMBOL_GPL(pci_reset_bus);
-
 /**
  * __pci_try_reset_bus - Try to reset a PCI bus
  * @bus: top level PCI bus to reset
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 19fb82559145..cf53ecb50789 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1096,9 +1096,7 @@ int pci_reset_function(struct pci_dev *dev);
 int pci_reset_function_locked(struct pci_dev *dev);
 int pci_try_reset_function(struct pci_dev *dev);
 int pci_probe_reset_slot(struct pci_slot *slot);
-int pci_reset_slot(struct pci_slot *slot);
 int pci_probe_reset_bus(struct pci_bus *bus);
-int pci_reset_bus(struct pci_bus *bus);
 int pci_try_reset_bus(struct pci_dev *dev);
 void pci_reset_secondary_bus(struct pci_dev *dev);
 void pcibios_reset_secondary_bus(struct pci_dev *dev);
-- 
cgit v1.2.3


From c6a44ba950d147e15fe6dab6455a52f91d8fe625 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Thu, 19 Jul 2018 18:04:15 -0500
Subject: PCI: Rename pci_try_reset_bus() to pci_reset_bus()

Now that the old implementation of pci_reset_bus() is gone, replace
pci_try_reset_bus() with pci_reset_bus().

Compared to the old implementation, new code will fail immmediately with
-EAGAIN if object lock cannot be obtained.

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/infiniband/hw/hfi1/pcie.c |  2 +-
 drivers/pci/pci.c                 | 16 ++++++++--------
 drivers/vfio/pci/vfio_pci.c       |  4 ++--
 include/linux/pci.h               |  2 +-
 4 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
index df4f2d390be8..baf7c324f7b8 100644
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -905,7 +905,7 @@ static int trigger_sbr(struct hfi1_devdata *dd)
 	 * delay after a reset is required.  Per spec requirements,
 	 * the link is either working or not after that point.
 	 */
-	return pci_try_reset_bus(dev);
+	return pci_reset_bus(dev);
 }
 
 /*
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8ee9f386c1ee..d123c2b173da 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4784,7 +4784,7 @@ int pci_probe_reset_slot(struct pci_slot *slot)
 EXPORT_SYMBOL_GPL(pci_probe_reset_slot);
 
 /**
- * __pci_try_reset_slot - Try to reset a PCI slot
+ * __pci_reset_slot - Try to reset a PCI slot
  * @slot: PCI slot to reset
  *
  * A PCI bus may host multiple slots, each slot may support a reset mechanism
@@ -4798,7 +4798,7 @@ EXPORT_SYMBOL_GPL(pci_probe_reset_slot);
  *
  * Same as above except return -EAGAIN if the slot cannot be locked
  */
-static int __pci_try_reset_slot(struct pci_slot *slot)
+static int __pci_reset_slot(struct pci_slot *slot)
 {
 	int rc;
 
@@ -4854,12 +4854,12 @@ int pci_probe_reset_bus(struct pci_bus *bus)
 EXPORT_SYMBOL_GPL(pci_probe_reset_bus);
 
 /**
- * __pci_try_reset_bus - Try to reset a PCI bus
+ * __pci_reset_bus - Try to reset a PCI bus
  * @bus: top level PCI bus to reset
  *
  * Same as above except return -EAGAIN if the bus cannot be locked
  */
-static int __pci_try_reset_bus(struct pci_bus *bus)
+static int __pci_reset_bus(struct pci_bus *bus)
 {
 	int rc;
 
@@ -4882,17 +4882,17 @@ static int __pci_try_reset_bus(struct pci_bus *bus)
 }
 
 /**
- * pci_try_reset_bus - Try to reset a PCI bus
+ * pci_reset_bus - Try to reset a PCI bus
  * @pdev: top level PCI device to reset via slot/bus
  *
  * Same as above except return -EAGAIN if the bus cannot be locked
  */
-int pci_try_reset_bus(struct pci_dev *pdev)
+int pci_reset_bus(struct pci_dev *pdev)
 {
 	return pci_probe_reset_slot(pdev->slot) ?
-	    __pci_try_reset_slot(pdev->slot) : __pci_try_reset_bus(pdev->bus);
+	    __pci_reset_slot(pdev->slot) : __pci_reset_bus(pdev->bus);
 }
-EXPORT_SYMBOL_GPL(pci_try_reset_bus);
+EXPORT_SYMBOL_GPL(pci_reset_bus);
 
 /**
  * pcix_get_max_mmrbc - get PCI-X maximum designed memory read byte count
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 71018ec9065b..345c0dc8a6dc 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1010,7 +1010,7 @@ reset_info_exit:
 						    &info, slot);
 		if (!ret)
 			/* User has access, do the reset */
-			ret = pci_try_reset_bus(vdev->pdev);
+			ret = pci_reset_bus(vdev->pdev);
 
 hot_reset_release:
 		for (i--; i >= 0; i--)
@@ -1372,7 +1372,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
 	}
 
 	if (needs_reset)
-		ret = pci_try_reset_bus(vdev->pdev);
+		ret = pci_reset_bus(vdev->pdev);
 
 put_devs:
 	for (i = 0; i < devs.cur_index; i++) {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index cf53ecb50789..307b336496f6 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1097,7 +1097,7 @@ int pci_reset_function_locked(struct pci_dev *dev);
 int pci_try_reset_function(struct pci_dev *dev);
 int pci_probe_reset_slot(struct pci_slot *slot);
 int pci_probe_reset_bus(struct pci_bus *bus);
-int pci_try_reset_bus(struct pci_dev *dev);
+int pci_reset_bus(struct pci_dev *dev);
 void pci_reset_secondary_bus(struct pci_dev *dev);
 void pcibios_reset_secondary_bus(struct pci_dev *dev);
 void pci_update_resource(struct pci_dev *dev, int resno);
-- 
cgit v1.2.3


From b976690f5db26fbc7c2be413bfa0fbd270547a94 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Wed, 18 Jul 2018 11:41:06 +0200
Subject: x86/mm/pti: Introduce pti_finalize()

Introduce a new function to finalize the kernel mappings for the userspace
page-table after all ro/nx protections have been applied to the kernel
mappings.

Also move the call to pti_clone_kernel_text() to that function so that it
will run on 32 bit kernels too.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Pavel Machek <pavel@ucw.cz>
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: linux-mm@kvack.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aliguori@amazon.com
Cc: daniel.gruss@iaik.tugraz.at
Cc: hughd@google.com
Cc: keescook@google.com
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Waiman Long <llong@redhat.com>
Cc: "David H . Gutteridge" <dhgutteridge@sympatico.ca>
Cc: joro@8bytes.org
Link: https://lkml.kernel.org/r/1531906876-13451-30-git-send-email-joro@8bytes.org
---
 arch/x86/include/asm/pti.h |  3 +--
 arch/x86/mm/init_64.c      |  6 ------
 arch/x86/mm/pti.c          | 14 +++++++++++++-
 include/linux/pti.h        |  1 +
 init/main.c                |  7 +++++++
 5 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
index 38a17f1d5c9d..5df09a0b80b8 100644
--- a/arch/x86/include/asm/pti.h
+++ b/arch/x86/include/asm/pti.h
@@ -6,10 +6,9 @@
 #ifdef CONFIG_PAGE_TABLE_ISOLATION
 extern void pti_init(void);
 extern void pti_check_boottime_disable(void);
-extern void pti_clone_kernel_text(void);
+extern void pti_finalize(void);
 #else
 static inline void pti_check_boottime_disable(void) { }
-static inline void pti_clone_kernel_text(void) { }
 #endif
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a688617c727e..9b19f9a8948e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1291,12 +1291,6 @@ void mark_rodata_ro(void)
 			(unsigned long) __va(__pa_symbol(_sdata)));
 
 	debug_checkwx();
-
-	/*
-	 * Do this after all of the manipulation of the
-	 * kernel text page tables are complete.
-	 */
-	pti_clone_kernel_text();
 }
 
 int kern_addr_valid(unsigned long addr)
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index e41ee93c430d..fcfb815d420f 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -462,7 +462,7 @@ static inline bool pti_kernel_image_global_ok(void)
  * For some configurations, map all of kernel text into the user page
  * tables.  This reduces TLB misses, especially on non-PCID systems.
  */
-void pti_clone_kernel_text(void)
+static void pti_clone_kernel_text(void)
 {
 	/*
 	 * rodata is part of the kernel image and is normally
@@ -526,3 +526,15 @@ void __init pti_init(void)
 	pti_setup_espfix64();
 	pti_setup_vsyscall();
 }
+
+/*
+ * Finalize the kernel mappings in the userspace page-table.
+ */
+void pti_finalize(void)
+{
+	/*
+	 * Do this after all of the manipulation of the
+	 * kernel text page tables are complete.
+	 */
+	pti_clone_kernel_text();
+}
diff --git a/include/linux/pti.h b/include/linux/pti.h
index 0174883a935a..1a941efcaa62 100644
--- a/include/linux/pti.h
+++ b/include/linux/pti.h
@@ -6,6 +6,7 @@
 #include <asm/pti.h>
 #else
 static inline void pti_init(void) { }
+static inline void pti_finalize(void) { }
 #endif
 
 #endif
diff --git a/init/main.c b/init/main.c
index 3b4ada11ed52..fcfef46c935a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1065,6 +1065,13 @@ static int __ref kernel_init(void *unused)
 	jump_label_invalidate_initmem();
 	free_initmem();
 	mark_readonly();
+
+	/*
+	 * Kernel mappings are now finalized - update the userspace page-table
+	 * to finalize PTI.
+	 */
+	pti_finalize();
+
 	system_state = SYSTEM_RUNNING;
 	numa_default_policy();
 
-- 
cgit v1.2.3


From 985e695074d35768cb04d65f58bca45f7bf1a99d Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Fri, 13 Jul 2018 14:06:42 +0200
Subject: timekeeping/ntp: Constify some function arguments

Add 'const' to some function arguments and variables to make it easier
to read the code.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
[jstultz: Also fixup pre-existing checkpatch warnings for
 prototype arguments with no variable name]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeping.h        |  2 +-
 kernel/time/ntp.c                  |  6 +++---
 kernel/time/ntp_internal.h         |  4 ++--
 kernel/time/timekeeping.c          | 29 +++++++++++++++--------------
 kernel/time/timekeeping_debug.c    |  2 +-
 kernel/time/timekeeping_internal.h |  2 +-
 6 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 86bc2026efce..edace6b656e9 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -177,7 +177,7 @@ static inline time64_t ktime_get_clocktai_seconds(void)
 extern bool timekeeping_rtc_skipsuspend(void);
 extern bool timekeeping_rtc_skipresume(void);
 
-extern void timekeeping_inject_sleeptime64(struct timespec64 *delta);
+extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta);
 
 /*
  * struct system_time_snapshot - simultaneous raw/real time capture with
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index a627cae8baab..c5e0cba3b39c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -642,7 +642,7 @@ void ntp_notify_cmos_timer(void)
 /*
  * Propagate a new txc->status value into the NTP state:
  */
-static inline void process_adj_status(struct timex *txc)
+static inline void process_adj_status(const struct timex *txc)
 {
 	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
 		time_state = TIME_OK;
@@ -665,7 +665,7 @@ static inline void process_adj_status(struct timex *txc)
 }
 
 
-static inline void process_adjtimex_modes(struct timex *txc, s32 *time_tai)
+static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai)
 {
 	if (txc->modes & ADJ_STATUS)
 		process_adj_status(txc);
@@ -716,7 +716,7 @@ static inline void process_adjtimex_modes(struct timex *txc, s32 *time_tai)
  * adjtimex mainly allows reading (and writing, if superuser) of
  * kernel time-keeping variables. used by xntpd.
  */
-int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
+int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai)
 {
 	int result;
 
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 909bd1f1bfb1..c24b0e13f011 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -8,6 +8,6 @@ extern void ntp_clear(void);
 extern u64 ntp_tick_length(void);
 extern ktime_t ntp_get_next_leap(void);
 extern int second_overflow(time64_t secs);
-extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
-extern void __hardpps(const struct timespec64 *, const struct timespec64 *);
+extern int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai);
+extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);
 #endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ad779c2ec53a..7033ac1fee65 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -105,7 +105,7 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
 	}
 }
 
-static inline struct timespec64 tk_xtime(struct timekeeper *tk)
+static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
 {
 	struct timespec64 ts;
 
@@ -162,7 +162,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
  * a read of the fast-timekeeper tkrs (which is protected by its own locking
  * and update logic).
  */
-static inline u64 tk_clock_read(struct tk_read_base *tkr)
+static inline u64 tk_clock_read(const struct tk_read_base *tkr)
 {
 	struct clocksource *clock = READ_ONCE(tkr->clock);
 
@@ -211,7 +211,7 @@ static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
 	}
 }
 
-static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
+static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	u64 now, last, mask, max, delta;
@@ -255,7 +255,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
 static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
 {
 }
-static inline u64 timekeeping_get_delta(struct tk_read_base *tkr)
+static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
 {
 	u64 cycle_now, delta;
 
@@ -352,7 +352,7 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
 static inline u32 arch_gettimeoffset(void) { return 0; }
 #endif
 
-static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, u64 delta)
+static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta)
 {
 	u64 nsec;
 
@@ -363,7 +363,7 @@ static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, u64 delta)
 	return nsec + arch_gettimeoffset();
 }
 
-static inline u64 timekeeping_get_ns(struct tk_read_base *tkr)
+static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
 {
 	u64 delta;
 
@@ -371,7 +371,7 @@ static inline u64 timekeeping_get_ns(struct tk_read_base *tkr)
 	return timekeeping_delta_to_ns(tkr, delta);
 }
 
-static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, u64 cycles)
+static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
 {
 	u64 delta;
 
@@ -394,7 +394,8 @@ static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, u64 cycles)
  * slightly wrong timestamp (a few nanoseconds). See
  * @ktime_get_mono_fast_ns.
  */
-static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf)
+static void update_fast_timekeeper(const struct tk_read_base *tkr,
+				   struct tk_fast *tkf)
 {
 	struct tk_read_base *base = tkf->base;
 
@@ -549,10 +550,10 @@ EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
  * number of cycles every time until timekeeping is resumed at which time the
  * proper readout base for the fast timekeeper will be restored automatically.
  */
-static void halt_fast_timekeeper(struct timekeeper *tk)
+static void halt_fast_timekeeper(const struct timekeeper *tk)
 {
 	static struct tk_read_base tkr_dummy;
-	struct tk_read_base *tkr = &tk->tkr_mono;
+	const struct tk_read_base *tkr = &tk->tkr_mono;
 
 	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
 	cycles_at_suspend = tk_clock_read(tkr);
@@ -1277,7 +1278,7 @@ EXPORT_SYMBOL(do_settimeofday64);
  *
  * Adds or subtracts an offset value from the current time.
  */
-static int timekeeping_inject_offset(struct timespec64 *ts)
+static int timekeeping_inject_offset(const struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long flags;
@@ -1585,7 +1586,7 @@ static struct timespec64 timekeeping_suspend_time;
  * adds the sleep offset to the timekeeping variables.
  */
 static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
-					   struct timespec64 *delta)
+					   const struct timespec64 *delta)
 {
 	if (!timespec64_valid_strict(delta)) {
 		printk_deferred(KERN_WARNING
@@ -1646,7 +1647,7 @@ bool timekeeping_rtc_skipsuspend(void)
  * This function should only be called by rtc_resume(), and allows
  * a suspend offset to be injected into the timekeeping values.
  */
-void timekeeping_inject_sleeptime64(struct timespec64 *delta)
+void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long flags;
@@ -2240,7 +2241,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
 /**
  * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
  */
-static int timekeeping_validate_timex(struct timex *txc)
+static int timekeeping_validate_timex(const struct timex *txc)
 {
 	if (txc->modes & ADJ_ADJTIME) {
 		/* singleshot must not be used with any other mode bits */
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 0754cadfa9e6..238e4be60229 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -70,7 +70,7 @@ static int __init tk_debug_sleep_time_init(void)
 }
 late_initcall(tk_debug_sleep_time_init);
 
-void tk_debug_account_sleep_time(struct timespec64 *t)
+void tk_debug_account_sleep_time(const struct timespec64 *t)
 {
 	/* Cap bin index so we don't overflow the array */
 	int bin = min(fls(t->tv_sec), NUM_BINS-1);
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index cf5c0828ee31..bcbb52db2256 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -8,7 +8,7 @@
 #include <linux/time.h>
 
 #ifdef CONFIG_DEBUG_FS
-extern void tk_debug_account_sleep_time(struct timespec64 *t);
+extern void tk_debug_account_sleep_time(const struct timespec64 *t);
 #else
 #define tk_debug_account_sleep_time(x)
 #endif
-- 
cgit v1.2.3


From 39232ed5a1793f67b11430c43ed8a9ed6e96c6eb Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Tue, 17 Jul 2018 15:55:16 +0800
Subject: time: Introduce one suspend clocksource to compensate the suspend
 time

On some hardware with multiple clocksources, we have coarse grained
clocksources that support the CLOCK_SOURCE_SUSPEND_NONSTOP flag, but
which are less than ideal for timekeeping whereas other clocksources
can be better candidates but halt on suspend.

Currently, the timekeeping core only supports timing suspend using
CLOCK_SOURCE_SUSPEND_NONSTOP clocksources if that clocksource is the
current clocksource for timekeeping.

As a result, some architectures try to implement read_persistent_clock64()
using those non-stop clocksources, but isn't really ideal, which will
introduce more duplicate code. To fix this, provide logic to allow a
registered SUSPEND_NONSTOP clocksource, which isn't the current
clocksource, to be used to calculate the suspend time.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Stephen Boyd <sboyd@kernel.org>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
[jstultz: minor tweaks to merge with previous resume changes]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/clocksource.h |   3 +
 kernel/time/clocksource.c   | 149 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/time/timekeeping.c   |  22 ++++---
 3 files changed, 166 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 7dff1963c185..308918928767 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -194,6 +194,9 @@ extern void clocksource_suspend(void);
 extern void clocksource_resume(void);
 extern struct clocksource * __init clocksource_default_clock(void);
 extern void clocksource_mark_unstable(struct clocksource *cs);
+extern void
+clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles);
+extern u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 now);
 
 extern u64
 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cycles);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f89a78e2792b..f74fb00d8064 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -94,6 +94,8 @@ EXPORT_SYMBOL_GPL(clocks_calc_mult_shift);
 /*[Clocksource internal variables]---------
  * curr_clocksource:
  *	currently selected clocksource.
+ * suspend_clocksource:
+ *	used to calculate the suspend time.
  * clocksource_list:
  *	linked list with the registered clocksources
  * clocksource_mutex:
@@ -102,10 +104,12 @@ EXPORT_SYMBOL_GPL(clocks_calc_mult_shift);
  *	Name of the user-specified clocksource.
  */
 static struct clocksource *curr_clocksource;
+static struct clocksource *suspend_clocksource;
 static LIST_HEAD(clocksource_list);
 static DEFINE_MUTEX(clocksource_mutex);
 static char override_name[CS_NAME_LEN];
 static int finished_booting;
+static u64 suspend_start;
 
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 static void clocksource_watchdog_work(struct work_struct *work);
@@ -447,6 +451,140 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags) { }
 
 #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
 
+static bool clocksource_is_suspend(struct clocksource *cs)
+{
+	return cs == suspend_clocksource;
+}
+
+static void __clocksource_suspend_select(struct clocksource *cs)
+{
+	/*
+	 * Skip the clocksource which will be stopped in suspend state.
+	 */
+	if (!(cs->flags & CLOCK_SOURCE_SUSPEND_NONSTOP))
+		return;
+
+	/*
+	 * The nonstop clocksource can be selected as the suspend clocksource to
+	 * calculate the suspend time, so it should not supply suspend/resume
+	 * interfaces to suspend the nonstop clocksource when system suspends.
+	 */
+	if (cs->suspend || cs->resume) {
+		pr_warn("Nonstop clocksource %s should not supply suspend/resume interfaces\n",
+			cs->name);
+	}
+
+	/* Pick the best rating. */
+	if (!suspend_clocksource || cs->rating > suspend_clocksource->rating)
+		suspend_clocksource = cs;
+}
+
+/**
+ * clocksource_suspend_select - Select the best clocksource for suspend timing
+ * @fallback:	if select a fallback clocksource
+ */
+static void clocksource_suspend_select(bool fallback)
+{
+	struct clocksource *cs, *old_suspend;
+
+	old_suspend = suspend_clocksource;
+	if (fallback)
+		suspend_clocksource = NULL;
+
+	list_for_each_entry(cs, &clocksource_list, list) {
+		/* Skip current if we were requested for a fallback. */
+		if (fallback && cs == old_suspend)
+			continue;
+
+		__clocksource_suspend_select(cs);
+	}
+}
+
+/**
+ * clocksource_start_suspend_timing - Start measuring the suspend timing
+ * @cs:			current clocksource from timekeeping
+ * @start_cycles:	current cycles from timekeeping
+ *
+ * This function will save the start cycle values of suspend timer to calculate
+ * the suspend time when resuming system.
+ *
+ * This function is called late in the suspend process from timekeeping_suspend(),
+ * that means processes are freezed, non-boot cpus and interrupts are disabled
+ * now. It is therefore possible to start the suspend timer without taking the
+ * clocksource mutex.
+ */
+void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles)
+{
+	if (!suspend_clocksource)
+		return;
+
+	/*
+	 * If current clocksource is the suspend timer, we should use the
+	 * tkr_mono.cycle_last value as suspend_start to avoid same reading
+	 * from suspend timer.
+	 */
+	if (clocksource_is_suspend(cs)) {
+		suspend_start = start_cycles;
+		return;
+	}
+
+	if (suspend_clocksource->enable &&
+	    suspend_clocksource->enable(suspend_clocksource)) {
+		pr_warn_once("Failed to enable the non-suspend-able clocksource.\n");
+		return;
+	}
+
+	suspend_start = suspend_clocksource->read(suspend_clocksource);
+}
+
+/**
+ * clocksource_stop_suspend_timing - Stop measuring the suspend timing
+ * @cs:		current clocksource from timekeeping
+ * @cycle_now:	current cycles from timekeeping
+ *
+ * This function will calculate the suspend time from suspend timer.
+ *
+ * Returns nanoseconds since suspend started, 0 if no usable suspend clocksource.
+ *
+ * This function is called early in the resume process from timekeeping_resume(),
+ * that means there is only one cpu, no processes are running and the interrupts
+ * are disabled. It is therefore possible to stop the suspend timer without
+ * taking the clocksource mutex.
+ */
+u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now)
+{
+	u64 now, delta, nsec = 0;
+
+	if (!suspend_clocksource)
+		return 0;
+
+	/*
+	 * If current clocksource is the suspend timer, we should use the
+	 * tkr_mono.cycle_last value from timekeeping as current cycle to
+	 * avoid same reading from suspend timer.
+	 */
+	if (clocksource_is_suspend(cs))
+		now = cycle_now;
+	else
+		now = suspend_clocksource->read(suspend_clocksource);
+
+	if (now > suspend_start) {
+		delta = clocksource_delta(now, suspend_start,
+					  suspend_clocksource->mask);
+		nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult,
+				       suspend_clocksource->shift);
+	}
+
+	/*
+	 * Disable the suspend timer to save power if current clocksource is
+	 * not the suspend timer.
+	 */
+	if (!clocksource_is_suspend(cs) && suspend_clocksource->disable)
+		suspend_clocksource->disable(suspend_clocksource);
+
+	return nsec;
+}
+
 /**
  * clocksource_suspend - suspend the clocksource(s)
  */
@@ -792,6 +930,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 
 	clocksource_select();
 	clocksource_select_watchdog(false);
+	__clocksource_suspend_select(cs);
 	mutex_unlock(&clocksource_mutex);
 	return 0;
 }
@@ -820,6 +959,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
 
 	clocksource_select();
 	clocksource_select_watchdog(false);
+	clocksource_suspend_select(false);
 	mutex_unlock(&clocksource_mutex);
 }
 EXPORT_SYMBOL(clocksource_change_rating);
@@ -845,6 +985,15 @@ static int clocksource_unbind(struct clocksource *cs)
 			return -EBUSY;
 	}
 
+	if (clocksource_is_suspend(cs)) {
+		/*
+		 * Select and try to install a replacement suspend clocksource.
+		 * If no replacement suspend clocksource, we will just let the
+		 * clocksource go and have no suspend clocksource.
+		 */
+		clocksource_suspend_select(true);
+	}
+
 	clocksource_watchdog_lock(&flags);
 	clocksource_dequeue_watchdog(cs);
 	list_del_init(&cs->list);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 19414b16cf9e..d9e659a12c76 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1692,7 +1692,7 @@ void timekeeping_resume(void)
 	struct clocksource *clock = tk->tkr_mono.clock;
 	unsigned long flags;
 	struct timespec64 ts_new, ts_delta;
-	u64 cycle_now;
+	u64 cycle_now, nsec;
 	bool inject_sleeptime = false;
 
 	read_persistent_clock64(&ts_new);
@@ -1716,13 +1716,8 @@ void timekeeping_resume(void)
 	 * usable source. The rtc part is handled separately in rtc core code.
 	 */
 	cycle_now = tk_clock_read(&tk->tkr_mono);
-	if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
-		cycle_now > tk->tkr_mono.cycle_last) {
-		u64 nsec, cyc_delta;
-
-		cyc_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
-					      tk->tkr_mono.mask);
-		nsec = mul_u64_u32_shr(cyc_delta, clock->mult, clock->shift);
+	nsec = clocksource_stop_suspend_timing(clock, cycle_now);
+	if (nsec > 0) {
 		ts_delta = ns_to_timespec64(nsec);
 		inject_sleeptime = true;
 	} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
@@ -1757,6 +1752,8 @@ int timekeeping_suspend(void)
 	unsigned long flags;
 	struct timespec64		delta, delta_delta;
 	static struct timespec64	old_delta;
+	struct clocksource *curr_clock;
+	u64 cycle_now;
 
 	read_persistent_clock64(&timekeeping_suspend_time);
 
@@ -1775,6 +1772,15 @@ int timekeeping_suspend(void)
 	timekeeping_forward_now(tk);
 	timekeeping_suspended = 1;
 
+	/*
+	 * Since we've called forward_now, cycle_last stores the value
+	 * just read from the current clocksource. Save this to potentially
+	 * use in suspend timing.
+	 */
+	curr_clock = tk->tkr_mono.clock;
+	cycle_now = tk->tkr_mono.cycle_last;
+	clocksource_start_suspend_timing(curr_clock, cycle_now);
+
 	if (persistent_clock_exists) {
 		/*
 		 * To avoid drift caused by repeated suspend/resumes,
-- 
cgit v1.2.3


From b51dab46c6adfbb7e80cd0f59ae17b8a30d94b1a Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Wed, 18 Jul 2018 06:27:22 -0700
Subject: qed: Add qed APIs for PHY module query.

This patch adds qed APIs for reading the PHY module.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_hsi.h  | 16 ++++++++++
 drivers/net/ethernet/qlogic/qed/qed_main.c | 23 ++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.c  | 49 ++++++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h  | 16 ++++++++++
 include/linux/qed/qed_if.h                 | 15 +++++++++
 5 files changed, 119 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index bee10c1781fb..8faceb691657 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -12444,6 +12444,8 @@ struct public_drv_mb {
 #define DRV_MSG_CODE_STATS_TYPE_ISCSI           3
 #define DRV_MSG_CODE_STATS_TYPE_RDMA            4
 
+#define DRV_MSG_CODE_TRANSCEIVER_READ           0x00160000
+
 #define DRV_MSG_CODE_MASK_PARITIES              0x001a0000
 
 #define DRV_MSG_CODE_BIST_TEST			0x001e0000
@@ -12543,6 +12545,15 @@ struct public_drv_mb {
 #define DRV_MB_PARAM_SET_LED_MODE_ON		0x1
 #define DRV_MB_PARAM_SET_LED_MODE_OFF		0x2
 
+#define DRV_MB_PARAM_TRANSCEIVER_PORT_OFFSET		0
+#define DRV_MB_PARAM_TRANSCEIVER_PORT_MASK		0x00000003
+#define DRV_MB_PARAM_TRANSCEIVER_SIZE_OFFSET		2
+#define DRV_MB_PARAM_TRANSCEIVER_SIZE_MASK		0x000000FC
+#define DRV_MB_PARAM_TRANSCEIVER_I2C_ADDRESS_OFFSET	8
+#define DRV_MB_PARAM_TRANSCEIVER_I2C_ADDRESS_MASK	0x0000FF00
+#define DRV_MB_PARAM_TRANSCEIVER_OFFSET_OFFSET		16
+#define DRV_MB_PARAM_TRANSCEIVER_OFFSET_MASK		0xFFFF0000
+
 	/* Resource Allocation params - Driver version support */
 #define DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_MASK	0xFFFF0000
 #define DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_SHIFT	16
@@ -12596,6 +12607,9 @@ struct public_drv_mb {
 #define FW_MSG_CODE_PHY_OK			0x00110000
 #define FW_MSG_CODE_OK				0x00160000
 #define FW_MSG_CODE_ERROR			0x00170000
+#define FW_MSG_CODE_TRANSCEIVER_DIAG_OK		0x00160000
+#define FW_MSG_CODE_TRANSCEIVER_DIAG_ERROR	0x00170000
+#define FW_MSG_CODE_TRANSCEIVER_NOT_PRESENT	0x00020000
 
 #define FW_MSG_CODE_OS_WOL_SUPPORTED            0x00800000
 #define FW_MSG_CODE_OS_WOL_NOT_SUPPORTED        0x00810000
@@ -12687,6 +12701,8 @@ struct mcp_public_data {
 	struct public_func func[MCP_GLOB_FUNC_MAX];
 };
 
+#define MAX_I2C_TRANSACTION_SIZE	16
+
 /* OCBB definitions */
 enum tlvs {
 	/* Category 1: Device Properties */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 0cbc74d6ca8b..158944aa6097 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -2102,6 +2102,28 @@ out:
 	return status;
 }
 
+static int qed_read_module_eeprom(struct qed_dev *cdev, char *buf,
+				  u8 dev_addr, u32 offset, u32 len)
+{
+	struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *ptt;
+	int rc = 0;
+
+	if (IS_VF(cdev))
+		return 0;
+
+	ptt = qed_ptt_acquire(hwfn);
+	if (!ptt)
+		return -EAGAIN;
+
+	rc = qed_mcp_phy_sfp_read(hwfn, ptt, MFW_PORT(hwfn), dev_addr,
+				  offset, len, buf);
+
+	qed_ptt_release(hwfn, ptt);
+
+	return rc;
+}
+
 static struct qed_selftest_ops qed_selftest_ops_pass = {
 	.selftest_memory = &qed_selftest_memory,
 	.selftest_interrupt = &qed_selftest_interrupt,
@@ -2144,6 +2166,7 @@ const struct qed_common_ops qed_common_ops_pass = {
 	.update_mac = &qed_update_mac,
 	.update_mtu = &qed_update_mtu,
 	.update_wol = &qed_update_wol,
+	.read_module_eeprom = &qed_read_module_eeprom,
 };
 
 void qed_get_protocol_stats(struct qed_dev *cdev,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 4e0b443c9519..62a220fce6e1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -2463,6 +2463,55 @@ out:
 	return rc;
 }
 
+int qed_mcp_phy_sfp_read(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
+			 u32 port, u32 addr, u32 offset, u32 len, u8 *p_buf)
+{
+	u32 bytes_left, bytes_to_copy, buf_size, nvm_offset = 0;
+	u32 resp, param;
+	int rc;
+
+	nvm_offset |= (port << DRV_MB_PARAM_TRANSCEIVER_PORT_OFFSET) &
+		       DRV_MB_PARAM_TRANSCEIVER_PORT_MASK;
+	nvm_offset |= (addr << DRV_MB_PARAM_TRANSCEIVER_I2C_ADDRESS_OFFSET) &
+		       DRV_MB_PARAM_TRANSCEIVER_I2C_ADDRESS_MASK;
+
+	addr = offset;
+	offset = 0;
+	bytes_left = len;
+	while (bytes_left > 0) {
+		bytes_to_copy = min_t(u32, bytes_left,
+				      MAX_I2C_TRANSACTION_SIZE);
+		nvm_offset &= (DRV_MB_PARAM_TRANSCEIVER_I2C_ADDRESS_MASK |
+			       DRV_MB_PARAM_TRANSCEIVER_PORT_MASK);
+		nvm_offset |= ((addr + offset) <<
+			       DRV_MB_PARAM_TRANSCEIVER_OFFSET_OFFSET) &
+			       DRV_MB_PARAM_TRANSCEIVER_OFFSET_MASK;
+		nvm_offset |= (bytes_to_copy <<
+			       DRV_MB_PARAM_TRANSCEIVER_SIZE_OFFSET) &
+			       DRV_MB_PARAM_TRANSCEIVER_SIZE_MASK;
+		rc = qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt,
+					DRV_MSG_CODE_TRANSCEIVER_READ,
+					nvm_offset, &resp, &param, &buf_size,
+					(u32 *)(p_buf + offset));
+		if (rc) {
+			DP_NOTICE(p_hwfn,
+				  "Failed to send a transceiver read command to the MFW. rc = %d.\n",
+				  rc);
+			return rc;
+		}
+
+		if (resp == FW_MSG_CODE_TRANSCEIVER_NOT_PRESENT)
+			return -ENODEV;
+		else if (resp != FW_MSG_CODE_TRANSCEIVER_DIAG_OK)
+			return -EINVAL;
+
+		offset += buf_size;
+		bytes_left -= buf_size;
+	}
+
+	return 0;
+}
+
 int qed_mcp_bist_register_test(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 {
 	u32 drv_mb_param = 0, rsp, param;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 632a838f1fe3..047976d5c6e9 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -839,6 +839,22 @@ int qed_mcp_nvm_rd_cmd(struct qed_hwfn *p_hwfn,
 		       u32 *o_mcp_resp,
 		       u32 *o_mcp_param, u32 *o_txn_size, u32 *o_buf);
 
+/**
+ * @brief Read from sfp
+ *
+ *  @param p_hwfn - hw function
+ *  @param p_ptt  - PTT required for register access
+ *  @param port   - transceiver port
+ *  @param addr   - I2C address
+ *  @param offset - offset in sfp
+ *  @param len    - buffer length
+ *  @param p_buf  - buffer to read into
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_phy_sfp_read(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
+			 u32 port, u32 addr, u32 offset, u32 len, u8 *p_buf);
+
 /**
  * @brief indicates whether the MFW objects [under mcp_info] are accessible
  *
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index b4040023cbfb..8cd34645e892 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -759,6 +759,9 @@ struct qed_generic_tlvs {
 	u8 mac[QED_TLV_MAC_COUNT][ETH_ALEN];
 };
 
+#define QED_I2C_DEV_ADDR_A0 0xA0
+#define QED_I2C_DEV_ADDR_A2 0xA2
+
 #define QED_NVM_SIGNATURE 0x12435687
 
 enum qed_nvm_flash_cmd {
@@ -1026,6 +1029,18 @@ struct qed_common_ops {
  * @param enabled - true iff WoL should be enabled.
  */
 	int (*update_wol) (struct qed_dev *cdev, bool enabled);
+
+/**
+ * @brief read_module_eeprom
+ *
+ * @param cdev
+ * @param buf - buffer
+ * @param dev_addr - PHY device memory region
+ * @param offset - offset into eeprom contents to be read
+ * @param len - buffer length, i.e., max bytes to be read
+ */
+	int (*read_module_eeprom)(struct qed_dev *cdev,
+				  char *buf, u8 dev_addr, u32 offset, u32 len);
 };
 
 #define MASK_FIELD(_name, _value) \
-- 
cgit v1.2.3


From eef5ba1aa148ca5e6deb1e0aa1de797fa4e12cb7 Mon Sep 17 00:00:00 2001
From: Peter Rosin <peda@axentia.se>
Date: Wed, 20 Jun 2018 10:51:53 +0200
Subject: i2c: smbus: add unlocked __i2c_smbus_xfer variant

Removes all locking from i2c_smbus_xfer and renames it to __i2c_smbus_xfer,
then adds a new i2c_smbus_xfer function that simply grabs the lock while
calling the unlocked variant.

This is not perfectly equivalent, since i2c_smbus_xfer was callable from
atomic/irq context if you happened to end up emulating SMBus with an I2C
transfer, and that is no longer the case with this patch. It is unknown
(to me) if anything depends on that quirk, but it seems fragile enough to
simply break those cases and require them to call i2c_transfer directly
instead.

While at it, for consistency rename the 2nd to last argument (size) of
the i2c_smbus_xfer declaration to protocol and remove the surplus extern
marker.

Signed-off-by: Peter Rosin <peda@axentia.se>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 drivers/i2c/i2c-core-smbus.c | 28 ++++++++++++++++++++--------
 include/linux/i2c.h          | 11 ++++++++---
 2 files changed, 28 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-smbus.c b/drivers/i2c/i2c-core-smbus.c
index 51970bae3c4a..9cd66cabb84f 100644
--- a/drivers/i2c/i2c-core-smbus.c
+++ b/drivers/i2c/i2c-core-smbus.c
@@ -463,7 +463,7 @@ static s32 i2c_smbus_xfer_emulated(struct i2c_adapter *adapter, u16 addr,
 			msg[num-1].len++;
 	}
 
-	status = i2c_transfer(adapter, msg, num);
+	status = __i2c_transfer(adapter, msg, num);
 	if (status < 0)
 		goto cleanup;
 	if (status != num) {
@@ -524,9 +524,24 @@ cleanup:
  * This executes an SMBus protocol operation, and returns a negative
  * errno code else zero on success.
  */
-s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
-		   char read_write, u8 command, int protocol,
-		   union i2c_smbus_data *data)
+s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr,
+		   unsigned short flags, char read_write,
+		   u8 command, int protocol, union i2c_smbus_data *data)
+{
+	s32 res;
+
+	i2c_lock_bus(adapter, I2C_LOCK_SEGMENT);
+	res = __i2c_smbus_xfer(adapter, addr, flags, read_write,
+			       command, protocol, data);
+	i2c_unlock_bus(adapter, I2C_LOCK_SEGMENT);
+
+	return res;
+}
+EXPORT_SYMBOL(i2c_smbus_xfer);
+
+s32 __i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr,
+		     unsigned short flags, char read_write,
+		     u8 command, int protocol, union i2c_smbus_data *data)
 {
 	unsigned long orig_jiffies;
 	int try;
@@ -543,8 +558,6 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
 	flags &= I2C_M_TEN | I2C_CLIENT_PEC | I2C_CLIENT_SCCB;
 
 	if (adapter->algo->smbus_xfer) {
-		i2c_lock_bus(adapter, I2C_LOCK_SEGMENT);
-
 		/* Retry automatically on arbitration loss */
 		orig_jiffies = jiffies;
 		for (res = 0, try = 0; try <= adapter->retries; try++) {
@@ -557,7 +570,6 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
 				       orig_jiffies + adapter->timeout))
 				break;
 		}
-		i2c_unlock_bus(adapter, I2C_LOCK_SEGMENT);
 
 		if (res != -EOPNOTSUPP || !adapter->algo->master_xfer)
 			goto trace;
@@ -579,7 +591,7 @@ trace:
 
 	return res;
 }
-EXPORT_SYMBOL(i2c_smbus_xfer);
+EXPORT_SYMBOL(__i2c_smbus_xfer);
 
 /**
  * i2c_smbus_read_i2c_block_data_or_emulated - read block or emulate
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 254cd34eeae2..465afb092fa7 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -140,9 +140,14 @@ extern int __i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs,
    and probably just as fast.
    Note that we use i2c_adapter here, because you do not need a specific
    smbus adapter to call this function. */
-extern s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr,
-			  unsigned short flags, char read_write, u8 command,
-			  int size, union i2c_smbus_data *data);
+s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr,
+		   unsigned short flags, char read_write, u8 command,
+		   int protocol, union i2c_smbus_data *data);
+
+/* Unlocked flavor */
+s32 __i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr,
+		     unsigned short flags, char read_write, u8 command,
+		     int protocol, union i2c_smbus_data *data);
 
 /* Now follow the 'nice' access routines. These also document the calling
    conventions of i2c_smbus_xfer. */
-- 
cgit v1.2.3


From 488dee96bb62f0b3d9e678cf42574034d5b033a5 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 20 Jul 2018 21:56:47 +0000
Subject: kernfs: allow creating kernfs objects with arbitrary uid/gid

This change allows creating kernfs files and directories with arbitrary
uid/gid instead of always using GLOBAL_ROOT_UID/GID by extending
kernfs_create_dir_ns() and kernfs_create_file_ns() with uid/gid arguments.
The "simple" kernfs_create_file() and kernfs_create_dir() are left alone
and always create objects belonging to the global root.

When creating symlinks ownership (uid/gid) is taken from the target kernfs
object.

Co-Developed-by: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c |  4 +++-
 fs/kernfs/dir.c                          | 29 ++++++++++++++++++++++++++---
 fs/kernfs/file.c                         |  8 ++++++--
 fs/kernfs/inode.c                        |  2 +-
 fs/kernfs/kernfs-internal.h              |  2 ++
 fs/kernfs/symlink.c                      | 11 ++++++++++-
 fs/sysfs/dir.c                           |  4 +++-
 fs/sysfs/file.c                          |  5 +++--
 include/linux/kernfs.h                   | 28 +++++++++++++++++++---------
 kernel/cgroup/cgroup.c                   |  4 +++-
 10 files changed, 76 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 749856a2e736..9af1a21265d3 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -146,6 +146,7 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
 	int ret;
 
 	kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
+				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
 				  0, rft->kf_ops, rft, NULL, NULL);
 	if (IS_ERR(kn))
 		return PTR_ERR(kn);
@@ -1503,7 +1504,8 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
 	struct kernfs_node *kn;
 	int ret = 0;
 
-	kn = __kernfs_create_file(parent_kn, name, 0444, 0,
+	kn = __kernfs_create_file(parent_kn, name, 0444,
+				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
 				  &kf_mondata_ops, priv, NULL, NULL);
 	if (IS_ERR(kn))
 		return PTR_ERR(kn);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index d66cc0777303..4ca0b5c18192 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -619,6 +619,7 @@ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
 
 static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 					     const char *name, umode_t mode,
+					     kuid_t uid, kgid_t gid,
 					     unsigned flags)
 {
 	struct kernfs_node *kn;
@@ -661,8 +662,22 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 	kn->mode = mode;
 	kn->flags = flags;
 
+	if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) {
+		struct iattr iattr = {
+			.ia_valid = ATTR_UID | ATTR_GID,
+			.ia_uid = uid,
+			.ia_gid = gid,
+		};
+
+		ret = __kernfs_setattr(kn, &iattr);
+		if (ret < 0)
+			goto err_out3;
+	}
+
 	return kn;
 
+ err_out3:
+	idr_remove(&root->ino_idr, kn->id.ino);
  err_out2:
 	kmem_cache_free(kernfs_node_cache, kn);
  err_out1:
@@ -672,11 +687,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 
 struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
 				    const char *name, umode_t mode,
+				    kuid_t uid, kgid_t gid,
 				    unsigned flags)
 {
 	struct kernfs_node *kn;
 
-	kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
+	kn = __kernfs_new_node(kernfs_root(parent),
+			       name, mode, uid, gid, flags);
 	if (kn) {
 		kernfs_get(parent);
 		kn->parent = parent;
@@ -946,6 +963,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
 	root->next_generation = 1;
 
 	kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
+			       GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
 			       KERNFS_DIR);
 	if (!kn) {
 		idr_destroy(&root->ino_idr);
@@ -984,6 +1002,8 @@ void kernfs_destroy_root(struct kernfs_root *root)
  * @parent: parent in which to create a new directory
  * @name: name of the new directory
  * @mode: mode of the new directory
+ * @uid: uid of the new directory
+ * @gid: gid of the new directory
  * @priv: opaque data associated with the new directory
  * @ns: optional namespace tag of the directory
  *
@@ -991,13 +1011,15 @@ void kernfs_destroy_root(struct kernfs_root *root)
  */
 struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
 					 const char *name, umode_t mode,
+					 kuid_t uid, kgid_t gid,
 					 void *priv, const void *ns)
 {
 	struct kernfs_node *kn;
 	int rc;
 
 	/* allocate */
-	kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
+	kn = kernfs_new_node(parent, name, mode | S_IFDIR,
+			     uid, gid, KERNFS_DIR);
 	if (!kn)
 		return ERR_PTR(-ENOMEM);
 
@@ -1028,7 +1050,8 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
 	int rc;
 
 	/* allocate */
-	kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR);
+	kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR,
+			     GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR);
 	if (!kn)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 2015d8c45e4a..dbf5bc250bfd 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -965,6 +965,8 @@ const struct file_operations kernfs_file_fops = {
  * @parent: directory to create the file in
  * @name: name of the file
  * @mode: mode of the file
+ * @uid: uid of the file
+ * @gid: gid of the file
  * @size: size of the file
  * @ops: kernfs operations for the file
  * @priv: private data for the file
@@ -975,7 +977,8 @@ const struct file_operations kernfs_file_fops = {
  */
 struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
 					 const char *name,
-					 umode_t mode, loff_t size,
+					 umode_t mode, kuid_t uid, kgid_t gid,
+					 loff_t size,
 					 const struct kernfs_ops *ops,
 					 void *priv, const void *ns,
 					 struct lock_class_key *key)
@@ -986,7 +989,8 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
 
 	flags = KERNFS_FILE;
 
-	kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
+	kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG,
+			     uid, gid, flags);
 	if (!kn)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3d73fe9d56e2..80cebcd94c90 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -63,7 +63,7 @@ out_unlock:
 	return ret;
 }
 
-static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
+int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 {
 	struct kernfs_iattrs *attrs;
 	struct iattr *iattrs;
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 0f260dcca177..3d83b114bb08 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -90,6 +90,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
 int kernfs_iop_getattr(const struct path *path, struct kstat *stat,
 		       u32 request_mask, unsigned int query_flags);
 ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
+int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
 
 /*
  * dir.c
@@ -104,6 +105,7 @@ void kernfs_put_active(struct kernfs_node *kn);
 int kernfs_add_one(struct kernfs_node *kn);
 struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
 				    const char *name, umode_t mode,
+				    kuid_t uid, kgid_t gid,
 				    unsigned flags);
 struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root,
 						    unsigned int ino);
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 08ccabd7047f..5ffed48f3d0e 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -21,6 +21,7 @@
  * @target: target node for the symlink to point to
  *
  * Returns the created node on success, ERR_PTR() value on error.
+ * Ownership of the link matches ownership of the target.
  */
 struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
 				       const char *name,
@@ -28,8 +29,16 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
 {
 	struct kernfs_node *kn;
 	int error;
+	kuid_t uid = GLOBAL_ROOT_UID;
+	kgid_t gid = GLOBAL_ROOT_GID;
 
-	kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
+	if (target->iattr) {
+		uid = target->iattr->ia_iattr.ia_uid;
+		gid = target->iattr->ia_iattr.ia_gid;
+	}
+
+	kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, uid, gid,
+			     KERNFS_LINK);
 	if (!kn)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 58eba92a0e41..e39b884f0867 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -52,7 +52,9 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 		return -ENOENT;
 
 	kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
-				  S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);
+				  S_IRWXU | S_IRUGO | S_IXUGO,
+				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				  kobj, ns);
 	if (IS_ERR(kn)) {
 		if (PTR_ERR(kn) == -EEXIST)
 			sysfs_warn_dup(parent, kobject_name(kobj));
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 5c13f29bfcdb..513fa691ecbd 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -302,8 +302,9 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
 	if (!attr->ignore_lockdep)
 		key = attr->key ?: (struct lock_class_key *)&attr->skey;
 #endif
-	kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
-				  (void *)attr, ns, key);
+	kn = __kernfs_create_file(parent, attr->name,
+				  mode & 0777, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				  size, ops, (void *)attr, ns, key);
 	if (IS_ERR(kn)) {
 		if (PTR_ERR(kn) == -EEXIST)
 			sysfs_warn_dup(parent, attr->name);
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index ab25c8b6d9e3..814643f7ee52 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -15,6 +15,7 @@
 #include <linux/lockdep.h>
 #include <linux/rbtree.h>
 #include <linux/atomic.h>
+#include <linux/uidgid.h>
 #include <linux/wait.h>
 
 struct file;
@@ -325,12 +326,14 @@ void kernfs_destroy_root(struct kernfs_root *root);
 
 struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
 					 const char *name, umode_t mode,
+					 kuid_t uid, kgid_t gid,
 					 void *priv, const void *ns);
 struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
 					    const char *name);
 struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
-					 const char *name,
-					 umode_t mode, loff_t size,
+					 const char *name, umode_t mode,
+					 kuid_t uid, kgid_t gid,
+					 loff_t size,
 					 const struct kernfs_ops *ops,
 					 void *priv, const void *ns,
 					 struct lock_class_key *key);
@@ -415,12 +418,14 @@ static inline void kernfs_destroy_root(struct kernfs_root *root) { }
 
 static inline struct kernfs_node *
 kernfs_create_dir_ns(struct kernfs_node *parent, const char *name,
-		     umode_t mode, void *priv, const void *ns)
+		     umode_t mode, kuid_t uid, kgid_t gid,
+		     void *priv, const void *ns)
 { return ERR_PTR(-ENOSYS); }
 
 static inline struct kernfs_node *
 __kernfs_create_file(struct kernfs_node *parent, const char *name,
-		     umode_t mode, loff_t size, const struct kernfs_ops *ops,
+		     umode_t mode, kuid_t uid, kgid_t gid,
+		     loff_t size, const struct kernfs_ops *ops,
 		     void *priv, const void *ns, struct lock_class_key *key)
 { return ERR_PTR(-ENOSYS); }
 
@@ -498,12 +503,15 @@ static inline struct kernfs_node *
 kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode,
 		  void *priv)
 {
-	return kernfs_create_dir_ns(parent, name, mode, priv, NULL);
+	return kernfs_create_dir_ns(parent, name, mode,
+				    GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				    priv, NULL);
 }
 
 static inline struct kernfs_node *
 kernfs_create_file_ns(struct kernfs_node *parent, const char *name,
-		      umode_t mode, loff_t size, const struct kernfs_ops *ops,
+		      umode_t mode, kuid_t uid, kgid_t gid,
+		      loff_t size, const struct kernfs_ops *ops,
 		      void *priv, const void *ns)
 {
 	struct lock_class_key *key = NULL;
@@ -511,15 +519,17 @@ kernfs_create_file_ns(struct kernfs_node *parent, const char *name,
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	key = (struct lock_class_key *)&ops->lockdep_key;
 #endif
-	return __kernfs_create_file(parent, name, mode, size, ops, priv, ns,
-				    key);
+	return __kernfs_create_file(parent, name, mode, uid, gid,
+				    size, ops, priv, ns, key);
 }
 
 static inline struct kernfs_node *
 kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode,
 		   loff_t size, const struct kernfs_ops *ops, void *priv)
 {
-	return kernfs_create_file_ns(parent, name, mode, size, ops, priv, NULL);
+	return kernfs_create_file_ns(parent, name, mode,
+				     GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				     size, ops, priv, NULL);
 }
 
 static inline int kernfs_remove_by_name(struct kernfs_node *parent,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 077370bf8964..35cf3d71f8aa 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3557,7 +3557,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
 	key = &cft->lockdep_key;
 #endif
 	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
-				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
+				  cgroup_file_mode(cft),
+				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				  0, cft->kf_ops, cft,
 				  NULL, key);
 	if (IS_ERR(kn))
 		return PTR_ERR(kn);
-- 
cgit v1.2.3


From 5f81880d5204ee2388fd9a75bb850ccd526885b7 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 20 Jul 2018 21:56:48 +0000
Subject: sysfs, kobject: allow creating kobject belonging to arbitrary users

Normally kobjects and their sysfs representation belong to global root,
however it is not necessarily the case for objects in separate namespaces.
For example, objects in separate network namespace logically belong to the
container's root and not global root.

This change lays groundwork for allowing network namespace objects
ownership to be transferred to container's root user by defining
get_ownership() callback in ktype structure and using it in sysfs code to
retrieve desired uid/gid when creating sysfs objects for given kobject.

Co-Developed-by: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/sysfs/dir.c          |  7 +++++--
 fs/sysfs/file.c         | 32 ++++++++++++++++++++------------
 fs/sysfs/group.c        | 23 +++++++++++++++++------
 fs/sysfs/sysfs.h        |  5 ++---
 include/linux/kobject.h |  4 ++++
 lib/kobject.c           | 19 +++++++++++++++++++
 6 files changed, 67 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e39b884f0867..feeae8081c22 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -40,6 +40,8 @@ void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
 int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 {
 	struct kernfs_node *parent, *kn;
+	kuid_t uid;
+	kgid_t gid;
 
 	BUG_ON(!kobj);
 
@@ -51,9 +53,10 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 	if (!parent)
 		return -ENOENT;
 
+	kobject_get_ownership(kobj, &uid, &gid);
+
 	kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
-				  S_IRWXU | S_IRUGO | S_IXUGO,
-				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				  S_IRWXU | S_IRUGO | S_IXUGO, uid, gid,
 				  kobj, ns);
 	if (IS_ERR(kn)) {
 		if (PTR_ERR(kn) == -EEXIST)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 513fa691ecbd..fa46216523cf 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -245,7 +245,7 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = {
 
 int sysfs_add_file_mode_ns(struct kernfs_node *parent,
 			   const struct attribute *attr, bool is_bin,
-			   umode_t mode, const void *ns)
+			   umode_t mode, kuid_t uid, kgid_t gid, const void *ns)
 {
 	struct lock_class_key *key = NULL;
 	const struct kernfs_ops *ops;
@@ -302,8 +302,8 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
 	if (!attr->ignore_lockdep)
 		key = attr->key ?: (struct lock_class_key *)&attr->skey;
 #endif
-	kn = __kernfs_create_file(parent, attr->name,
-				  mode & 0777, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+
+	kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid,
 				  size, ops, (void *)attr, ns, key);
 	if (IS_ERR(kn)) {
 		if (PTR_ERR(kn) == -EEXIST)
@@ -313,12 +313,6 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
 	return 0;
 }
 
-int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr,
-		   bool is_bin)
-{
-	return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL);
-}
-
 /**
  * sysfs_create_file_ns - create an attribute file for an object with custom ns
  * @kobj: object we're creating for
@@ -328,9 +322,14 @@ int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr,
 int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
 			 const void *ns)
 {
+	kuid_t uid;
+	kgid_t gid;
+
 	BUG_ON(!kobj || !kobj->sd || !attr);
 
-	return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);
+	kobject_get_ownership(kobj, &uid, &gid);
+	return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode,
+				      uid, gid, ns);
 
 }
 EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
@@ -359,6 +358,8 @@ int sysfs_add_file_to_group(struct kobject *kobj,
 		const struct attribute *attr, const char *group)
 {
 	struct kernfs_node *parent;
+	kuid_t uid;
+	kgid_t gid;
 	int error;
 
 	if (group) {
@@ -371,7 +372,9 @@ int sysfs_add_file_to_group(struct kobject *kobj,
 	if (!parent)
 		return -ENOENT;
 
-	error = sysfs_add_file(parent, attr, false);
+	kobject_get_ownership(kobj, &uid, &gid);
+	error = sysfs_add_file_mode_ns(kobj->sd, attr, false,
+				       attr->mode, uid, gid, NULL);
 	kernfs_put(parent);
 
 	return error;
@@ -487,9 +490,14 @@ EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
 int sysfs_create_bin_file(struct kobject *kobj,
 			  const struct bin_attribute *attr)
 {
+	kuid_t uid;
+	kgid_t gid;
+
 	BUG_ON(!kobj || !kobj->sd || !attr);
 
-	return sysfs_add_file(kobj->sd, &attr->attr, true);
+	kobject_get_ownership(kobj, &uid, &gid);
+	return sysfs_add_file_mode_ns(kobj->sd, &attr->attr, true,
+				      attr->attr.mode, uid, gid, NULL);
 }
 EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
 
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 4802ec0e1e3a..c7a716c4acc9 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -31,6 +31,7 @@ static void remove_files(struct kernfs_node *parent,
 }
 
 static int create_files(struct kernfs_node *parent, struct kobject *kobj,
+			kuid_t uid, kgid_t gid,
 			const struct attribute_group *grp, int update)
 {
 	struct attribute *const *attr;
@@ -60,7 +61,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 
 			mode &= SYSFS_PREALLOC | 0664;
 			error = sysfs_add_file_mode_ns(parent, *attr, false,
-						       mode, NULL);
+						       mode, uid, gid, NULL);
 			if (unlikely(error))
 				break;
 		}
@@ -90,7 +91,8 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 			mode &= SYSFS_PREALLOC | 0664;
 			error = sysfs_add_file_mode_ns(parent,
 					&(*bin_attr)->attr, true,
-					mode, NULL);
+					mode,
+					uid, gid, NULL);
 			if (error)
 				break;
 		}
@@ -106,6 +108,8 @@ static int internal_create_group(struct kobject *kobj, int update,
 				 const struct attribute_group *grp)
 {
 	struct kernfs_node *kn;
+	kuid_t uid;
+	kgid_t gid;
 	int error;
 
 	BUG_ON(!kobj || (!update && !kobj->sd));
@@ -118,9 +122,11 @@ static int internal_create_group(struct kobject *kobj, int update,
 			kobj->name, grp->name ?: "");
 		return -EINVAL;
 	}
+	kobject_get_ownership(kobj, &uid, &gid);
 	if (grp->name) {
-		kn = kernfs_create_dir(kobj->sd, grp->name,
-				       S_IRWXU | S_IRUGO | S_IXUGO, kobj);
+		kn = kernfs_create_dir_ns(kobj->sd, grp->name,
+					  S_IRWXU | S_IRUGO | S_IXUGO,
+					  uid, gid, kobj, NULL);
 		if (IS_ERR(kn)) {
 			if (PTR_ERR(kn) == -EEXIST)
 				sysfs_warn_dup(kobj->sd, grp->name);
@@ -129,7 +135,7 @@ static int internal_create_group(struct kobject *kobj, int update,
 	} else
 		kn = kobj->sd;
 	kernfs_get(kn);
-	error = create_files(kn, kobj, grp, update);
+	error = create_files(kn, kobj, uid, gid, grp, update);
 	if (error) {
 		if (grp->name)
 			kernfs_remove(kn);
@@ -281,6 +287,8 @@ int sysfs_merge_group(struct kobject *kobj,
 		       const struct attribute_group *grp)
 {
 	struct kernfs_node *parent;
+	kuid_t uid;
+	kgid_t gid;
 	int error = 0;
 	struct attribute *const *attr;
 	int i;
@@ -289,8 +297,11 @@ int sysfs_merge_group(struct kobject *kobj,
 	if (!parent)
 		return -ENOENT;
 
+	kobject_get_ownership(kobj, &uid, &gid);
+
 	for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
-		error = sysfs_add_file(parent, *attr, false);
+		error = sysfs_add_file_mode_ns(parent, *attr, false,
+					       (*attr)->mode, uid, gid, NULL);
 	if (error) {
 		while (--i >= 0)
 			kernfs_remove_by_name(parent, (*--attr)->name);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d098e015fcc9..0050cc0c0236 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -27,11 +27,10 @@ void sysfs_warn_dup(struct kernfs_node *parent, const char *name);
 /*
  * file.c
  */
-int sysfs_add_file(struct kernfs_node *parent,
-		   const struct attribute *attr, bool is_bin);
 int sysfs_add_file_mode_ns(struct kernfs_node *parent,
 			   const struct attribute *attr, bool is_bin,
-			   umode_t amode, const void *ns);
+			   umode_t amode, kuid_t uid, kgid_t gid,
+			   const void *ns);
 
 /*
  * symlink.c
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 7f6f93c3df9c..b49ff230beba 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -26,6 +26,7 @@
 #include <linux/wait.h>
 #include <linux/atomic.h>
 #include <linux/workqueue.h>
+#include <linux/uidgid.h>
 
 #define UEVENT_HELPER_PATH_LEN		256
 #define UEVENT_NUM_ENVP			32	/* number of env pointers */
@@ -114,6 +115,8 @@ extern struct kobject * __must_check kobject_get_unless_zero(
 extern void kobject_put(struct kobject *kobj);
 
 extern const void *kobject_namespace(struct kobject *kobj);
+extern void kobject_get_ownership(struct kobject *kobj,
+				  kuid_t *uid, kgid_t *gid);
 extern char *kobject_get_path(struct kobject *kobj, gfp_t flag);
 
 struct kobj_type {
@@ -122,6 +125,7 @@ struct kobj_type {
 	struct attribute **default_attrs;
 	const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
 	const void *(*namespace)(struct kobject *kobj);
+	void (*get_ownership)(struct kobject *kobj, kuid_t *uid, kgid_t *gid);
 };
 
 struct kobj_uevent_env {
diff --git a/lib/kobject.c b/lib/kobject.c
index 18989b5b3b56..f2dc1f756007 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -35,6 +35,25 @@ const void *kobject_namespace(struct kobject *kobj)
 	return kobj->ktype->namespace(kobj);
 }
 
+/**
+ * kobject_get_ownership - get sysfs ownership data for @kobj
+ * @kobj: kobject in question
+ * @uid: kernel user ID for sysfs objects
+ * @gid: kernel group ID for sysfs objects
+ *
+ * Returns initial uid/gid pair that should be used when creating sysfs
+ * representation of given kobject. Normally used to adjust ownership of
+ * objects in a container.
+ */
+void kobject_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
+{
+	*uid = GLOBAL_ROOT_UID;
+	*gid = GLOBAL_ROOT_GID;
+
+	if (kobj->ktype->get_ownership)
+		kobj->ktype->get_ownership(kobj, uid, gid);
+}
+
 /*
  * populate_dir - populate directory with attributes.
  * @kobj: object we're working on.
-- 
cgit v1.2.3


From 9944e894c1266dc8515c82d1ff752d681215526b Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 20 Jul 2018 21:56:50 +0000
Subject: driver core: set up ownership of class devices in sysfs

Plumb in get_ownership() callback for devices belonging to a class so that
they can be created with uid/gid different from global root. This will
allow network devices in a container to belong to container's root and not
global root.

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Reviewed-by: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/base/core.c    | 9 +++++++++
 include/linux/device.h | 5 +++++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index df3e1a44707a..276c7e3f754c 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -866,10 +866,19 @@ static const void *device_namespace(struct kobject *kobj)
 	return ns;
 }
 
+static void device_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
+{
+	struct device *dev = kobj_to_dev(kobj);
+
+	if (dev->class && dev->class->get_ownership)
+		dev->class->get_ownership(dev, uid, gid);
+}
+
 static struct kobj_type device_ktype = {
 	.release	= device_release,
 	.sysfs_ops	= &dev_sysfs_ops,
 	.namespace	= device_namespace,
+	.get_ownership	= device_get_ownership,
 };
 
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 055a69dbcd18..fe6ccb6dc119 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -384,6 +384,9 @@ int subsys_virtual_register(struct bus_type *subsys,
  * @shutdown_pre: Called at shut-down time before driver shutdown.
  * @ns_type:	Callbacks so sysfs can detemine namespaces.
  * @namespace:	Namespace of the device belongs to this class.
+ * @get_ownership: Allows class to specify uid/gid of the sysfs directories
+ *		for the devices belonging to the class. Usually tied to
+ *		device's namespace.
  * @pm:		The default device power management operations of this class.
  * @p:		The private data of the driver core, no one other than the
  *		driver core can touch this.
@@ -413,6 +416,8 @@ struct class {
 	const struct kobj_ns_type_operations *ns_type;
 	const void *(*namespace)(struct device *dev);
 
+	void (*get_ownership)(struct device *dev, kuid_t *uid, kgid_t *gid);
+
 	const struct dev_pm_ops *pm;
 
 	struct subsys_private *p;
-- 
cgit v1.2.3


From a3134fb09e0bc5bee76e13bf863173b86f21cf87 Mon Sep 17 00:00:00 2001
From: Rishabh Bhatnagar <rishabhb@codeaurora.org>
Date: Wed, 23 May 2018 17:35:21 -0700
Subject: drivers: soc: Add LLCC driver

LLCC (Last Level Cache Controller) provides additional cache memory
in the system. LLCC is partitioned into multiple slices and each
slice gets its own priority, size, ID and other config parameters.
LLCC driver programs these parameters for each slice. Clients that
are assigned to use LLCC need to get information such size & ID of the
slice they get and activate or deactivate the slice as needed. LLCC driver
provides API for the clients to perform these operations.

Signed-off-by: Channagoud Kadabi <ckadabi@codeaurora.org>
Signed-off-by: Rishabh Bhatnagar <rishabhb@codeaurora.org>
Reviewed-by: Evan Green <evgreen@chromium.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
 drivers/soc/qcom/Kconfig           |  17 ++
 drivers/soc/qcom/Makefile          |   2 +
 drivers/soc/qcom/llcc-sdm845.c     |  94 +++++++++++
 drivers/soc/qcom/llcc-slice.c      | 335 +++++++++++++++++++++++++++++++++++++
 include/linux/soc/qcom/llcc-qcom.h | 180 ++++++++++++++++++++
 5 files changed, 628 insertions(+)
 create mode 100644 drivers/soc/qcom/llcc-sdm845.c
 create mode 100644 drivers/soc/qcom/llcc-slice.c
 create mode 100644 include/linux/soc/qcom/llcc-qcom.h

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig
index 9dc02f390ba3..d1a41852ae7a 100644
--- a/drivers/soc/qcom/Kconfig
+++ b/drivers/soc/qcom/Kconfig
@@ -39,6 +39,23 @@ config QCOM_GSBI
           functions for connecting the underlying serial UART, SPI, and I2C
           devices to the output pins.
 
+config QCOM_LLCC
+	tristate "Qualcomm Technologies, Inc. LLCC driver"
+	depends on ARCH_QCOM
+	help
+	  Qualcomm Technologies, Inc. platform specific
+	  Last Level Cache Controller(LLCC) driver. This provides interfaces
+	  to clients that use the LLCC. Say yes here to enable LLCC slice
+	  driver.
+
+config QCOM_SDM845_LLCC
+	tristate "Qualcomm Technologies, Inc. SDM845 LLCC driver"
+	depends on QCOM_LLCC
+	help
+	  Say yes here to enable the LLCC driver for SDM845. This provides
+	  data required to configure LLCC so that clients can start using the
+	  LLCC slices.
+
 config QCOM_MDT_LOADER
 	tristate
 	select QCOM_SCM
diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile
index 19dcf957cb3a..5921454c4ddd 100644
--- a/drivers/soc/qcom/Makefile
+++ b/drivers/soc/qcom/Makefile
@@ -15,3 +15,5 @@ obj-$(CONFIG_QCOM_SMP2P)	+= smp2p.o
 obj-$(CONFIG_QCOM_SMSM)	+= smsm.o
 obj-$(CONFIG_QCOM_WCNSS_CTRL) += wcnss_ctrl.o
 obj-$(CONFIG_QCOM_APR) += apr.o
+obj-$(CONFIG_QCOM_LLCC) += llcc-slice.o
+obj-$(CONFIG_QCOM_SDM845_LLCC) += llcc-sdm845.o
diff --git a/drivers/soc/qcom/llcc-sdm845.c b/drivers/soc/qcom/llcc-sdm845.c
new file mode 100644
index 000000000000..2e1e4f0a5db8
--- /dev/null
+++ b/drivers/soc/qcom/llcc-sdm845.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2017-2018, The Linux Foundation. All rights reserved.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/soc/qcom/llcc-qcom.h>
+
+/*
+ * SCT(System Cache Table) entry contains of the following members:
+ * usecase_id: Unique id for the client's use case
+ * slice_id: llcc slice id for each client
+ * max_cap: The maximum capacity of the cache slice provided in KB
+ * priority: Priority of the client used to select victim line for replacement
+ * fixed_size: Boolean indicating if the slice has a fixed capacity
+ * bonus_ways: Bonus ways are additional ways to be used for any slice,
+ *		if client ends up using more than reserved cache ways. Bonus
+ *		ways are allocated only if they are not reserved for some
+ *		other client.
+ * res_ways: Reserved ways for the cache slice, the reserved ways cannot
+ *		be used by any other client than the one its assigned to.
+ * cache_mode: Each slice operates as a cache, this controls the mode of the
+ *             slice: normal or TCM(Tightly Coupled Memory)
+ * probe_target_ways: Determines what ways to probe for access hit. When
+ *                    configured to 1 only bonus and reserved ways are probed.
+ *                    When configured to 0 all ways in llcc are probed.
+ * dis_cap_alloc: Disable capacity based allocation for a client
+ * retain_on_pc: If this bit is set and client has maintained active vote
+ *               then the ways assigned to this client are not flushed on power
+ *               collapse.
+ * activate_on_init: Activate the slice immediately after the SCT is programmed
+ */
+#define SCT_ENTRY(uid, sid, mc, p, fs, bway, rway, cmod, ptw, dca, rp, a) \
+	{					\
+		.usecase_id = uid,		\
+		.slice_id = sid,		\
+		.max_cap = mc,			\
+		.priority = p,			\
+		.fixed_size = fs,		\
+		.bonus_ways = bway,		\
+		.res_ways = rway,		\
+		.cache_mode = cmod,		\
+		.probe_target_ways = ptw,	\
+		.dis_cap_alloc = dca,		\
+		.retain_on_pc = rp,		\
+		.activate_on_init = a,		\
+	}
+
+static struct llcc_slice_config sdm845_data[] =  {
+	SCT_ENTRY(LLCC_CPUSS,    1,  2816, 1, 0, 0xffc, 0x2,   0, 0, 1, 1, 1),
+	SCT_ENTRY(LLCC_VIDSC0,   2,  512,  2, 1, 0x0,   0x0f0, 0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_VIDSC1,   3,  512,  2, 1, 0x0,   0x0f0, 0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_ROTATOR,  4,  563,  2, 1, 0x0,   0x00e, 2, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_VOICE,    5,  2816, 1, 0, 0xffc, 0x2,   0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_AUDIO,    6,  2816, 1, 0, 0xffc, 0x2,   0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_MDMHPGRW, 7,  1024, 2, 0, 0xfc,  0xf00, 0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_MDM,      8,  2816, 1, 0, 0xffc, 0x2,   0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_CMPT,     10, 2816, 1, 0, 0xffc, 0x2,   0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_GPUHTW,   11, 512,  1, 1, 0xc,   0x0,   0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_GPU,      12, 2304, 1, 0, 0xff0, 0x2,   0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_MMUHWT,   13, 256,  2, 0, 0x0,   0x1,   0, 0, 1, 0, 1),
+	SCT_ENTRY(LLCC_CMPTDMA,  15, 2816, 1, 0, 0xffc, 0x2,   0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_DISP,     16, 2816, 1, 0, 0xffc, 0x2,   0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_VIDFW,    17, 2816, 1, 0, 0xffc, 0x2,   0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_MDMHPFX,  20, 1024, 2, 1, 0x0,   0xf00, 0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_MDMPNG,   21, 1024, 0, 1, 0x1e,  0x0,   0, 0, 1, 1, 0),
+	SCT_ENTRY(LLCC_AUDHW,    22, 1024, 1, 1, 0xffc, 0x2,   0, 0, 1, 1, 0),
+};
+
+static int sdm845_qcom_llcc_probe(struct platform_device *pdev)
+{
+	return qcom_llcc_probe(pdev, sdm845_data, ARRAY_SIZE(sdm845_data));
+}
+
+static const struct of_device_id sdm845_qcom_llcc_of_match[] = {
+	{ .compatible = "qcom,sdm845-llcc", },
+	{ }
+};
+
+static struct platform_driver sdm845_qcom_llcc_driver = {
+	.driver = {
+		.name = "sdm845-llcc",
+		.of_match_table = sdm845_qcom_llcc_of_match,
+	},
+	.probe = sdm845_qcom_llcc_probe,
+};
+module_platform_driver(sdm845_qcom_llcc_driver);
+
+MODULE_DESCRIPTION("QCOM sdm845 LLCC driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/soc/qcom/llcc-slice.c b/drivers/soc/qcom/llcc-slice.c
new file mode 100644
index 000000000000..fcaad1a4b41d
--- /dev/null
+++ b/drivers/soc/qcom/llcc-slice.c
@@ -0,0 +1,335 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2017-2018, The Linux Foundation. All rights reserved.
+ *
+ */
+
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <linux/soc/qcom/llcc-qcom.h>
+
+#define ACTIVATE                      BIT(0)
+#define DEACTIVATE                    BIT(1)
+#define ACT_CTRL_OPCODE_ACTIVATE      BIT(0)
+#define ACT_CTRL_OPCODE_DEACTIVATE    BIT(1)
+#define ACT_CTRL_ACT_TRIG             BIT(0)
+#define ACT_CTRL_OPCODE_SHIFT         0x01
+#define ATTR1_PROBE_TARGET_WAYS_SHIFT 0x02
+#define ATTR1_FIXED_SIZE_SHIFT        0x03
+#define ATTR1_PRIORITY_SHIFT          0x04
+#define ATTR1_MAX_CAP_SHIFT           0x10
+#define ATTR0_RES_WAYS_MASK           GENMASK(11, 0)
+#define ATTR0_BONUS_WAYS_MASK         GENMASK(27, 16)
+#define ATTR0_BONUS_WAYS_SHIFT        0x10
+#define LLCC_STATUS_READ_DELAY        100
+
+#define CACHE_LINE_SIZE_SHIFT         6
+
+#define LLCC_COMMON_STATUS0           0x0003000c
+#define LLCC_LB_CNT_MASK              GENMASK(31, 28)
+#define LLCC_LB_CNT_SHIFT             28
+
+#define MAX_CAP_TO_BYTES(n)           (n * SZ_1K)
+#define LLCC_TRP_ACT_CTRLn(n)         (n * SZ_4K)
+#define LLCC_TRP_STATUSn(n)           (4 + n * SZ_4K)
+#define LLCC_TRP_ATTR0_CFGn(n)        (0x21000 + SZ_8 * n)
+#define LLCC_TRP_ATTR1_CFGn(n)        (0x21004 + SZ_8 * n)
+
+#define BANK_OFFSET_STRIDE	      0x80000
+
+static struct llcc_drv_data *drv_data;
+
+static const struct regmap_config llcc_regmap_config = {
+	.reg_bits = 32,
+	.reg_stride = 4,
+	.val_bits = 32,
+	.fast_io = true,
+};
+
+/**
+ * llcc_slice_getd - get llcc slice descriptor
+ * @uid: usecase_id for the client
+ *
+ * A pointer to llcc slice descriptor will be returned on success and
+ * and error pointer is returned on failure
+ */
+struct llcc_slice_desc *llcc_slice_getd(u32 uid)
+{
+	const struct llcc_slice_config *cfg;
+	struct llcc_slice_desc *desc;
+	u32 sz, count;
+
+	cfg = drv_data->cfg;
+	sz = drv_data->cfg_size;
+
+	for (count = 0; cfg && count < sz; count++, cfg++)
+		if (cfg->usecase_id == uid)
+			break;
+
+	if (count == sz || !cfg)
+		return ERR_PTR(-ENODEV);
+
+	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+	if (!desc)
+		return ERR_PTR(-ENOMEM);
+
+	desc->slice_id = cfg->slice_id;
+	desc->slice_size = cfg->max_cap;
+
+	return desc;
+}
+EXPORT_SYMBOL_GPL(llcc_slice_getd);
+
+/**
+ * llcc_slice_putd - llcc slice descritpor
+ * @desc: Pointer to llcc slice descriptor
+ */
+void llcc_slice_putd(struct llcc_slice_desc *desc)
+{
+	kfree(desc);
+}
+EXPORT_SYMBOL_GPL(llcc_slice_putd);
+
+static int llcc_update_act_ctrl(u32 sid,
+				u32 act_ctrl_reg_val, u32 status)
+{
+	u32 act_ctrl_reg;
+	u32 status_reg;
+	u32 slice_status;
+	int ret;
+
+	act_ctrl_reg = drv_data->bcast_off + LLCC_TRP_ACT_CTRLn(sid);
+	status_reg = drv_data->bcast_off + LLCC_TRP_STATUSn(sid);
+
+	/* Set the ACTIVE trigger */
+	act_ctrl_reg_val |= ACT_CTRL_ACT_TRIG;
+	ret = regmap_write(drv_data->regmap, act_ctrl_reg, act_ctrl_reg_val);
+	if (ret)
+		return ret;
+
+	/* Clear the ACTIVE trigger */
+	act_ctrl_reg_val &= ~ACT_CTRL_ACT_TRIG;
+	ret = regmap_write(drv_data->regmap, act_ctrl_reg, act_ctrl_reg_val);
+	if (ret)
+		return ret;
+
+	ret = regmap_read_poll_timeout(drv_data->regmap, status_reg,
+				      slice_status, !(slice_status & status),
+				      0, LLCC_STATUS_READ_DELAY);
+	return ret;
+}
+
+/**
+ * llcc_slice_activate - Activate the llcc slice
+ * @desc: Pointer to llcc slice descriptor
+ *
+ * A value of zero will be returned on success and a negative errno will
+ * be returned in error cases
+ */
+int llcc_slice_activate(struct llcc_slice_desc *desc)
+{
+	int ret;
+	u32 act_ctrl_val;
+
+	mutex_lock(&drv_data->lock);
+	if (test_bit(desc->slice_id, drv_data->bitmap)) {
+		mutex_unlock(&drv_data->lock);
+		return 0;
+	}
+
+	act_ctrl_val = ACT_CTRL_OPCODE_ACTIVATE << ACT_CTRL_OPCODE_SHIFT;
+
+	ret = llcc_update_act_ctrl(desc->slice_id, act_ctrl_val,
+				  DEACTIVATE);
+	if (ret) {
+		mutex_unlock(&drv_data->lock);
+		return ret;
+	}
+
+	__set_bit(desc->slice_id, drv_data->bitmap);
+	mutex_unlock(&drv_data->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(llcc_slice_activate);
+
+/**
+ * llcc_slice_deactivate - Deactivate the llcc slice
+ * @desc: Pointer to llcc slice descriptor
+ *
+ * A value of zero will be returned on success and a negative errno will
+ * be returned in error cases
+ */
+int llcc_slice_deactivate(struct llcc_slice_desc *desc)
+{
+	u32 act_ctrl_val;
+	int ret;
+
+	mutex_lock(&drv_data->lock);
+	if (!test_bit(desc->slice_id, drv_data->bitmap)) {
+		mutex_unlock(&drv_data->lock);
+		return 0;
+	}
+	act_ctrl_val = ACT_CTRL_OPCODE_DEACTIVATE << ACT_CTRL_OPCODE_SHIFT;
+
+	ret = llcc_update_act_ctrl(desc->slice_id, act_ctrl_val,
+				  ACTIVATE);
+	if (ret) {
+		mutex_unlock(&drv_data->lock);
+		return ret;
+	}
+
+	__clear_bit(desc->slice_id, drv_data->bitmap);
+	mutex_unlock(&drv_data->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(llcc_slice_deactivate);
+
+/**
+ * llcc_get_slice_id - return the slice id
+ * @desc: Pointer to llcc slice descriptor
+ */
+int llcc_get_slice_id(struct llcc_slice_desc *desc)
+{
+	return desc->slice_id;
+}
+EXPORT_SYMBOL_GPL(llcc_get_slice_id);
+
+/**
+ * llcc_get_slice_size - return the slice id
+ * @desc: Pointer to llcc slice descriptor
+ */
+size_t llcc_get_slice_size(struct llcc_slice_desc *desc)
+{
+	return desc->slice_size;
+}
+EXPORT_SYMBOL_GPL(llcc_get_slice_size);
+
+static int qcom_llcc_cfg_program(struct platform_device *pdev)
+{
+	int i;
+	u32 attr1_cfg;
+	u32 attr0_cfg;
+	u32 attr1_val;
+	u32 attr0_val;
+	u32 max_cap_cacheline;
+	u32 sz;
+	int ret;
+	const struct llcc_slice_config *llcc_table;
+	struct llcc_slice_desc desc;
+	u32 bcast_off = drv_data->bcast_off;
+
+	sz = drv_data->cfg_size;
+	llcc_table = drv_data->cfg;
+
+	for (i = 0; i < sz; i++) {
+		attr1_cfg = bcast_off +
+				LLCC_TRP_ATTR1_CFGn(llcc_table[i].slice_id);
+		attr0_cfg = bcast_off +
+				LLCC_TRP_ATTR0_CFGn(llcc_table[i].slice_id);
+
+		attr1_val = llcc_table[i].cache_mode;
+		attr1_val |= llcc_table[i].probe_target_ways <<
+				ATTR1_PROBE_TARGET_WAYS_SHIFT;
+		attr1_val |= llcc_table[i].fixed_size <<
+				ATTR1_FIXED_SIZE_SHIFT;
+		attr1_val |= llcc_table[i].priority <<
+				ATTR1_PRIORITY_SHIFT;
+
+		max_cap_cacheline = MAX_CAP_TO_BYTES(llcc_table[i].max_cap);
+
+		/* LLCC instances can vary for each target.
+		 * The SW writes to broadcast register which gets propagated
+		 * to each llcc instace (llcc0,.. llccN).
+		 * Since the size of the memory is divided equally amongst the
+		 * llcc instances, we need to configure the max cap accordingly.
+		 */
+		max_cap_cacheline = max_cap_cacheline / drv_data->num_banks;
+		max_cap_cacheline >>= CACHE_LINE_SIZE_SHIFT;
+		attr1_val |= max_cap_cacheline << ATTR1_MAX_CAP_SHIFT;
+
+		attr0_val = llcc_table[i].res_ways & ATTR0_RES_WAYS_MASK;
+		attr0_val |= llcc_table[i].bonus_ways << ATTR0_BONUS_WAYS_SHIFT;
+
+		ret = regmap_write(drv_data->regmap, attr1_cfg, attr1_val);
+		if (ret)
+			return ret;
+		ret = regmap_write(drv_data->regmap, attr0_cfg, attr0_val);
+		if (ret)
+			return ret;
+		if (llcc_table[i].activate_on_init) {
+			desc.slice_id = llcc_table[i].slice_id;
+			ret = llcc_slice_activate(&desc);
+		}
+	}
+	return ret;
+}
+
+int qcom_llcc_probe(struct platform_device *pdev,
+		      const struct llcc_slice_config *llcc_cfg, u32 sz)
+{
+	u32 num_banks;
+	struct device *dev = &pdev->dev;
+	struct resource *res;
+	void __iomem *base;
+	int ret, i;
+
+	drv_data = devm_kzalloc(dev, sizeof(*drv_data), GFP_KERNEL);
+	if (!drv_data)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	drv_data->regmap = devm_regmap_init_mmio(dev, base,
+					&llcc_regmap_config);
+	if (IS_ERR(drv_data->regmap))
+		return PTR_ERR(drv_data->regmap);
+
+	ret = regmap_read(drv_data->regmap, LLCC_COMMON_STATUS0,
+						&num_banks);
+	if (ret)
+		return ret;
+
+	num_banks &= LLCC_LB_CNT_MASK;
+	num_banks >>= LLCC_LB_CNT_SHIFT;
+	drv_data->num_banks = num_banks;
+
+	for (i = 0; i < sz; i++)
+		if (llcc_cfg[i].slice_id > drv_data->max_slices)
+			drv_data->max_slices = llcc_cfg[i].slice_id;
+
+	drv_data->offsets = devm_kcalloc(dev, num_banks, sizeof(u32),
+							GFP_KERNEL);
+	if (!drv_data->offsets)
+		return -ENOMEM;
+
+	for (i = 0; i < num_banks; i++)
+		drv_data->offsets[i] = i * BANK_OFFSET_STRIDE;
+
+	drv_data->bcast_off = num_banks * BANK_OFFSET_STRIDE;
+
+	drv_data->bitmap = devm_kcalloc(dev,
+	BITS_TO_LONGS(drv_data->max_slices), sizeof(unsigned long),
+						GFP_KERNEL);
+	if (!drv_data->bitmap)
+		return -ENOMEM;
+
+	drv_data->cfg = llcc_cfg;
+	drv_data->cfg_size = sz;
+	mutex_init(&drv_data->lock);
+	platform_set_drvdata(pdev, drv_data);
+
+	return qcom_llcc_cfg_program(pdev);
+}
+EXPORT_SYMBOL_GPL(qcom_llcc_probe);
diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h
new file mode 100644
index 000000000000..7e3b9c605ab2
--- /dev/null
+++ b/include/linux/soc/qcom/llcc-qcom.h
@@ -0,0 +1,180 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2017-2018, The Linux Foundation. All rights reserved.
+ *
+ */
+
+#include <linux/platform_device.h>
+#ifndef __LLCC_QCOM__
+#define __LLCC_QCOM__
+
+#define LLCC_CPUSS       1
+#define LLCC_VIDSC0      2
+#define LLCC_VIDSC1      3
+#define LLCC_ROTATOR     4
+#define LLCC_VOICE       5
+#define LLCC_AUDIO       6
+#define LLCC_MDMHPGRW    7
+#define LLCC_MDM         8
+#define LLCC_CMPT        10
+#define LLCC_GPUHTW      11
+#define LLCC_GPU         12
+#define LLCC_MMUHWT      13
+#define LLCC_CMPTDMA     15
+#define LLCC_DISP        16
+#define LLCC_VIDFW       17
+#define LLCC_MDMHPFX     20
+#define LLCC_MDMPNG      21
+#define LLCC_AUDHW       22
+
+/**
+ * llcc_slice_desc - Cache slice descriptor
+ * @slice_id: llcc slice id
+ * @slice_size: Size allocated for the llcc slice
+ */
+struct llcc_slice_desc {
+	u32 slice_id;
+	size_t slice_size;
+};
+
+/**
+ * llcc_slice_config - Data associated with the llcc slice
+ * @usecase_id: usecase id for which the llcc slice is used
+ * @slice_id: llcc slice id assigned to each slice
+ * @max_cap: maximum capacity of the llcc slice
+ * @priority: priority of the llcc slice
+ * @fixed_size: whether the llcc slice can grow beyond its size
+ * @bonus_ways: bonus ways associated with llcc slice
+ * @res_ways: reserved ways associated with llcc slice
+ * @cache_mode: mode of the llcc slice
+ * @probe_target_ways: Probe only reserved and bonus ways on a cache miss
+ * @dis_cap_alloc: Disable capacity based allocation
+ * @retain_on_pc: Retain through power collapse
+ * @activate_on_init: activate the slice on init
+ */
+struct llcc_slice_config {
+	u32 usecase_id;
+	u32 slice_id;
+	u32 max_cap;
+	u32 priority;
+	bool fixed_size;
+	u32 bonus_ways;
+	u32 res_ways;
+	u32 cache_mode;
+	u32 probe_target_ways;
+	bool dis_cap_alloc;
+	bool retain_on_pc;
+	bool activate_on_init;
+};
+
+/**
+ * llcc_drv_data - Data associated with the llcc driver
+ * @regmap: regmap associated with the llcc device
+ * @cfg: pointer to the data structure for slice configuration
+ * @lock: mutex associated with each slice
+ * @cfg_size: size of the config data table
+ * @max_slices: max slices as read from device tree
+ * @bcast_off: Offset of the broadcast bank
+ * @num_banks: Number of llcc banks
+ * @bitmap: Bit map to track the active slice ids
+ * @offsets: Pointer to the bank offsets array
+ */
+struct llcc_drv_data {
+	struct regmap *regmap;
+	const struct llcc_slice_config *cfg;
+	struct mutex lock;
+	u32 cfg_size;
+	u32 max_slices;
+	u32 bcast_off;
+	u32 num_banks;
+	unsigned long *bitmap;
+	u32 *offsets;
+};
+
+#if IS_ENABLED(CONFIG_QCOM_LLCC)
+/**
+ * llcc_slice_getd - get llcc slice descriptor
+ * @uid: usecase_id of the client
+ */
+struct llcc_slice_desc *llcc_slice_getd(u32 uid);
+
+/**
+ * llcc_slice_putd - llcc slice descritpor
+ * @desc: Pointer to llcc slice descriptor
+ */
+void llcc_slice_putd(struct llcc_slice_desc *desc);
+
+/**
+ * llcc_get_slice_id - get slice id
+ * @desc: Pointer to llcc slice descriptor
+ */
+int llcc_get_slice_id(struct llcc_slice_desc *desc);
+
+/**
+ * llcc_get_slice_size - llcc slice size
+ * @desc: Pointer to llcc slice descriptor
+ */
+size_t llcc_get_slice_size(struct llcc_slice_desc *desc);
+
+/**
+ * llcc_slice_activate - Activate the llcc slice
+ * @desc: Pointer to llcc slice descriptor
+ */
+int llcc_slice_activate(struct llcc_slice_desc *desc);
+
+/**
+ * llcc_slice_deactivate - Deactivate the llcc slice
+ * @desc: Pointer to llcc slice descriptor
+ */
+int llcc_slice_deactivate(struct llcc_slice_desc *desc);
+
+/**
+ * qcom_llcc_probe - program the sct table
+ * @pdev: platform device pointer
+ * @table: soc sct table
+ * @sz: Size of the config table
+ */
+int qcom_llcc_probe(struct platform_device *pdev,
+		      const struct llcc_slice_config *table, u32 sz);
+#else
+static inline struct llcc_slice_desc *llcc_slice_getd(u32 uid)
+{
+	return NULL;
+}
+
+static inline void llcc_slice_putd(struct llcc_slice_desc *desc)
+{
+
+};
+
+static inline int llcc_get_slice_id(struct llcc_slice_desc *desc)
+{
+	return -EINVAL;
+}
+
+static inline size_t llcc_get_slice_size(struct llcc_slice_desc *desc)
+{
+	return 0;
+}
+static inline int llcc_slice_activate(struct llcc_slice_desc *desc)
+{
+	return -EINVAL;
+}
+
+static inline int llcc_slice_deactivate(struct llcc_slice_desc *desc)
+{
+	return -EINVAL;
+}
+static inline int qcom_llcc_probe(struct platform_device *pdev,
+		      const struct llcc_slice_config *table, u32 sz)
+{
+	return -ENODEV;
+}
+
+static inline int qcom_llcc_remove(struct platform_device *pdev)
+{
+	return -ENODEV;
+}
+#endif
+
+#endif
-- 
cgit v1.2.3


From a0b1561f846191a1967e8f5a26d8ef59c1b9162a Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@linaro.org>
Date: Tue, 12 Jun 2018 15:23:13 +0200
Subject: firmware: qcom: scm: add a dummy qcom_scm_assign_mem()

Add a dummy qcom_scm_assign_mem() to enable building drivers when
CONFIG_COMPILE_TEST=y && CONFIG_QCOM_SCM=n.

All other qcom_scm_* functions already have a dummy version.

Signed-off-by: Niklas Cassel <niklas.cassel@linaro.org>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Andy Gross <andy.gross@linaro.org>
---
 include/linux/qcom_scm.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h
index b401b962afff..5d65521260b3 100644
--- a/include/linux/qcom_scm.h
+++ b/include/linux/qcom_scm.h
@@ -87,6 +87,10 @@ static inline int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr,
 static inline int
 qcom_scm_pas_auth_and_reset(u32 peripheral) { return -ENODEV; }
 static inline int qcom_scm_pas_shutdown(u32 peripheral) { return -ENODEV; }
+static inline int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz,
+				      unsigned int *src,
+				      struct qcom_scm_vmperm *newvm,
+				      int dest_cnt) { return -ENODEV; }
 static inline void qcom_scm_cpu_power_down(u32 flags) {}
 static inline u32 qcom_scm_get_version(void) { return 0; }
 static inline u32
-- 
cgit v1.2.3


From c4db9c1e8c70bc60e392da8a485bcfb035d559c2 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Fri, 20 Jul 2018 10:47:23 +0900
Subject: efi: Deduplicate efi_open_volume()

There's one ARM, one x86_32 and one x86_64 version of efi_open_volume()
which can be folded into a single shared version by masking their
differences with the efi_call_proto() macro introduced by commit:

  3552fdf29f01 ("efi: Allow bitness-agnostic protocol calls").

To be able to dereference the device_handle attribute from the
efi_loaded_image_t table in an arch- and bitness-agnostic manner,
introduce the efi_table_attr() macro (which already exists for x86)
to arm and arm64.

No functional change intended.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20180720014726.24031-7-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/include/asm/efi.h                     |  3 ++
 arch/arm64/include/asm/efi.h                   |  3 ++
 arch/x86/boot/compressed/eboot.c               | 63 --------------------------
 drivers/firmware/efi/libstub/arm-stub.c        | 25 ----------
 drivers/firmware/efi/libstub/efi-stub-helper.c | 31 ++++++++++++-
 drivers/firmware/efi/libstub/efistub.h         |  3 --
 include/linux/efi.h                            | 10 ++++
 7 files changed, 45 insertions(+), 93 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h
index 17f1f1a814ff..38badaae8d9d 100644
--- a/arch/arm/include/asm/efi.h
+++ b/arch/arm/include/asm/efi.h
@@ -58,6 +58,9 @@ void efi_virtmap_unload(void);
 #define efi_call_runtime(f, ...)	sys_table_arg->runtime->f(__VA_ARGS__)
 #define efi_is_64bit()			(false)
 
+#define efi_table_attr(table, attr, instance)				\
+	((table##_t *)instance)->attr
+
 #define efi_call_proto(protocol, f, instance, ...)			\
 	((protocol##_t *)instance)->f(instance, ##__VA_ARGS__)
 
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index 192d791f1103..7ed320895d1f 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -87,6 +87,9 @@ static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base,
 #define efi_call_runtime(f, ...)	sys_table_arg->runtime->f(__VA_ARGS__)
 #define efi_is_64bit()			(true)
 
+#define efi_table_attr(table, attr, instance)				\
+	((table##_t *)instance)->attr
+
 #define efi_call_proto(protocol, f, instance, ...)			\
 	((protocol##_t *)instance)->f(instance, ##__VA_ARGS__)
 
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 92b573fd239c..915c64edbe8e 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -41,69 +41,6 @@ static void setup_boot_services##bits(struct efi_config *c)		\
 BOOT_SERVICES(32);
 BOOT_SERVICES(64);
 
-static inline efi_status_t __open_volume32(void *__image, void **__fh)
-{
-	efi_file_io_interface_t *io;
-	efi_loaded_image_32_t *image = __image;
-	efi_file_handle_32_t *fh;
-	efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
-	efi_status_t status;
-	void *handle = (void *)(unsigned long)image->device_handle;
-	unsigned long func;
-
-	status = efi_call_early(handle_protocol, handle,
-				&fs_proto, (void **)&io);
-	if (status != EFI_SUCCESS) {
-		efi_printk(sys_table, "Failed to handle fs_proto\n");
-		return status;
-	}
-
-	func = (unsigned long)io->open_volume;
-	status = efi_early->call(func, io, &fh);
-	if (status != EFI_SUCCESS)
-		efi_printk(sys_table, "Failed to open volume\n");
-
-	*__fh = fh;
-
-	return status;
-}
-
-static inline efi_status_t __open_volume64(void *__image, void **__fh)
-{
-	efi_file_io_interface_t *io;
-	efi_loaded_image_64_t *image = __image;
-	efi_file_handle_64_t *fh;
-	efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
-	efi_status_t status;
-	void *handle = (void *)(unsigned long)image->device_handle;
-	unsigned long func;
-
-	status = efi_call_early(handle_protocol, handle,
-				&fs_proto, (void **)&io);
-	if (status != EFI_SUCCESS) {
-		efi_printk(sys_table, "Failed to handle fs_proto\n");
-		return status;
-	}
-
-	func = (unsigned long)io->open_volume;
-	status = efi_early->call(func, io, &fh);
-	if (status != EFI_SUCCESS)
-		efi_printk(sys_table, "Failed to open volume\n");
-
-	*__fh = fh;
-
-	return status;
-}
-
-efi_status_t
-efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh)
-{
-	if (efi_early->is64)
-		return __open_volume64(__image, __fh);
-
-	return __open_volume32(__image, __fh);
-}
-
 void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
 {
 	efi_call_proto(efi_simple_text_output_protocol, output_string,
diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c
index c98b1856fc3d..6920033de6d4 100644
--- a/drivers/firmware/efi/libstub/arm-stub.c
+++ b/drivers/firmware/efi/libstub/arm-stub.c
@@ -40,31 +40,6 @@
 
 static u64 virtmap_base = EFI_RT_VIRTUAL_BASE;
 
-efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg,
-			     void *__image, void **__fh)
-{
-	efi_file_io_interface_t *io;
-	efi_loaded_image_t *image = __image;
-	efi_file_handle_t *fh;
-	efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
-	efi_status_t status;
-	void *handle = (void *)(unsigned long)image->device_handle;
-
-	status = sys_table_arg->boottime->handle_protocol(handle,
-				 &fs_proto, (void **)&io);
-	if (status != EFI_SUCCESS) {
-		efi_printk(sys_table_arg, "Failed to handle fs_proto\n");
-		return status;
-	}
-
-	status = io->open_volume(io, &fh);
-	if (status != EFI_SUCCESS)
-		efi_printk(sys_table_arg, "Failed to open volume\n");
-
-	*__fh = fh;
-	return status;
-}
-
 void efi_char16_printk(efi_system_table_t *sys_table_arg,
 			      efi_char16_t *str)
 {
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 50a9cab5a834..e94975f4655b 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -413,6 +413,34 @@ static efi_status_t efi_file_close(void *handle)
 	return efi_call_proto(efi_file_handle, close, handle);
 }
 
+static efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg,
+				    efi_loaded_image_t *image,
+				    efi_file_handle_t **__fh)
+{
+	efi_file_io_interface_t *io;
+	efi_file_handle_t *fh;
+	efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
+	efi_status_t status;
+	void *handle = (void *)(unsigned long)efi_table_attr(efi_loaded_image,
+							     device_handle,
+							     image);
+
+	status = efi_call_early(handle_protocol, handle,
+				&fs_proto, (void **)&io);
+	if (status != EFI_SUCCESS) {
+		efi_printk(sys_table_arg, "Failed to handle fs_proto\n");
+		return status;
+	}
+
+	status = efi_call_proto(efi_file_io_interface, open_volume, io, &fh);
+	if (status != EFI_SUCCESS)
+		efi_printk(sys_table_arg, "Failed to open volume\n");
+	else
+		*__fh = fh;
+
+	return status;
+}
+
 /*
  * Parse the ASCII string 'cmdline' for EFI options, denoted by the efi=
  * option, e.g. efi=nochunk.
@@ -563,8 +591,7 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg,
 
 		/* Only open the volume once. */
 		if (!i) {
-			status = efi_open_volume(sys_table_arg, image,
-						 (void **)&fh);
+			status = efi_open_volume(sys_table_arg, image, &fh);
 			if (status != EFI_SUCCESS)
 				goto free_files;
 		}
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index f59564b72ddc..32799cf039ef 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -36,9 +36,6 @@ extern int __pure is_quiet(void);
 
 void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
 
-efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg, void *__image,
-			     void **__fh);
-
 unsigned long get_dram_base(efi_system_table_t *sys_table_arg);
 
 efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
diff --git a/include/linux/efi.h b/include/linux/efi.h
index e190652f5ef9..401e4b254e30 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -894,6 +894,16 @@ typedef struct _efi_file_handle {
 	void *flush;
 } efi_file_handle_t;
 
+typedef struct {
+	u64 revision;
+	u32 open_volume;
+} efi_file_io_interface_32_t;
+
+typedef struct {
+	u64 revision;
+	u64 open_volume;
+} efi_file_io_interface_64_t;
+
 typedef struct _efi_file_io_interface {
 	u64 revision;
 	int (*open_volume)(struct _efi_file_io_interface *,
-- 
cgit v1.2.3


From 40c6f9c28ef03f2f2c3ee58c2447a6e6b9a713f2 Mon Sep 17 00:00:00 2001
From: Revanth Rajashekar <revanth.rajashekar@intel.com>
Date: Fri, 15 Jun 2018 12:39:27 -0600
Subject: nvme.h: resync with nvme-cli

Added some feature ids present in nvme-cli but not kernel.

Signed-off-by: Revanth Rajashekar <revanth.rajashekar@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 2950ce957656..80dfedcf0bf7 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -749,6 +749,11 @@ enum {
 	NVME_FEAT_HOST_MEM_BUF	= 0x0d,
 	NVME_FEAT_TIMESTAMP	= 0x0e,
 	NVME_FEAT_KATO		= 0x0f,
+	NVME_FEAT_HCTM		= 0x10,
+	NVME_FEAT_NOPSC		= 0x11,
+	NVME_FEAT_RRL		= 0x12,
+	NVME_FEAT_PLM_CONFIG	= 0x13,
+	NVME_FEAT_PLM_WINDOW	= 0x14,
 	NVME_FEAT_SW_PROGRESS	= 0x80,
 	NVME_FEAT_HOST_ID	= 0x81,
 	NVME_FEAT_RESV_MASK	= 0x82,
-- 
cgit v1.2.3


From 977d5ad39f3ea12ac0bd51d75020cea5ecdca235 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 17 Jul 2018 17:19:11 +0300
Subject: ACPI: Convert ACPI reference args to generic fwnode reference args

Convert all users of struct acpi_reference_args to more generic
fwnode_reference_args. This will

 1) avoid an ACPI specific references to device nodes with integer
    arguments as well as

 2) allow making references to nodes other than device nodes in ACPI.

As a by-product, convert the fwnode interger arguments to u64. The
arguments were 64-bit integers on ACPI but the fwnode arguments were
just 32-bit.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/property.c                           | 36 ++++++++---------------
 drivers/gpio/gpiolib-acpi.c                       | 11 ++++---
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c        | 10 +++----
 drivers/media/v4l2-core/v4l2-fwnode.c             |  2 +-
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.c    |  6 ++--
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c |  6 ++--
 drivers/net/ethernet/hisilicon/hns/hns_enet.c     |  8 +++--
 include/linux/acpi.h                              | 17 ++++-------
 include/linux/fwnode.h                            |  2 +-
 9 files changed, 43 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 5815356ea6ad..3fa40010fd67 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -579,7 +579,7 @@ static int acpi_data_get_property_array(const struct acpi_device_data *data,
  */
 int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode,
 	const char *propname, size_t index, size_t num_args,
-	struct acpi_reference_args *args)
+	struct fwnode_reference_args *args)
 {
 	const union acpi_object *element, *end;
 	const union acpi_object *obj;
@@ -607,7 +607,7 @@ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode,
 		if (ret)
 			return ret == -ENODEV ? -EINVAL : ret;
 
-		args->adev = device;
+		args->fwnode = acpi_fwnode_handle(device);
 		args->nargs = 0;
 		return 0;
 	}
@@ -653,11 +653,11 @@ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode,
 					return -EINVAL;
 			}
 
-			if (nargs > MAX_ACPI_REFERENCE_ARGS)
+			if (nargs > NR_FWNODE_REFERENCE_ARGS)
 				return -EINVAL;
 
 			if (idx == index) {
-				args->adev = device;
+				args->fwnode = acpi_fwnode_handle(device);
 				args->nargs = nargs;
 				for (i = 0; i < nargs; i++)
 					args->args[i] = element[i].integer.value;
@@ -1089,7 +1089,7 @@ int acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode,
 {
 	struct fwnode_handle *fwnode;
 	unsigned int port_nr, endpoint_nr;
-	struct acpi_reference_args args;
+	struct fwnode_reference_args args;
 	int ret;
 
 	memset(&args, 0, sizeof(args));
@@ -1098,6 +1098,10 @@ int acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode,
 	if (ret)
 		return ret;
 
+	/* Ensure this is a device node. */
+	if (!is_acpi_device_node(args.fwnode))
+		return -ENODEV;
+
 	/*
 	 * Always require two arguments with the reference: port and
 	 * endpoint indices.
@@ -1105,7 +1109,7 @@ int acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode,
 	if (args.nargs != 2)
 		return -EPROTO;
 
-	fwnode = acpi_fwnode_handle(args.adev);
+	fwnode = args.fwnode;
 	port_nr = args.args[0];
 	endpoint_nr = args.args[1];
 
@@ -1209,24 +1213,8 @@ acpi_fwnode_get_reference_args(const struct fwnode_handle *fwnode,
 			       unsigned int args_count, unsigned int index,
 			       struct fwnode_reference_args *args)
 {
-	struct acpi_reference_args acpi_args;
-	unsigned int i;
-	int ret;
-
-	ret = __acpi_node_get_property_reference(fwnode, prop, index,
-						 args_count, &acpi_args);
-	if (ret < 0)
-		return ret;
-	if (!args)
-		return 0;
-
-	args->nargs = acpi_args.nargs;
-	args->fwnode = acpi_fwnode_handle(acpi_args.adev);
-
-	for (i = 0; i < NR_FWNODE_REFERENCE_ARGS; i++)
-		args->args[i] = i < acpi_args.nargs ? acpi_args.args[i] : 0;
-
-	return 0;
+	return __acpi_node_get_property_reference(fwnode, prop, index,
+						  args_count, args);
 }
 
 static struct fwnode_handle *
diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index e2232cbcec8b..4e8fdae1cde4 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -353,7 +353,7 @@ EXPORT_SYMBOL_GPL(devm_acpi_dev_remove_driver_gpios);
 
 static bool acpi_get_driver_gpio_data(struct acpi_device *adev,
 				      const char *name, int index,
-				      struct acpi_reference_args *args,
+				      struct fwnode_reference_args *args,
 				      unsigned int *quirks)
 {
 	const struct acpi_gpio_mapping *gm;
@@ -365,7 +365,7 @@ static bool acpi_get_driver_gpio_data(struct acpi_device *adev,
 		if (!strcmp(name, gm->name) && gm->data && index < gm->size) {
 			const struct acpi_gpio_params *par = gm->data + index;
 
-			args->adev = adev;
+			args->fwnode = acpi_fwnode_handle(adev);
 			args->args[0] = par->crs_entry_index;
 			args->args[1] = par->line_index;
 			args->args[2] = par->active_low;
@@ -528,7 +528,7 @@ static int acpi_gpio_property_lookup(struct fwnode_handle *fwnode,
 				     const char *propname, int index,
 				     struct acpi_gpio_lookup *lookup)
 {
-	struct acpi_reference_args args;
+	struct fwnode_reference_args args;
 	unsigned int quirks = 0;
 	int ret;
 
@@ -549,6 +549,8 @@ static int acpi_gpio_property_lookup(struct fwnode_handle *fwnode,
 	 * The property was found and resolved, so need to lookup the GPIO based
 	 * on returned args.
 	 */
+	if (!to_acpi_device_node(args.fwnode))
+		return -EINVAL;
 	if (args.nargs != 3)
 		return -EPROTO;
 
@@ -556,8 +558,9 @@ static int acpi_gpio_property_lookup(struct fwnode_handle *fwnode,
 	lookup->pin_index = args.args[1];
 	lookup->active_low = !!args.args[2];
 
-	lookup->info.adev = args.adev;
+	lookup->info.adev = to_acpi_device_node(args.fwnode);
 	lookup->info.quirks = quirks;
+
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 8013d69c5ac4..8444234ed092 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -1435,7 +1435,7 @@ static int hns_roce_v1_reset(struct hns_roce_dev *hr_dev, bool dereset)
 		}
 		fwnode = &dsaf_node->fwnode;
 	} else if (is_acpi_device_node(dev->fwnode)) {
-		struct acpi_reference_args args;
+		struct fwnode_reference_args args;
 
 		ret = acpi_node_get_property_reference(dev->fwnode,
 						       "dsaf-handle", 0, &args);
@@ -1443,7 +1443,7 @@ static int hns_roce_v1_reset(struct hns_roce_dev *hr_dev, bool dereset)
 			dev_err(dev, "could not find dsaf-handle\n");
 			return ret;
 		}
-		fwnode = acpi_fwnode_handle(args.adev);
+		fwnode = args.fwnode;
 	} else {
 		dev_err(dev, "cannot read data from DT or ACPI\n");
 		return -ENXIO;
@@ -4835,16 +4835,14 @@ static int hns_roce_get_cfg(struct hns_roce_dev *hr_dev)
 				continue;
 			pdev = of_find_device_by_node(net_node);
 		} else if (is_acpi_device_node(dev->fwnode)) {
-			struct acpi_reference_args args;
-			struct fwnode_handle *fwnode;
+			struct fwnode_reference_args args;
 
 			ret = acpi_node_get_property_reference(dev->fwnode,
 							       "eth-handle",
 							       i, &args);
 			if (ret)
 				continue;
-			fwnode = acpi_fwnode_handle(args.adev);
-			pdev = hns_roce_find_pdev(fwnode);
+			pdev = hns_roce_find_pdev(args.fwnode);
 		} else {
 			dev_err(dev, "cannot read data from DT or ACPI\n");
 			return -ENXIO;
diff --git a/drivers/media/v4l2-core/v4l2-fwnode.c b/drivers/media/v4l2-core/v4l2-fwnode.c
index 3f77aa318035..82595cebc0b8 100644
--- a/drivers/media/v4l2-core/v4l2-fwnode.c
+++ b/drivers/media/v4l2-core/v4l2-fwnode.c
@@ -739,7 +739,7 @@ static struct fwnode_handle *v4l2_fwnode_reference_get_int_prop(
 	const char * const *props, unsigned int nprops)
 {
 	struct fwnode_reference_args fwnode_args;
-	unsigned int *args = fwnode_args.args;
+	u64 *args = fwnode_args.args;
 	struct fwnode_handle *child;
 	int ret;
 
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
index 3188f553da35..078a04dc1182 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
@@ -836,19 +836,19 @@ static void xgene_enet_adjust_link(struct net_device *ndev)
 #ifdef CONFIG_ACPI
 static struct acpi_device *acpi_phy_find_device(struct device *dev)
 {
-	struct acpi_reference_args args;
+	struct fwnode_reference_args args;
 	struct fwnode_handle *fw_node;
 	int status;
 
 	fw_node = acpi_fwnode_handle(ACPI_COMPANION(dev));
 	status = acpi_node_get_property_reference(fw_node, "phy-handle", 0,
 						  &args);
-	if (ACPI_FAILURE(status)) {
+	if (ACPI_FAILURE(status) || !is_acpi_device_node(args.fwnode)) {
 		dev_dbg(dev, "No matching phy in ACPI table\n");
 		return NULL;
 	}
 
-	return args.adev;
+	return to_acpi_device_node(args.fwnode);
 }
 #endif
 
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
index 9dcc5765f11f..794516718d9d 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
@@ -708,7 +708,7 @@ hns_mac_register_phydev(struct mii_bus *mdio, struct hns_mac_cb *mac_cb,
 
 static int hns_mac_register_phy(struct hns_mac_cb *mac_cb)
 {
-	struct acpi_reference_args args;
+	struct fwnode_reference_args args;
 	struct platform_device *pdev;
 	struct mii_bus *mii_bus;
 	int rc;
@@ -722,13 +722,15 @@ static int hns_mac_register_phy(struct hns_mac_cb *mac_cb)
 			mac_cb->fw_port, "mdio-node", 0, &args);
 	if (rc)
 		return rc;
+	if (!is_acpi_device_node(args.fwnode))
+		return -EINVAL;
 
 	addr = hns_mac_phy_parse_addr(mac_cb->dev, mac_cb->fw_port);
 	if (addr < 0)
 		return addr;
 
 	/* dev address in adev */
-	pdev = hns_dsaf_find_platform_device(acpi_fwnode_handle(args.adev));
+	pdev = hns_dsaf_find_platform_device(args.fwnode);
 	if (!pdev) {
 		dev_err(mac_cb->dev, "mac%d mdio pdev is NULL\n",
 			mac_cb->mac_id);
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index ef9ef703d13a..5608f807d7ba 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -2377,7 +2377,7 @@ static int hns_nic_dev_probe(struct platform_device *pdev)
 		}
 		priv->fwnode = &ae_node->fwnode;
 	} else if (is_acpi_node(dev->fwnode)) {
-		struct acpi_reference_args args;
+		struct fwnode_reference_args args;
 
 		if (acpi_dev_found(hns_enet_acpi_match[0].id))
 			priv->enet_ver = AE_VERSION_1;
@@ -2393,7 +2393,11 @@ static int hns_nic_dev_probe(struct platform_device *pdev)
 			dev_err(dev, "not find ae-handle\n");
 			goto out_read_prop_fail;
 		}
-		priv->fwnode = acpi_fwnode_handle(args.adev);
+		if (!is_acpi_device_node(args.fwnode)) {
+			ret = -EINVAL;
+			goto out_read_prop_fail;
+		}
+		priv->fwnode = args.fwnode;
 	} else {
 		dev_err(dev, "cannot read cfg data from OF or acpi\n");
 		return -ENXIO;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e54f40974eb0..11cf39719fd8 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1058,27 +1058,20 @@ static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
 
 /* Device properties */
 
-#define MAX_ACPI_REFERENCE_ARGS	8
-struct acpi_reference_args {
-	struct acpi_device *adev;
-	size_t nargs;
-	u64 args[MAX_ACPI_REFERENCE_ARGS];
-};
-
 #ifdef CONFIG_ACPI
 int acpi_dev_get_property(const struct acpi_device *adev, const char *name,
 			  acpi_object_type type, const union acpi_object **obj);
 int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode,
 				const char *name, size_t index, size_t num_args,
-				struct acpi_reference_args *args);
+				struct fwnode_reference_args *args);
 
 static inline int acpi_node_get_property_reference(
 				const struct fwnode_handle *fwnode,
 				const char *name, size_t index,
-				struct acpi_reference_args *args)
+				struct fwnode_reference_args *args)
 {
 	return __acpi_node_get_property_reference(fwnode, name, index,
-		MAX_ACPI_REFERENCE_ARGS, args);
+		NR_FWNODE_REFERENCE_ARGS, args);
 }
 
 int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname,
@@ -1169,7 +1162,7 @@ static inline int acpi_dev_get_property(struct acpi_device *adev,
 static inline int
 __acpi_node_get_property_reference(const struct fwnode_handle *fwnode,
 				const char *name, size_t index, size_t num_args,
-				struct acpi_reference_args *args)
+				struct fwnode_reference_args *args)
 {
 	return -ENXIO;
 }
@@ -1177,7 +1170,7 @@ __acpi_node_get_property_reference(const struct fwnode_handle *fwnode,
 static inline int
 acpi_node_get_property_reference(const struct fwnode_handle *fwnode,
 				 const char *name, size_t index,
-				 struct acpi_reference_args *args)
+				 struct fwnode_reference_args *args)
 {
 	return -ENXIO;
 }
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index 4fe8f289b3f6..faebf0ca0686 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -45,7 +45,7 @@ struct fwnode_endpoint {
 struct fwnode_reference_args {
 	struct fwnode_handle *fwnode;
 	unsigned int nargs;
-	unsigned int args[NR_FWNODE_REFERENCE_ARGS];
+	u64 args[NR_FWNODE_REFERENCE_ARGS];
 };
 
 /**
-- 
cgit v1.2.3


From 0ef7478639c5165a672f01b4024aacfffa951813 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 17 Jul 2018 17:19:14 +0300
Subject: ACPI: property: Make the ACPI graph API private

The fwnode graph API is preferred over the ACPI graph API. Therefore
make the ACPI graph API private, and use it as a back-end for the
fwnode graph API only.

Unused functionality is removed while the functionality actually used
remains the same.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/property.c | 83 ++++++++++---------------------------------------
 include/linux/acpi.h    |  8 -----
 2 files changed, 16 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 5878d3678b38..19bdada64435 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -1033,10 +1033,10 @@ struct fwnode_handle *acpi_node_get_parent(const struct fwnode_handle *fwnode)
  * @prev: Previous endpoint node or %NULL to get the first
  *
  * Looks up next endpoint ACPI firmware node below a given @fwnode. Returns
- * %NULL if there is no next endpoint, ERR_PTR() in case of error. In case
- * of success the next endpoint is returned.
+ * %NULL if there is no next endpoint or in case of error. In case of success
+ * the next endpoint is returned.
  */
-struct fwnode_handle *acpi_graph_get_next_endpoint(
+static struct fwnode_handle *acpi_graph_get_next_endpoint(
 	const struct fwnode_handle *fwnode, struct fwnode_handle *prev)
 {
 	struct fwnode_handle *port = NULL;
@@ -1065,11 +1065,9 @@ struct fwnode_handle *acpi_graph_get_next_endpoint(
 			endpoint = fwnode_get_next_child_node(port, NULL);
 	}
 
-	if (endpoint) {
-		/* Endpoints must have "endpoint" property */
-		if (!fwnode_property_present(endpoint, "endpoint"))
-			return ERR_PTR(-EPROTO);
-	}
+	/* Endpoints must have "endpoint" property */
+	if (!fwnode_property_present(endpoint, "endpoint"))
+		return NULL;
 
 	return endpoint;
 }
@@ -1106,18 +1104,12 @@ static struct fwnode_handle *acpi_graph_get_child_prop_value(
 /**
  * acpi_graph_get_remote_enpoint - Parses and returns remote end of an endpoint
  * @fwnode: Endpoint firmware node pointing to a remote device
- * @parent: Firmware node of remote port parent is filled here if not %NULL
- * @port: Firmware node of remote port is filled here if not %NULL
  * @endpoint: Firmware node of remote endpoint is filled here if not %NULL
  *
- * Function parses remote end of ACPI firmware remote endpoint and fills in
- * fields requested by the caller. Returns %0 in case of success and
- * negative errno otherwise.
+ * Returns the remote endpoint corresponding to @__fwnode. NULL on error.
  */
-int acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode,
-				   struct fwnode_handle **parent,
-				   struct fwnode_handle **port,
-				   struct fwnode_handle **endpoint)
+static struct fwnode_handle *
+acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode)
 {
 	struct fwnode_handle *fwnode;
 	unsigned int port_nr, endpoint_nr;
@@ -1128,47 +1120,27 @@ int acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode,
 	ret = acpi_node_get_property_reference(__fwnode, "remote-endpoint", 0,
 					       &args);
 	if (ret)
-		return ret;
+		return NULL;
 
 	/* Ensure this is a device node. */
 	if (!is_acpi_device_node(args.fwnode))
-		return -ENODEV;
+		return NULL;
 
 	/*
 	 * Always require two arguments with the reference: port and
 	 * endpoint indices.
 	 */
 	if (args.nargs != 2)
-		return -EPROTO;
+		return NULL;
 
 	fwnode = args.fwnode;
 	port_nr = args.args[0];
 	endpoint_nr = args.args[1];
 
-	if (parent)
-		*parent = fwnode;
-
-	if (!port && !endpoint)
-		return 0;
-
 	fwnode = acpi_graph_get_child_prop_value(fwnode, "port", port_nr);
-	if (!fwnode)
-		return -EPROTO;
-
-	if (port)
-		*port = fwnode;
-
-	if (!endpoint)
-		return 0;
-
-	fwnode = acpi_graph_get_child_prop_value(fwnode, "endpoint",
-						 endpoint_nr);
-	if (!fwnode)
-		return -EPROTO;
 
-	*endpoint = fwnode;
-
-	return 0;
+	return acpi_graph_get_child_prop_value(fwnode, "endpoint",
+					       endpoint_nr);
 }
 
 static bool acpi_fwnode_device_is_available(const struct fwnode_handle *fwnode)
@@ -1232,29 +1204,6 @@ acpi_fwnode_get_reference_args(const struct fwnode_handle *fwnode,
 						  args_count, args);
 }
 
-static struct fwnode_handle *
-acpi_fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode,
-				    struct fwnode_handle *prev)
-{
-	struct fwnode_handle *endpoint;
-
-	endpoint = acpi_graph_get_next_endpoint(fwnode, prev);
-	if (IS_ERR(endpoint))
-		return NULL;
-
-	return endpoint;
-}
-
-static struct fwnode_handle *
-acpi_fwnode_graph_get_remote_endpoint(const struct fwnode_handle *fwnode)
-{
-	struct fwnode_handle *endpoint = NULL;
-
-	acpi_graph_get_remote_endpoint(fwnode, NULL, NULL, &endpoint);
-
-	return endpoint;
-}
-
 static struct fwnode_handle *
 acpi_fwnode_get_parent(struct fwnode_handle *fwnode)
 {
@@ -1295,9 +1244,9 @@ acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode,
 		.get_named_child_node = acpi_fwnode_get_named_child_node, \
 		.get_reference_args = acpi_fwnode_get_reference_args,	\
 		.graph_get_next_endpoint =				\
-			acpi_fwnode_graph_get_next_endpoint,		\
+			acpi_graph_get_next_endpoint,			\
 		.graph_get_remote_endpoint =				\
-			acpi_fwnode_graph_get_remote_endpoint,		\
+			acpi_graph_get_remote_endpoint,			\
 		.graph_get_port_parent = acpi_fwnode_get_parent,	\
 		.graph_parse_endpoint = acpi_fwnode_graph_parse_endpoint, \
 	};								\
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 11cf39719fd8..de8d3d3fa651 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1089,14 +1089,6 @@ struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode,
 					    struct fwnode_handle *child);
 struct fwnode_handle *acpi_node_get_parent(const struct fwnode_handle *fwnode);
 
-struct fwnode_handle *
-acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode,
-			     struct fwnode_handle *prev);
-int acpi_graph_get_remote_endpoint(const struct fwnode_handle *fwnode,
-				   struct fwnode_handle **remote,
-				   struct fwnode_handle **port,
-				   struct fwnode_handle **endpoint);
-
 struct acpi_probe_entry;
 typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *,
 						 struct acpi_probe_entry *);
-- 
cgit v1.2.3


From 0e3fd810c4f41dbd63fb7caddc11684959176727 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 10 Jul 2018 16:46:41 +0200
Subject: Documentation: document ktime_get_*() APIs

As Dave Chinner points out, we don't have a proper documentation for the
ktime_get() family of interfaces, making it rather unclear which of the
over 30 (!) interfaces one should actually use in a driver or elsewhere
in the kernel.

I wrote up an explanation from how I personally see the interfaces,
documenting what each of the functions do and hopefully making it a bit
clearer which should be used where.

This is the first time I tried writing .rst format documentation, so
in addition to any mistakes in the content, I probably also introduce
nonstandard formatting ;-)

I first tried to add an extra section to
Documentation/timers/timekeeping.txt, but this is currently not included
in the generated API, and it seems useful to have the API docs as part
of what gets generated in
https://www.kernel.org/doc/html/latest/core-api/index.html#core-utilities
instead, so I started a new file there.

I also considered adding the documentation inline in the
include/linux/timekeeping.h header, but couldn't figure out how to do
that in a way that would result both in helpful inline comments as
well as readable html output, so I settled for the latter, with
a small note pointing to it from the header.

Cc: Dave Chinner <david@fromorbit.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephen Boyd <sboyd@kernel.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/core-api/index.rst       |   1 +
 Documentation/core-api/timekeeping.rst | 185 +++++++++++++++++++++++++++++++++
 include/linux/timekeeping.h            |  15 +++
 3 files changed, 201 insertions(+)
 create mode 100644 Documentation/core-api/timekeeping.rst

(limited to 'include/linux')

diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index f5a66b72f984..989c97cc232a 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -28,6 +28,7 @@ Core utilities
    printk-formats
    circular-buffers
    gfp_mask-from-fs-io
+   timekeeping
 
 Interfaces for kernel debugging
 ===============================
diff --git a/Documentation/core-api/timekeeping.rst b/Documentation/core-api/timekeeping.rst
new file mode 100644
index 000000000000..93cbeb9daec0
--- /dev/null
+++ b/Documentation/core-api/timekeeping.rst
@@ -0,0 +1,185 @@
+ktime accessors
+===============
+
+Device drivers can read the current time using ktime_get() and the many
+related functions declared in linux/timekeeping.h. As a rule of thumb,
+using an accessor with a shorter name is preferred over one with a longer
+name if both are equally fit for a particular use case.
+
+Basic ktime_t based interfaces
+------------------------------
+
+The recommended simplest form returns an opaque ktime_t, with variants
+that return time for different clock references:
+
+
+.. c:function:: ktime_t ktime_get( void )
+
+	CLOCK_MONOTONIC
+
+	Useful for reliable timestamps and measuring short time intervals
+	accurately. Starts at system boot time but stops during suspend.
+
+.. c:function:: ktime_t ktime_get_boottime( void )
+
+	CLOCK_BOOTTIME
+
+	Like ktime_get(), but does not stop when suspended. This can be
+	used e.g. for key expiration times that need to be synchronized
+	with other machines across a suspend operation.
+
+.. c:function:: ktime_t ktime_get_real( void )
+
+	CLOCK_REALTIME
+
+	Returns the time in relative to the UNIX epoch starting in 1970
+	using the Coordinated Universal Time (UTC), same as gettimeofday()
+	user space. This is used for all timestamps that need to
+	persist across a reboot, like inode times, but should be avoided
+	for internal uses, since it can jump backwards due to a leap
+	second update, NTP adjustment settimeofday() operation from user
+	space.
+
+.. c:function:: ktime_t ktime_get_clocktai( void )
+
+	 CLOCK_TAI
+
+	Like ktime_get_real(), but uses the International Atomic Time (TAI)
+	reference instead of UTC to avoid jumping on leap second updates.
+	This is rarely useful in the kernel.
+
+.. c:function:: ktime_t ktime_get_raw( void )
+
+	CLOCK_MONOTONIC_RAW
+
+	Like ktime_get(), but runs at the same rate as the hardware
+	clocksource without (NTP) adjustments for clock drift. This is
+	also rarely needed in the kernel.
+
+nanosecond, timespec64, and second output
+-----------------------------------------
+
+For all of the above, there are variants that return the time in a
+different format depending on what is required by the user:
+
+.. c:function:: u64 ktime_get_ns( void )
+		u64 ktime_get_boottime_ns( void )
+		u64 ktime_get_real_ns( void )
+		u64 ktime_get_tai_ns( void )
+		u64 ktime_get_raw_ns( void )
+
+	Same as the plain ktime_get functions, but returning a u64 number
+	of nanoseconds in the respective time reference, which may be
+	more convenient for some callers.
+
+.. c:function:: void ktime_get_ts64( struct timespec64 * )
+		void ktime_get_boottime_ts64( struct timespec64 * )
+		void ktime_get_real_ts64( struct timespec64 * )
+		void ktime_get_clocktai_ts64( struct timespec64 * )
+		void ktime_get_raw_ts64( struct timespec64 * )
+
+	Same above, but returns the time in a 'struct timespec64', split
+	into seconds and nanoseconds. This can avoid an extra division
+	when printing the time, or when passing it into an external
+	interface that expects a 'timespec' or 'timeval' structure.
+
+.. c:function:: time64_t ktime_get_seconds( void )
+		time64_t ktime_get_boottime_seconds( void )
+		time64_t ktime_get_real_seconds( void )
+		time64_t ktime_get_clocktai_seconds( void )
+		time64_t ktime_get_raw_seconds( void )
+
+	Return a coarse-grained version of the time as a scalar
+	time64_t. This avoids accessing the clock hardware and rounds
+	down the seconds to the full seconds of the last timer tick
+	using the respective reference.
+
+Coarse and fast_ns access
+-------------------------
+
+Some additional variants exist for more specialized cases:
+
+.. c:function:: ktime_t ktime_get_coarse_boottime( void )
+		ktime_t ktime_get_coarse_real( void )
+		ktime_t ktime_get_coarse_clocktai( void )
+		ktime_t ktime_get_coarse_raw( void )
+
+.. c:function:: void ktime_get_coarse_ts64( struct timespec64 * )
+		void ktime_get_coarse_boottime_ts64( struct timespec64 * )
+		void ktime_get_coarse_real_ts64( struct timespec64 * )
+		void ktime_get_coarse_clocktai_ts64( struct timespec64 * )
+		void ktime_get_coarse_raw_ts64( struct timespec64 * )
+
+	These are quicker than the non-coarse versions, but less accurate,
+	corresponding to CLOCK_MONONOTNIC_COARSE and CLOCK_REALTIME_COARSE
+	in user space, along with the equivalent boottime/tai/raw
+	timebase not available in user space.
+
+	The time returned here corresponds to the last timer tick, which
+	may be as much as 10ms in the past (for CONFIG_HZ=100), same as
+	reading the 'jiffies' variable.  These are only useful when called
+	in a fast path and one still expects better than second accuracy,
+	but can't easily use 'jiffies', e.g. for inode timestamps.
+	Skipping the hardware clock access saves around 100 CPU cycles
+	on most modern machines with a reliable cycle counter, but
+	up to several microseconds on older hardware with an external
+	clocksource.
+
+.. c:function:: u64 ktime_get_mono_fast_ns( void )
+		u64 ktime_get_raw_fast_ns( void )
+		u64 ktime_get_boot_fast_ns( void )
+		u64 ktime_get_real_fast_ns( void )
+
+	These variants are safe to call from any context, including from
+	a non-maskable interrupt (NMI) during a timekeeper update, and
+	while we are entering suspend with the clocksource powered down.
+	This is useful in some tracing or debugging code as well as
+	machine check reporting, but most drivers should never call them,
+	since the time is allowed to jump under certain conditions.
+
+Deprecated time interfaces
+--------------------------
+
+Older kernels used some other interfaces that are now being phased out
+but may appear in third-party drivers being ported here. In particular,
+all interfaces returning a 'struct timeval' or 'struct timespec' have
+been replaced because the tv_sec member overflows in year 2038 on 32-bit
+architectures. These are the recommended replacements:
+
+.. c:function:: void ktime_get_ts( struct timespec * )
+
+	Use ktime_get() or ktime_get_ts64() instead.
+
+.. c:function:: struct timeval do_gettimeofday( void )
+		struct timespec getnstimeofday( void )
+		struct timespec64 getnstimeofday64( void )
+		void ktime_get_real_ts( struct timespec * )
+
+	ktime_get_real_ts64() is a direct replacement, but consider using
+	monotonic time (ktime_get_ts64()) and/or a ktime_t based interface
+	(ktime_get()/ktime_get_real()).
+
+.. c:function:: struct timespec current_kernel_time( void )
+		struct timespec64 current_kernel_time64( void )
+		struct timespec get_monotonic_coarse( void )
+		struct timespec64 get_monotonic_coarse64( void )
+
+	These are replaced by ktime_get_coarse_real_ts64() and
+	ktime_get_coarse_ts64(). However, A lot of code that wants
+	coarse-grained times can use the simple 'jiffies' instead, while
+	some drivers may actually want the higher resolution accessors
+	these days.
+
+.. c:function:: struct timespec getrawmonotonic( void )
+		struct timespec64 getrawmonotonic64( void )
+		struct timespec timekeeping_clocktai( void )
+		struct timespec64 timekeeping_clocktai64( void )
+		struct timespec get_monotonic_boottime( void )
+		struct timespec64 get_monotonic_boottime64( void )
+
+	These are replaced by ktime_get_raw()/ktime_get_raw_ts64(),
+	ktime_get_clocktai()/ktime_get_clocktai_ts64() as well
+	as ktime_get_boottime()/ktime_get_boottime_ts64().
+	However, if the particular choice of clock source is not
+	important for the user, consider converting to
+	ktime_get()/ktime_get_ts64() instead for consistency.
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 86bc2026efce..947b1b8d2d01 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -20,6 +20,21 @@ extern int do_settimeofday64(const struct timespec64 *ts);
 extern int do_sys_settimeofday64(const struct timespec64 *tv,
 				 const struct timezone *tz);
 
+/*
+ * ktime_get() family: read the current time in a multitude of ways,
+ *
+ * The default time reference is CLOCK_MONOTONIC, starting at
+ * boot time but not counting the time spent in suspend.
+ * For other references, use the functions with "real", "clocktai",
+ * "boottime" and "raw" suffixes.
+ *
+ * To get the time in a different format, use the ones wit
+ * "ns", "ts64" and "seconds" suffix.
+ *
+ * See Documentation/core-api/timekeeping.rst for more details.
+ */
+
+
 /*
  * timespec64 based interfaces
  */
-- 
cgit v1.2.3


From f53aaa31cce7b543e407da7e97690a700206f7b9 Mon Sep 17 00:00:00 2001
From: Feras Daoud <ferasda@mellanox.com>
Date: Mon, 16 Jul 2018 15:22:01 -0700
Subject: net/mlx5: FW tracer, implement tracer logic

Implement FW tracer logic and registers access, initialization and
cleanup flows.

Initializing the tracer will be part of load one flow, as multiple
PFs will try to acquire ownership but only one will succeed and will
be the tracer owner.

Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../ethernet/mellanox/mlx5/core/diag/fw_tracer.c   | 196 +++++++++++++++++++++
 .../ethernet/mellanox/mlx5/core/diag/fw_tracer.h   |  66 +++++++
 include/linux/mlx5/driver.h                        |   3 +
 3 files changed, 265 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
new file mode 100644
index 000000000000..3ecbf06b4d71
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "fw_tracer.h"
+
+static int mlx5_query_mtrc_caps(struct mlx5_fw_tracer *tracer)
+{
+	u32 *string_db_base_address_out = tracer->str_db.base_address_out;
+	u32 *string_db_size_out = tracer->str_db.size_out;
+	struct mlx5_core_dev *dev = tracer->dev;
+	u32 out[MLX5_ST_SZ_DW(mtrc_cap)] = {0};
+	u32 in[MLX5_ST_SZ_DW(mtrc_cap)] = {0};
+	void *mtrc_cap_sp;
+	int err, i;
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MTRC_CAP, 0, 0);
+	if (err) {
+		mlx5_core_warn(dev, "FWTracer: Error reading tracer caps %d\n",
+			       err);
+		return err;
+	}
+
+	if (!MLX5_GET(mtrc_cap, out, trace_to_memory)) {
+		mlx5_core_dbg(dev, "FWTracer: Device does not support logging traces to memory\n");
+		return -ENOTSUPP;
+	}
+
+	tracer->trc_ver = MLX5_GET(mtrc_cap, out, trc_ver);
+	tracer->str_db.first_string_trace =
+			MLX5_GET(mtrc_cap, out, first_string_trace);
+	tracer->str_db.num_string_trace =
+			MLX5_GET(mtrc_cap, out, num_string_trace);
+	tracer->str_db.num_string_db = MLX5_GET(mtrc_cap, out, num_string_db);
+	tracer->owner = !!MLX5_GET(mtrc_cap, out, trace_owner);
+
+	for (i = 0; i < tracer->str_db.num_string_db; i++) {
+		mtrc_cap_sp = MLX5_ADDR_OF(mtrc_cap, out, string_db_param[i]);
+		string_db_base_address_out[i] = MLX5_GET(mtrc_string_db_param,
+							 mtrc_cap_sp,
+							 string_db_base_address);
+		string_db_size_out[i] = MLX5_GET(mtrc_string_db_param,
+						 mtrc_cap_sp, string_db_size);
+	}
+
+	return err;
+}
+
+static int mlx5_set_mtrc_caps_trace_owner(struct mlx5_fw_tracer *tracer,
+					  u32 *out, u32 out_size,
+					  u8 trace_owner)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	u32 in[MLX5_ST_SZ_DW(mtrc_cap)] = {0};
+
+	MLX5_SET(mtrc_cap, in, trace_owner, trace_owner);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out, out_size,
+				    MLX5_REG_MTRC_CAP, 0, 1);
+}
+
+static int mlx5_fw_tracer_ownership_acquire(struct mlx5_fw_tracer *tracer)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	u32 out[MLX5_ST_SZ_DW(mtrc_cap)] = {0};
+	int err;
+
+	err = mlx5_set_mtrc_caps_trace_owner(tracer, out, sizeof(out),
+					     MLX5_FW_TRACER_ACQUIRE_OWNERSHIP);
+	if (err) {
+		mlx5_core_warn(dev, "FWTracer: Acquire tracer ownership failed %d\n",
+			       err);
+		return err;
+	}
+
+	tracer->owner = !!MLX5_GET(mtrc_cap, out, trace_owner);
+
+	if (!tracer->owner)
+		return -EBUSY;
+
+	return 0;
+}
+
+static void mlx5_fw_tracer_ownership_release(struct mlx5_fw_tracer *tracer)
+{
+	u32 out[MLX5_ST_SZ_DW(mtrc_cap)] = {0};
+
+	mlx5_set_mtrc_caps_trace_owner(tracer, out, sizeof(out),
+				       MLX5_FW_TRACER_RELEASE_OWNERSHIP);
+	tracer->owner = false;
+}
+
+static void mlx5_fw_tracer_ownership_change(struct work_struct *work)
+{
+	struct mlx5_fw_tracer *tracer = container_of(work, struct mlx5_fw_tracer,
+						     ownership_change_work);
+	struct mlx5_core_dev *dev = tracer->dev;
+	int err;
+
+	if (tracer->owner) {
+		mlx5_fw_tracer_ownership_release(tracer);
+		return;
+	}
+
+	err = mlx5_fw_tracer_ownership_acquire(tracer);
+	if (err) {
+		mlx5_core_dbg(dev, "FWTracer: Ownership was not granted %d\n", err);
+		return;
+	}
+}
+
+struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev)
+{
+	struct mlx5_fw_tracer *tracer = NULL;
+	int err;
+
+	if (!MLX5_CAP_MCAM_REG(dev, tracer_registers)) {
+		mlx5_core_dbg(dev, "FWTracer: Tracer capability not present\n");
+		return NULL;
+	}
+
+	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+	if (!tracer)
+		return ERR_PTR(-ENOMEM);
+
+	tracer->work_queue = create_singlethread_workqueue("mlx5_fw_tracer");
+	if (!tracer->work_queue) {
+		err = -ENOMEM;
+		goto free_tracer;
+	}
+
+	tracer->dev = dev;
+
+	INIT_WORK(&tracer->ownership_change_work, mlx5_fw_tracer_ownership_change);
+
+	err = mlx5_query_mtrc_caps(tracer);
+	if (err) {
+		mlx5_core_dbg(dev, "FWTracer: Failed to query capabilities %d\n", err);
+		goto destroy_workqueue;
+	}
+
+	mlx5_fw_tracer_ownership_change(&tracer->ownership_change_work);
+
+	return tracer;
+
+destroy_workqueue:
+	tracer->dev = NULL;
+	destroy_workqueue(tracer->work_queue);
+free_tracer:
+	kfree(tracer);
+	return ERR_PTR(err);
+}
+
+void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer)
+{
+	if (!tracer)
+		return;
+
+	cancel_work_sync(&tracer->ownership_change_work);
+
+	if (tracer->owner)
+		mlx5_fw_tracer_ownership_release(tracer);
+
+	flush_workqueue(tracer->work_queue);
+	destroy_workqueue(tracer->work_queue);
+	kfree(tracer);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h
new file mode 100644
index 000000000000..721c41a5e827
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __LIB_TRACER_H__
+#define __LIB_TRACER_H__
+
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+
+#define STRINGS_DB_SECTIONS_NUM 8
+
+struct mlx5_fw_tracer {
+	struct mlx5_core_dev *dev;
+	bool owner;
+	u8   trc_ver;
+	struct workqueue_struct *work_queue;
+	struct work_struct ownership_change_work;
+
+	/* Strings DB */
+	struct {
+		u8 first_string_trace;
+		u8 num_string_trace;
+		u32 num_string_db;
+		u32 base_address_out[STRINGS_DB_SECTIONS_NUM];
+		u32 size_out[STRINGS_DB_SECTIONS_NUM];
+	} str_db;
+};
+
+enum mlx5_fw_tracer_ownership_state {
+	MLX5_FW_TRACER_RELEASE_OWNERSHIP,
+	MLX5_FW_TRACER_ACQUIRE_OWNERSHIP,
+};
+
+struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev);
+void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer);
+
+#endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 957199c20a0f..86cb0ebf92fa 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -816,6 +816,8 @@ struct mlx5_clock {
 	struct mlx5_pps            pps_info;
 };
 
+struct mlx5_fw_tracer;
+
 struct mlx5_core_dev {
 	struct pci_dev	       *pdev;
 	/* sync pci state */
@@ -860,6 +862,7 @@ struct mlx5_core_dev {
 	struct mlx5_clock        clock;
 	struct mlx5_ib_clock_info  *clock_info;
 	struct page             *clock_info_page;
+	struct mlx5_fw_tracer   *tracer;
 };
 
 struct mlx5_db {
-- 
cgit v1.2.3


From c71ad41ccb0c29fce95149b74786574b354c9dda Mon Sep 17 00:00:00 2001
From: Feras Daoud <ferasda@mellanox.com>
Date: Wed, 7 Feb 2018 11:08:56 +0200
Subject: net/mlx5: FW tracer, events handling

The tracer has one event, event 0x26, with two subtypes:
- Subtype 0: Ownership change
- Subtype 1: Traces available

An ownership change occurs in the following cases:
1- Owner releases his ownership, in this case, an event will be
sent to inform others to reattempt acquire ownership.
2- Ownership was taken by a higher priority tool, in this case
the owner should understand that it lost ownership, and go through
tear down flow.

The second subtype indicates that there are traces in the trace buffer,
in this case, the driver polls the tracer buffer for new traces, parse
them and prepares the messages for printing.

The HW starts tracing from the first address in the tracer buffer.
Driver receives an event notifying that new trace block exists.
HW posts a timestamp event at the last 8B of every 256B block.
Comparing the timestamp to the last handled timestamp would indicate
that this is a new trace block. Once the new timestamp is detected,
the entire block is considered valid.

Block validation and parsing, should be done after copying the current
block to a different location, in order to avoid block overwritten
during processing.

Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../ethernet/mellanox/mlx5/core/diag/fw_tracer.c   | 268 ++++++++++++++++++++-
 .../ethernet/mellanox/mlx5/core/diag/fw_tracer.h   |  71 +++++-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       |  11 +
 include/linux/mlx5/device.h                        |   7 +
 4 files changed, 347 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
index d6cc27b0ff34..bd887d1d3396 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
@@ -318,25 +318,244 @@ out:
 	return;
 }
 
-static void mlx5_fw_tracer_ownership_change(struct work_struct *work)
+static void mlx5_fw_tracer_arm(struct mlx5_core_dev *dev)
 {
-	struct mlx5_fw_tracer *tracer = container_of(work, struct mlx5_fw_tracer,
-						     ownership_change_work);
-	struct mlx5_core_dev *dev = tracer->dev;
+	u32 out[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0};
+	u32 in[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0};
 	int err;
 
-	if (tracer->owner) {
-		mlx5_fw_tracer_ownership_release(tracer);
+	MLX5_SET(mtrc_ctrl, in, arm_event, 1);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MTRC_CTRL, 0, 1);
+	if (err)
+		mlx5_core_warn(dev, "FWTracer: Failed to arm tracer event %d\n", err);
+}
+
+static void poll_trace(struct mlx5_fw_tracer *tracer,
+		       struct tracer_event *tracer_event, u64 *trace)
+{
+	u32 timestamp_low, timestamp_mid, timestamp_high, urts;
+
+	tracer_event->event_id = MLX5_GET(tracer_event, trace, event_id);
+	tracer_event->lost_event = MLX5_GET(tracer_event, trace, lost);
+
+	switch (tracer_event->event_id) {
+	case TRACER_EVENT_TYPE_TIMESTAMP:
+		tracer_event->type = TRACER_EVENT_TYPE_TIMESTAMP;
+		urts = MLX5_GET(tracer_timestamp_event, trace, urts);
+		if (tracer->trc_ver == 0)
+			tracer_event->timestamp_event.unreliable = !!(urts >> 2);
+		else
+			tracer_event->timestamp_event.unreliable = !!(urts & 1);
+
+		timestamp_low = MLX5_GET(tracer_timestamp_event,
+					 trace, timestamp7_0);
+		timestamp_mid = MLX5_GET(tracer_timestamp_event,
+					 trace, timestamp39_8);
+		timestamp_high = MLX5_GET(tracer_timestamp_event,
+					  trace, timestamp52_40);
+
+		tracer_event->timestamp_event.timestamp =
+				((u64)timestamp_high << 40) |
+				((u64)timestamp_mid << 8) |
+				(u64)timestamp_low;
+		break;
+	default:
+		if (tracer_event->event_id >= tracer->str_db.first_string_trace ||
+		    tracer_event->event_id <= tracer->str_db.first_string_trace +
+					      tracer->str_db.num_string_trace) {
+			tracer_event->type = TRACER_EVENT_TYPE_STRING;
+			tracer_event->string_event.timestamp =
+				MLX5_GET(tracer_string_event, trace, timestamp);
+			tracer_event->string_event.string_param =
+				MLX5_GET(tracer_string_event, trace, string_param);
+			tracer_event->string_event.tmsn =
+				MLX5_GET(tracer_string_event, trace, tmsn);
+			tracer_event->string_event.tdsn =
+				MLX5_GET(tracer_string_event, trace, tdsn);
+		} else {
+			tracer_event->type = TRACER_EVENT_TYPE_UNRECOGNIZED;
+		}
+		break;
+	}
+}
+
+static u64 get_block_timestamp(struct mlx5_fw_tracer *tracer, u64 *ts_event)
+{
+	struct tracer_event tracer_event;
+	u8 event_id;
+
+	event_id = MLX5_GET(tracer_event, ts_event, event_id);
+
+	if (event_id == TRACER_EVENT_TYPE_TIMESTAMP)
+		poll_trace(tracer, &tracer_event, ts_event);
+	else
+		tracer_event.timestamp_event.timestamp = 0;
+
+	return tracer_event.timestamp_event.timestamp;
+}
+
+static void mlx5_fw_tracer_handle_traces(struct work_struct *work)
+{
+	struct mlx5_fw_tracer *tracer =
+			container_of(work, struct mlx5_fw_tracer, handle_traces_work);
+	u64 block_timestamp, last_block_timestamp, tmp_trace_block[TRACES_PER_BLOCK];
+	u32 block_count, start_offset, prev_start_offset, prev_consumer_index;
+	u32 trace_event_size = MLX5_ST_SZ_BYTES(tracer_event);
+	struct tracer_event tracer_event;
+	struct mlx5_core_dev *dev;
+	int i;
+
+	if (!tracer->owner)
 		return;
+
+	dev = tracer->dev;
+	block_count = tracer->buff.size / TRACER_BLOCK_SIZE_BYTE;
+	start_offset = tracer->buff.consumer_index * TRACER_BLOCK_SIZE_BYTE;
+
+	/* Copy the block to local buffer to avoid HW override while being processed*/
+	memcpy(tmp_trace_block, tracer->buff.log_buf + start_offset,
+	       TRACER_BLOCK_SIZE_BYTE);
+
+	block_timestamp =
+		get_block_timestamp(tracer, &tmp_trace_block[TRACES_PER_BLOCK - 1]);
+
+	while (block_timestamp > tracer->last_timestamp) {
+		/* Check block override if its not the first block */
+		if (!tracer->last_timestamp) {
+			u64 *ts_event;
+			/* To avoid block override be the HW in case of buffer
+			 * wraparound, the time stamp of the previous block
+			 * should be compared to the last timestamp handled
+			 * by the driver.
+			 */
+			prev_consumer_index =
+				(tracer->buff.consumer_index - 1) & (block_count - 1);
+			prev_start_offset = prev_consumer_index * TRACER_BLOCK_SIZE_BYTE;
+
+			ts_event = tracer->buff.log_buf + prev_start_offset +
+				   (TRACES_PER_BLOCK - 1) * trace_event_size;
+			last_block_timestamp = get_block_timestamp(tracer, ts_event);
+			/* If previous timestamp different from last stored
+			 * timestamp then there is a good chance that the
+			 * current buffer is overwritten and therefore should
+			 * not be parsed.
+			 */
+			if (tracer->last_timestamp != last_block_timestamp) {
+				mlx5_core_warn(dev, "FWTracer: Events were lost\n");
+				tracer->last_timestamp = block_timestamp;
+				tracer->buff.consumer_index =
+					(tracer->buff.consumer_index + 1) & (block_count - 1);
+				break;
+			}
+		}
+
+		/* Parse events */
+		for (i = 0; i < TRACES_PER_BLOCK ; i++)
+			poll_trace(tracer, &tracer_event, &tmp_trace_block[i]);
+
+		tracer->buff.consumer_index =
+			(tracer->buff.consumer_index + 1) & (block_count - 1);
+
+		tracer->last_timestamp = block_timestamp;
+		start_offset = tracer->buff.consumer_index * TRACER_BLOCK_SIZE_BYTE;
+		memcpy(tmp_trace_block, tracer->buff.log_buf + start_offset,
+		       TRACER_BLOCK_SIZE_BYTE);
+		block_timestamp = get_block_timestamp(tracer,
+						      &tmp_trace_block[TRACES_PER_BLOCK - 1]);
 	}
 
+	mlx5_fw_tracer_arm(dev);
+}
+
+static int mlx5_fw_tracer_set_mtrc_conf(struct mlx5_fw_tracer *tracer)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	u32 out[MLX5_ST_SZ_DW(mtrc_conf)] = {0};
+	u32 in[MLX5_ST_SZ_DW(mtrc_conf)] = {0};
+	int err;
+
+	MLX5_SET(mtrc_conf, in, trace_mode, TRACE_TO_MEMORY);
+	MLX5_SET(mtrc_conf, in, log_trace_buffer_size,
+		 ilog2(TRACER_BUFFER_PAGE_NUM));
+	MLX5_SET(mtrc_conf, in, trace_mkey, tracer->buff.mkey.key);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MTRC_CONF, 0, 1);
+	if (err)
+		mlx5_core_warn(dev, "FWTracer: Failed to set tracer configurations %d\n", err);
+
+	return err;
+}
+
+static int mlx5_fw_tracer_set_mtrc_ctrl(struct mlx5_fw_tracer *tracer, u8 status, u8 arm)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	u32 out[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0};
+	u32 in[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0};
+	int err;
+
+	MLX5_SET(mtrc_ctrl, in, modify_field_select, TRACE_STATUS);
+	MLX5_SET(mtrc_ctrl, in, trace_status, status);
+	MLX5_SET(mtrc_ctrl, in, arm_event, arm);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MTRC_CTRL, 0, 1);
+
+	if (!err && status)
+		tracer->last_timestamp = 0;
+
+	return err;
+}
+
+static int mlx5_fw_tracer_start(struct mlx5_fw_tracer *tracer)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	int err;
+
 	err = mlx5_fw_tracer_ownership_acquire(tracer);
 	if (err) {
 		mlx5_core_dbg(dev, "FWTracer: Ownership was not granted %d\n", err);
+		/* Don't fail since ownership can be acquired on a later FW event */
+		return 0;
+	}
+
+	err = mlx5_fw_tracer_set_mtrc_conf(tracer);
+	if (err) {
+		mlx5_core_warn(dev, "FWTracer: Failed to set tracer configuration %d\n", err);
+		goto release_ownership;
+	}
+
+	/* enable tracer & trace events */
+	err = mlx5_fw_tracer_set_mtrc_ctrl(tracer, 1, 1);
+	if (err) {
+		mlx5_core_warn(dev, "FWTracer: Failed to enable tracer %d\n", err);
+		goto release_ownership;
+	}
+
+	return 0;
+
+release_ownership:
+	mlx5_fw_tracer_ownership_release(tracer);
+	return err;
+}
+
+static void mlx5_fw_tracer_ownership_change(struct work_struct *work)
+{
+	struct mlx5_fw_tracer *tracer =
+		container_of(work, struct mlx5_fw_tracer, ownership_change_work);
+
+	if (tracer->owner) {
+		tracer->owner = false;
+		tracer->buff.consumer_index = 0;
 		return;
 	}
+
+	mlx5_fw_tracer_start(tracer);
 }
 
+/* Create software resources (Buffers, etc ..) */
 struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev)
 {
 	struct mlx5_fw_tracer *tracer = NULL;
@@ -361,6 +580,8 @@ struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev)
 
 	INIT_WORK(&tracer->ownership_change_work, mlx5_fw_tracer_ownership_change);
 	INIT_WORK(&tracer->read_fw_strings_work, mlx5_tracer_read_strings_db);
+	INIT_WORK(&tracer->handle_traces_work, mlx5_fw_tracer_handle_traces);
+
 
 	err = mlx5_query_mtrc_caps(tracer);
 	if (err) {
@@ -392,6 +613,9 @@ free_tracer:
 	return ERR_PTR(err);
 }
 
+/* Create HW resources + start tracer
+ * must be called before Async EQ is created
+ */
 int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer)
 {
 	struct mlx5_core_dev *dev;
@@ -417,22 +641,25 @@ int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer)
 		goto err_dealloc_pd;
 	}
 
-	err = mlx5_fw_tracer_ownership_acquire(tracer);
-	if (err) /* Don't fail since ownership can be acquired on a later FW event */
-		mlx5_core_dbg(dev, "FWTracer: Ownership was not granted %d\n", err);
+	mlx5_fw_tracer_start(tracer);
 
 	return 0;
+
 err_dealloc_pd:
 	mlx5_core_dealloc_pd(dev, tracer->buff.pdn);
 	return err;
 }
 
+/* Stop tracer + Cleanup HW resources
+ * must be called after Async EQ is destroyed
+ */
 void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer)
 {
 	if (IS_ERR_OR_NULL(tracer))
 		return;
 
 	cancel_work_sync(&tracer->ownership_change_work);
+	cancel_work_sync(&tracer->handle_traces_work);
 
 	if (tracer->owner)
 		mlx5_fw_tracer_ownership_release(tracer);
@@ -441,6 +668,7 @@ void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer)
 	mlx5_core_dealloc_pd(tracer->dev, tracer->buff.pdn);
 }
 
+/* Free software resources (Buffers, etc ..) */
 void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer)
 {
 	if (IS_ERR_OR_NULL(tracer))
@@ -454,4 +682,26 @@ void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer)
 	kfree(tracer);
 }
 
+void mlx5_fw_tracer_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
+{
+	struct mlx5_fw_tracer *tracer = dev->tracer;
+
+	if (!tracer)
+		return;
+
+	switch (eqe->sub_type) {
+	case MLX5_TRACER_SUBTYPE_OWNERSHIP_CHANGE:
+		if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state))
+			queue_work(tracer->work_queue, &tracer->ownership_change_work);
+		break;
+	case MLX5_TRACER_SUBTYPE_TRACES_AVAILABLE:
+		if (likely(tracer->str_db.loaded))
+			queue_work(tracer->work_queue, &tracer->handle_traces_work);
+		break;
+	default:
+		mlx5_core_dbg(dev, "FWTracer: Event with unrecognized subtype: sub_type %d\n",
+			      eqe->sub_type);
+	}
+}
+
 EXPORT_TRACEPOINT_SYMBOL(mlx5_fw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h
index 66cb7e7ada28..3915e91486b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h
@@ -43,6 +43,9 @@
 #define TRACER_BUFFER_CHUNK 4096
 #define TRACE_BUFFER_SIZE_BYTE (TRACER_BUFFER_PAGE_NUM * TRACER_BUFFER_CHUNK)
 
+#define TRACER_BLOCK_SIZE_BYTE 256
+#define TRACES_PER_BLOCK 32
+
 struct mlx5_fw_tracer {
 	struct mlx5_core_dev *dev;
 	bool owner;
@@ -69,8 +72,11 @@ struct mlx5_fw_tracer {
 		dma_addr_t dma;
 		u32 size;
 		struct mlx5_core_mkey mkey;
-
+		u32 consumer_index;
 	} buff;
+
+	u64 last_timestamp;
+	struct work_struct handle_traces_work;
 };
 
 enum mlx5_fw_tracer_ownership_state {
@@ -78,7 +84,70 @@ enum mlx5_fw_tracer_ownership_state {
 	MLX5_FW_TRACER_ACQUIRE_OWNERSHIP,
 };
 
+enum tracer_ctrl_fields_select {
+	TRACE_STATUS = 1 << 0,
+};
+
+enum tracer_event_type {
+	TRACER_EVENT_TYPE_STRING,
+	TRACER_EVENT_TYPE_TIMESTAMP = 0xFF,
+	TRACER_EVENT_TYPE_UNRECOGNIZED,
+};
+
+enum tracing_mode {
+	TRACE_TO_MEMORY = 1 << 0,
+};
+
+struct tracer_timestamp_event {
+	u64        timestamp;
+	u8         unreliable;
+};
+
+struct tracer_string_event {
+	u32        timestamp;
+	u32        tmsn;
+	u32        tdsn;
+	u32        string_param;
+};
+
+struct tracer_event {
+	bool      lost_event;
+	u32       type;
+	u8        event_id;
+	union {
+		struct tracer_string_event string_event;
+		struct tracer_timestamp_event timestamp_event;
+	};
+};
+
+struct mlx5_ifc_tracer_event_bits {
+	u8         lost[0x1];
+	u8         timestamp[0x7];
+	u8         event_id[0x8];
+	u8         event_data[0x30];
+};
+
+struct mlx5_ifc_tracer_string_event_bits {
+	u8         lost[0x1];
+	u8         timestamp[0x7];
+	u8         event_id[0x8];
+	u8         tmsn[0xd];
+	u8         tdsn[0x3];
+	u8         string_param[0x20];
+};
+
+struct mlx5_ifc_tracer_timestamp_event_bits {
+	u8         timestamp7_0[0x8];
+	u8         event_id[0x8];
+	u8         urts[0x3];
+	u8         timestamp52_40[0xd];
+	u8         timestamp39_8[0x20];
+};
+
 struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev);
+int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer);
+void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer);
 void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer);
+void mlx5_fw_tracer_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe) { return; }
 
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 406c23862f5f..7669b4380779 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -40,6 +40,7 @@
 #include "mlx5_core.h"
 #include "fpga/core.h"
 #include "eswitch.h"
+#include "diag/fw_tracer.h"
 
 enum {
 	MLX5_EQE_SIZE		= sizeof(struct mlx5_eqe),
@@ -168,6 +169,8 @@ static const char *eqe_type_str(u8 type)
 		return "MLX5_EVENT_TYPE_FPGA_QP_ERROR";
 	case MLX5_EVENT_TYPE_GENERAL_EVENT:
 		return "MLX5_EVENT_TYPE_GENERAL_EVENT";
+	case MLX5_EVENT_TYPE_DEVICE_TRACER:
+		return "MLX5_EVENT_TYPE_DEVICE_TRACER";
 	default:
 		return "Unrecognized event";
 	}
@@ -576,6 +579,11 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
 		case MLX5_EVENT_TYPE_GENERAL_EVENT:
 			general_event_handler(dev, eqe);
 			break;
+
+		case MLX5_EVENT_TYPE_DEVICE_TRACER:
+			mlx5_fw_tracer_event(dev, eqe);
+			break;
+
 		default:
 			mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n",
 				       eqe->type, eq->eqn);
@@ -853,6 +861,9 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 	if (MLX5_CAP_GEN(dev, temp_warn_event))
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT);
 
+	if (MLX5_CAP_MCAM_REG(dev, tracer_registers))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_DEVICE_TRACER);
+
 	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
 				 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
 				 "mlx5_cmd_eq", MLX5_EQ_TYPE_ASYNC);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 0566c6a94805..d489494b0a84 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -332,6 +332,13 @@ enum mlx5_event {
 
 	MLX5_EVENT_TYPE_FPGA_ERROR         = 0x20,
 	MLX5_EVENT_TYPE_FPGA_QP_ERROR      = 0x21,
+
+	MLX5_EVENT_TYPE_DEVICE_TRACER      = 0x26,
+};
+
+enum {
+	MLX5_TRACER_SUBTYPE_OWNERSHIP_CHANGE = 0x0,
+	MLX5_TRACER_SUBTYPE_TRACES_AVAILABLE = 0x1,
 };
 
 enum {
-- 
cgit v1.2.3


From 51bbf9bee34ff5d4006d266f24a54dc9c1669eb5 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Thu, 19 Jul 2018 17:27:43 -0500
Subject: PCI: hotplug: Demidlayer registration with the core

When a hotplug driver calls pci_hp_register(), all steps necessary for
registration are carried out in one go, including creation of a kobject
and addition to sysfs.  That's a problem for pciehp once it's converted
to enable/disable the slot exclusively from the IRQ thread:  The thread
needs to be spawned after creation of the kobject (because it uses the
kobject's name), but before addition to sysfs (because it will handle
enable/disable requests submitted via sysfs).

pci_hp_deregister() does offer a ->release callback that's invoked
after deletion from sysfs and before destruction of the kobject.  But
because pci_hp_register() doesn't offer a counterpart, hotplug drivers'
->probe and ->remove code becomes asymmetric, which is error prone
as recently discovered use-after-free bugs in pciehp's ->remove hook
have shown.

In a sense, this appears to be a case of the midlayer antipattern:

   "The core thesis of the "midlayer mistake" is that midlayers are
    bad and should not exist.  That common functionality which it is
    so tempting to put in a midlayer should instead be provided as
    library routines which can [be] used, augmented, or ignored by
    each bottom level driver independently.  Thus every subsystem
    that supports multiple implementations (or drivers) should
    provide a very thin top layer which calls directly into the
    bottom layer drivers, and a rich library of support code that
    eases the implementation of those drivers.  This library is
    available to, but not forced upon, those drivers."
        --  Neil Brown (2009), https://lwn.net/Articles/336262/

The presence of midlayer traits in the PCI hotplug core might be ascribed
to its age:  When it was introduced in February 2002, the blessings of a
library approach might not have been well known:
https://git.kernel.org/tglx/history/c/a8a2069f432c

For comparison, the driver core does offer split functions for creating
a kobject (device_initialize()) and addition to sysfs (device_add()) as
an alternative to carrying out everything at once (device_register()).
This was introduced in October 2002:
https://git.kernel.org/tglx/history/c/8b290eb19962

The odd ->release callback in the PCI hotplug core was added in 2003:
https://git.kernel.org/tglx/history/c/69f8d663b595

Clearly, a library approach would not force every hotplug driver to
implement a ->release callback, but rather allow the driver to remove
the sysfs files, release its data structures and finally destroy the
kobject.  Alternatively, a driver may choose to remove everything with
pci_hp_deregister(), then release its data structures.

To this end, offer drivers pci_hp_initialize() and pci_hp_add() as a
split-up version of pci_hp_register().  Likewise, offer pci_hp_del()
and pci_hp_destroy() as a split-up version of pci_hp_deregister().

Eliminate the ->release callback and move its code into each driver's
teardown routine.

Declare pci_hp_deregister() void, in keeping with the usual kernel
pattern that enablement can fail, but disablement cannot.  It only
returned an error if the caller passed in a NULL pointer or a slot which
has never or is no longer registered or is sharing its name with another
slot.  Those would be bugs, so WARN about them.  Few hotplug drivers
actually checked the return value and those that did only printed a
useless error message to dmesg.  Remove that.

For most drivers the conversion was straightforward since it doesn't
matter whether the code in the ->release callback is executed before or
after destruction of the kobject.  But in the case of ibmphp, it was
unclear to me whether setting slot_cur->ctrl and slot_cur->bus_on to
NULL needs to happen before the kobject is destroyed, so I erred on
the side of caution and ensured that the order stays the same.  Another
nontrivial case is pnv_php, I've found the list and kref logic difficult
to understand, however my impression was that it is safe to delete the
list element and drop the references until after the kobject is
destroyed.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>  # drivers/platform/x86
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Len Brown <lenb@kernel.org>
Cc: Scott Murray <scott@spiteful.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Gavin Shan <gwshan@linux.vnet.ibm.com>
Cc: Sebastian Ott <sebott@linux.vnet.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Corentin Chary <corentin.chary@gmail.com>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Andy Shevchenko <andy@infradead.org>
---
 drivers/pci/hotplug/acpiphp_core.c      |  22 +----
 drivers/pci/hotplug/cpci_hotplug_core.c |  14 +---
 drivers/pci/hotplug/cpqphp_core.c       |  16 +---
 drivers/pci/hotplug/ibmphp_core.c       |  15 +++-
 drivers/pci/hotplug/ibmphp_ebda.c       |  20 -----
 drivers/pci/hotplug/pci_hotplug_core.c  | 139 +++++++++++++++++++++++---------
 drivers/pci/hotplug/pciehp_core.c       |  19 ++---
 drivers/pci/hotplug/pnv_php.c           |   5 +-
 drivers/pci/hotplug/rpaphp_core.c       |   2 +-
 drivers/pci/hotplug/rpaphp_slot.c       |  13 +--
 drivers/pci/hotplug/s390_pci_hpc.c      |  13 +--
 drivers/pci/hotplug/sgi_hotplug.c       |   9 ++-
 drivers/pci/hotplug/shpchp_core.c       |  20 +----
 drivers/platform/x86/asus-wmi.c         |  12 +--
 drivers/platform/x86/eeepc-laptop.c     |  12 +--
 include/linux/pci_hotplug.h             |  15 +++-
 16 files changed, 168 insertions(+), 178 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/hotplug/acpiphp_core.c b/drivers/pci/hotplug/acpiphp_core.c
index 12b5655fd390..ad32ffbc4b91 100644
--- a/drivers/pci/hotplug/acpiphp_core.c
+++ b/drivers/pci/hotplug/acpiphp_core.c
@@ -254,20 +254,6 @@ static int get_adapter_status(struct hotplug_slot *hotplug_slot, u8 *value)
 	return 0;
 }
 
-/**
- * release_slot - free up the memory used by a slot
- * @hotplug_slot: slot to free
- */
-static void release_slot(struct hotplug_slot *hotplug_slot)
-{
-	struct slot *slot = hotplug_slot->private;
-
-	pr_debug("%s - physical_slot = %s\n", __func__, slot_name(slot));
-
-	kfree(slot->hotplug_slot);
-	kfree(slot);
-}
-
 /* callback routine to initialize 'struct slot' for each slot */
 int acpiphp_register_hotplug_slot(struct acpiphp_slot *acpiphp_slot,
 				  unsigned int sun)
@@ -287,7 +273,6 @@ int acpiphp_register_hotplug_slot(struct acpiphp_slot *acpiphp_slot,
 	slot->hotplug_slot->info = &slot->info;
 
 	slot->hotplug_slot->private = slot;
-	slot->hotplug_slot->release = &release_slot;
 	slot->hotplug_slot->ops = &acpi_hotplug_slot_ops;
 
 	slot->acpi_slot = acpiphp_slot;
@@ -324,13 +309,12 @@ error:
 void acpiphp_unregister_hotplug_slot(struct acpiphp_slot *acpiphp_slot)
 {
 	struct slot *slot = acpiphp_slot->slot;
-	int retval = 0;
 
 	pr_info("Slot [%s] unregistered\n", slot_name(slot));
 
-	retval = pci_hp_deregister(slot->hotplug_slot);
-	if (retval)
-		pr_err("pci_hp_deregister failed with error %d\n", retval);
+	pci_hp_deregister(slot->hotplug_slot);
+	kfree(slot->hotplug_slot);
+	kfree(slot);
 }
 
 
diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c
index 07b533adc9df..52a339baf06c 100644
--- a/drivers/pci/hotplug/cpci_hotplug_core.c
+++ b/drivers/pci/hotplug/cpci_hotplug_core.c
@@ -195,10 +195,8 @@ get_latch_status(struct hotplug_slot *hotplug_slot, u8 *value)
 	return 0;
 }
 
-static void release_slot(struct hotplug_slot *hotplug_slot)
+static void release_slot(struct slot *slot)
 {
-	struct slot *slot = hotplug_slot->private;
-
 	kfree(slot->hotplug_slot->info);
 	kfree(slot->hotplug_slot);
 	pci_dev_put(slot->dev);
@@ -253,7 +251,6 @@ cpci_hp_register_bus(struct pci_bus *bus, u8 first, u8 last)
 		snprintf(name, SLOT_NAME_SIZE, "%02x:%02x", bus->number, i);
 
 		hotplug_slot->private = slot;
-		hotplug_slot->release = &release_slot;
 		hotplug_slot->ops = &cpci_hotplug_slot_ops;
 
 		/*
@@ -308,12 +305,8 @@ cpci_hp_unregister_bus(struct pci_bus *bus)
 			slots--;
 
 			dbg("deregistering slot %s", slot_name(slot));
-			status = pci_hp_deregister(slot->hotplug_slot);
-			if (status) {
-				err("pci_hp_deregister failed with error %d",
-				    status);
-				break;
-			}
+			pci_hp_deregister(slot->hotplug_slot);
+			release_slot(slot);
 		}
 	}
 	up_write(&list_rwsem);
@@ -623,6 +616,7 @@ cleanup_slots(void)
 	list_for_each_entry_safe(slot, tmp, &slot_list, slot_list) {
 		list_del(&slot->slot_list);
 		pci_hp_deregister(slot->hotplug_slot);
+		release_slot(slot);
 	}
 cleanup_null:
 	up_write(&list_rwsem);
diff --git a/drivers/pci/hotplug/cpqphp_core.c b/drivers/pci/hotplug/cpqphp_core.c
index 1797e36ec586..5a06636e910a 100644
--- a/drivers/pci/hotplug/cpqphp_core.c
+++ b/drivers/pci/hotplug/cpqphp_core.c
@@ -266,17 +266,6 @@ static void __iomem *get_SMBIOS_entry(void __iomem *smbios_start,
 	return previous;
 }
 
-static void release_slot(struct hotplug_slot *hotplug_slot)
-{
-	struct slot *slot = hotplug_slot->private;
-
-	dbg("%s - physical_slot = %s\n", __func__, slot_name(slot));
-
-	kfree(slot->hotplug_slot->info);
-	kfree(slot->hotplug_slot);
-	kfree(slot);
-}
-
 static int ctrl_slot_cleanup(struct controller *ctrl)
 {
 	struct slot *old_slot, *next_slot;
@@ -285,9 +274,11 @@ static int ctrl_slot_cleanup(struct controller *ctrl)
 	ctrl->slot = NULL;
 
 	while (old_slot) {
-		/* memory will be freed by the release_slot callback */
 		next_slot = old_slot->next;
 		pci_hp_deregister(old_slot->hotplug_slot);
+		kfree(old_slot->hotplug_slot->info);
+		kfree(old_slot->hotplug_slot);
+		kfree(old_slot);
 		old_slot = next_slot;
 	}
 
@@ -678,7 +669,6 @@ static int ctrl_slot_setup(struct controller *ctrl,
 			((read_slot_enable(ctrl) << 2) >> ctrl_slot) & 0x04;
 
 		/* register this slot with the hotplug pci core */
-		hotplug_slot->release = &release_slot;
 		hotplug_slot->private = slot;
 		snprintf(name, SLOT_NAME_SIZE, "%u", slot->number);
 		hotplug_slot->ops = &cpqphp_hotplug_slot_ops;
diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c
index 1869b0411ce0..4ea57e9019f1 100644
--- a/drivers/pci/hotplug/ibmphp_core.c
+++ b/drivers/pci/hotplug/ibmphp_core.c
@@ -673,7 +673,20 @@ static void free_slots(void)
 
 	list_for_each_entry_safe(slot_cur, next, &ibmphp_slot_head,
 				 ibm_slot_list) {
-		pci_hp_deregister(slot_cur->hotplug_slot);
+		pci_hp_del(slot_cur->hotplug_slot);
+		slot_cur->ctrl = NULL;
+		slot_cur->bus_on = NULL;
+
+		/*
+		 * We don't want to actually remove the resources,
+		 * since ibmphp_free_resources() will do just that.
+		 */
+		ibmphp_unconfigure_card(&slot_cur, -1);
+
+		pci_hp_destroy(slot_cur->hotplug_slot);
+		kfree(slot_cur->hotplug_slot->info);
+		kfree(slot_cur->hotplug_slot);
+		kfree(slot_cur);
 	}
 	debug("%s -- exit\n", __func__);
 }
diff --git a/drivers/pci/hotplug/ibmphp_ebda.c b/drivers/pci/hotplug/ibmphp_ebda.c
index 64549aa24c0f..6f8e90e3ec08 100644
--- a/drivers/pci/hotplug/ibmphp_ebda.c
+++ b/drivers/pci/hotplug/ibmphp_ebda.c
@@ -699,25 +699,6 @@ static int fillslotinfo(struct hotplug_slot *hotplug_slot)
 	return rc;
 }
 
-static void release_slot(struct hotplug_slot *hotplug_slot)
-{
-	struct slot *slot;
-
-	if (!hotplug_slot || !hotplug_slot->private)
-		return;
-
-	slot = hotplug_slot->private;
-	kfree(slot->hotplug_slot->info);
-	kfree(slot->hotplug_slot);
-	slot->ctrl = NULL;
-	slot->bus_on = NULL;
-
-	/* we don't want to actually remove the resources, since free_resources will do just that */
-	ibmphp_unconfigure_card(&slot, -1);
-
-	kfree(slot);
-}
-
 static struct pci_driver ibmphp_driver;
 
 /*
@@ -941,7 +922,6 @@ static int __init ebda_rsrc_controller(void)
 			tmp_slot->hotplug_slot = hp_slot_ptr;
 
 			hp_slot_ptr->private = tmp_slot;
-			hp_slot_ptr->release = release_slot;
 
 			rc = fillslotinfo(hp_slot_ptr);
 			if (rc)
diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c
index fd93783a87b0..90fde5f106d8 100644
--- a/drivers/pci/hotplug/pci_hotplug_core.c
+++ b/drivers/pci/hotplug/pci_hotplug_core.c
@@ -396,8 +396,9 @@ static struct hotplug_slot *get_slot_from_name(const char *name)
  * @owner: caller module owner
  * @mod_name: caller module name
  *
- * Registers a hotplug slot with the pci hotplug subsystem, which will allow
- * userspace interaction to the slot.
+ * Prepares a hotplug slot for in-kernel use and immediately publishes it to
+ * user space in one go.  Drivers may alternatively carry out the two steps
+ * separately by invoking pci_hp_initialize() and pci_hp_add().
  *
  * Returns 0 if successful, anything else for an error.
  */
@@ -406,54 +407,91 @@ int __pci_hp_register(struct hotplug_slot *slot, struct pci_bus *bus,
 		      struct module *owner, const char *mod_name)
 {
 	int result;
+
+	result = __pci_hp_initialize(slot, bus, devnr, name, owner, mod_name);
+	if (result)
+		return result;
+
+	result = pci_hp_add(slot);
+	if (result)
+		pci_hp_destroy(slot);
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(__pci_hp_register);
+
+/**
+ * __pci_hp_initialize - prepare hotplug slot for in-kernel use
+ * @slot: pointer to the &struct hotplug_slot to initialize
+ * @bus: bus this slot is on
+ * @devnr: slot number
+ * @name: name registered with kobject core
+ * @owner: caller module owner
+ * @mod_name: caller module name
+ *
+ * Allocate and fill in a PCI slot for use by a hotplug driver.  Once this has
+ * been called, the driver may invoke hotplug_slot_name() to get the slot's
+ * unique name.  The driver must be prepared to handle a ->reset_slot callback
+ * from this point on.
+ *
+ * Returns 0 on success or a negative int on error.
+ */
+int __pci_hp_initialize(struct hotplug_slot *slot, struct pci_bus *bus,
+			int devnr, const char *name, struct module *owner,
+			const char *mod_name)
+{
 	struct pci_slot *pci_slot;
 
 	if (slot == NULL)
 		return -ENODEV;
 	if ((slot->info == NULL) || (slot->ops == NULL))
 		return -EINVAL;
-	if (slot->release == NULL) {
-		dbg("Why are you trying to register a hotplug slot without a proper release function?\n");
-		return -EINVAL;
-	}
 
 	slot->ops->owner = owner;
 	slot->ops->mod_name = mod_name;
 
-	mutex_lock(&pci_hp_mutex);
 	/*
 	 * No problems if we call this interface from both ACPI_PCI_SLOT
 	 * driver and call it here again. If we've already created the
 	 * pci_slot, the interface will simply bump the refcount.
 	 */
 	pci_slot = pci_create_slot(bus, devnr, name, slot);
-	if (IS_ERR(pci_slot)) {
-		result = PTR_ERR(pci_slot);
-		goto out;
-	}
+	if (IS_ERR(pci_slot))
+		return PTR_ERR(pci_slot);
 
 	slot->pci_slot = pci_slot;
 	pci_slot->hotplug = slot;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__pci_hp_initialize);
 
-	list_add(&slot->slot_list, &pci_hotplug_slot_list);
+/**
+ * pci_hp_add - publish hotplug slot to user space
+ * @slot: pointer to the &struct hotplug_slot to publish
+ *
+ * Make a hotplug slot's sysfs interface available and inform user space of its
+ * addition by sending a uevent.  The hotplug driver must be prepared to handle
+ * all &struct hotplug_slot_ops callbacks from this point on.
+ *
+ * Returns 0 on success or a negative int on error.
+ */
+int pci_hp_add(struct hotplug_slot *slot)
+{
+	struct pci_slot *pci_slot = slot->pci_slot;
+	int result;
 
 	result = fs_add_slot(pci_slot);
 	if (result)
-		goto err_list_del;
+		return result;
 
 	kobject_uevent(&pci_slot->kobj, KOBJ_ADD);
-	dbg("Added slot %s to the list\n", name);
-	goto out;
-
-err_list_del:
-	list_del(&slot->slot_list);
-	pci_slot->hotplug = NULL;
-	pci_destroy_slot(pci_slot);
-out:
+	mutex_lock(&pci_hp_mutex);
+	list_add(&slot->slot_list, &pci_hotplug_slot_list);
 	mutex_unlock(&pci_hp_mutex);
-	return result;
+	dbg("Added slot %s to the list\n", hotplug_slot_name(slot));
+	return 0;
 }
-EXPORT_SYMBOL_GPL(__pci_hp_register);
+EXPORT_SYMBOL_GPL(pci_hp_add);
 
 /**
  * pci_hp_deregister - deregister a hotplug_slot with the PCI hotplug subsystem
@@ -464,35 +502,62 @@ EXPORT_SYMBOL_GPL(__pci_hp_register);
  *
  * Returns 0 if successful, anything else for an error.
  */
-int pci_hp_deregister(struct hotplug_slot *slot)
+void pci_hp_deregister(struct hotplug_slot *slot)
+{
+	pci_hp_del(slot);
+	pci_hp_destroy(slot);
+}
+EXPORT_SYMBOL_GPL(pci_hp_deregister);
+
+/**
+ * pci_hp_del - unpublish hotplug slot from user space
+ * @slot: pointer to the &struct hotplug_slot to unpublish
+ *
+ * Remove a hotplug slot's sysfs interface.
+ *
+ * Returns 0 on success or a negative int on error.
+ */
+void pci_hp_del(struct hotplug_slot *slot)
 {
 	struct hotplug_slot *temp;
-	struct pci_slot *pci_slot;
 
-	if (!slot)
-		return -ENODEV;
+	if (WARN_ON(!slot))
+		return;
 
 	mutex_lock(&pci_hp_mutex);
 	temp = get_slot_from_name(hotplug_slot_name(slot));
-	if (temp != slot) {
+	if (WARN_ON(temp != slot)) {
 		mutex_unlock(&pci_hp_mutex);
-		return -ENODEV;
+		return;
 	}
 
 	list_del(&slot->slot_list);
-
-	pci_slot = slot->pci_slot;
-	fs_remove_slot(pci_slot);
+	mutex_unlock(&pci_hp_mutex);
 	dbg("Removed slot %s from the list\n", hotplug_slot_name(slot));
+	fs_remove_slot(slot->pci_slot);
+}
+EXPORT_SYMBOL_GPL(pci_hp_del);
+
+/**
+ * pci_hp_destroy - remove hotplug slot from in-kernel use
+ * @slot: pointer to the &struct hotplug_slot to destroy
+ *
+ * Destroy a PCI slot used by a hotplug driver.  Once this has been called,
+ * the driver may no longer invoke hotplug_slot_name() to get the slot's
+ * unique name.  The driver no longer needs to handle a ->reset_slot callback
+ * from this point on.
+ *
+ * Returns 0 on success or a negative int on error.
+ */
+void pci_hp_destroy(struct hotplug_slot *slot)
+{
+	struct pci_slot *pci_slot = slot->pci_slot;
 
-	slot->release(slot);
+	slot->pci_slot = NULL;
 	pci_slot->hotplug = NULL;
 	pci_destroy_slot(pci_slot);
-	mutex_unlock(&pci_hp_mutex);
-
-	return 0;
 }
-EXPORT_SYMBOL_GPL(pci_hp_deregister);
+EXPORT_SYMBOL_GPL(pci_hp_destroy);
 
 /**
  * pci_hp_change_slot_info - changes the slot's information structure in the core
diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
index b360645377c2..37d8f81e548f 100644
--- a/drivers/pci/hotplug/pciehp_core.c
+++ b/drivers/pci/hotplug/pciehp_core.c
@@ -56,17 +56,6 @@ static int get_latch_status(struct hotplug_slot *slot, u8 *value);
 static int get_adapter_status(struct hotplug_slot *slot, u8 *value);
 static int reset_slot(struct hotplug_slot *slot, int probe);
 
-/**
- * release_slot - free up the memory used by a slot
- * @hotplug_slot: slot to free
- */
-static void release_slot(struct hotplug_slot *hotplug_slot)
-{
-	kfree(hotplug_slot->ops);
-	kfree(hotplug_slot->info);
-	kfree(hotplug_slot);
-}
-
 static int init_slot(struct controller *ctrl)
 {
 	struct slot *slot = ctrl->slot;
@@ -107,7 +96,6 @@ static int init_slot(struct controller *ctrl)
 	/* register this slot with the hotplug pci core */
 	hotplug->info = info;
 	hotplug->private = slot;
-	hotplug->release = &release_slot;
 	hotplug->ops = ops;
 	slot->hotplug_slot = hotplug;
 	snprintf(name, SLOT_NAME_SIZE, "%u", PSN(ctrl));
@@ -127,7 +115,12 @@ out:
 
 static void cleanup_slot(struct controller *ctrl)
 {
-	pci_hp_deregister(ctrl->slot->hotplug_slot);
+	struct hotplug_slot *hotplug_slot = ctrl->slot->hotplug_slot;
+
+	pci_hp_deregister(hotplug_slot);
+	kfree(hotplug_slot->ops);
+	kfree(hotplug_slot->info);
+	kfree(hotplug_slot);
 }
 
 /*
diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
index 6c2e8d7307c6..3276a5e4c430 100644
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -538,9 +538,8 @@ static struct hotplug_slot_ops php_slot_ops = {
 	.disable_slot		= pnv_php_disable_slot,
 };
 
-static void pnv_php_release(struct hotplug_slot *slot)
+static void pnv_php_release(struct pnv_php_slot *php_slot)
 {
-	struct pnv_php_slot *php_slot = slot->private;
 	unsigned long flags;
 
 	/* Remove from global or child list */
@@ -596,7 +595,6 @@ static struct pnv_php_slot *pnv_php_alloc_slot(struct device_node *dn)
 	php_slot->power_state_check     = false;
 	php_slot->slot.ops              = &php_slot_ops;
 	php_slot->slot.info             = &php_slot->slot_info;
-	php_slot->slot.release          = pnv_php_release;
 	php_slot->slot.private          = php_slot;
 
 	INIT_LIST_HEAD(&php_slot->children);
@@ -924,6 +922,7 @@ static void pnv_php_unregister_one(struct device_node *dn)
 
 	php_slot->state = PNV_PHP_STATE_OFFLINE;
 	pci_hp_deregister(&php_slot->slot);
+	pnv_php_release(php_slot);
 	pnv_php_put_slot(php_slot);
 }
 
diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c
index fb5e0845429d..857c358b727b 100644
--- a/drivers/pci/hotplug/rpaphp_core.c
+++ b/drivers/pci/hotplug/rpaphp_core.c
@@ -404,13 +404,13 @@ static void __exit cleanup_slots(void)
 	/*
 	 * Unregister all of our slots with the pci_hotplug subsystem,
 	 * and free up all memory that we had allocated.
-	 * memory will be freed in release_slot callback.
 	 */
 
 	list_for_each_entry_safe(slot, next, &rpaphp_slot_head,
 				 rpaphp_slot_list) {
 		list_del(&slot->rpaphp_slot_list);
 		pci_hp_deregister(slot->hotplug_slot);
+		dealloc_slot_struct(slot);
 	}
 	return;
 }
diff --git a/drivers/pci/hotplug/rpaphp_slot.c b/drivers/pci/hotplug/rpaphp_slot.c
index 3840a2075e6a..b916c8e4372d 100644
--- a/drivers/pci/hotplug/rpaphp_slot.c
+++ b/drivers/pci/hotplug/rpaphp_slot.c
@@ -19,12 +19,6 @@
 #include "rpaphp.h"
 
 /* free up the memory used by a slot */
-static void rpaphp_release_slot(struct hotplug_slot *hotplug_slot)
-{
-	struct slot *slot = (struct slot *) hotplug_slot->private;
-	dealloc_slot_struct(slot);
-}
-
 void dealloc_slot_struct(struct slot *slot)
 {
 	kfree(slot->hotplug_slot->info);
@@ -56,7 +50,6 @@ struct slot *alloc_slot_struct(struct device_node *dn,
 	slot->power_domain = power_domain;
 	slot->hotplug_slot->private = slot;
 	slot->hotplug_slot->ops = &rpaphp_hotplug_slot_ops;
-	slot->hotplug_slot->release = &rpaphp_release_slot;
 
 	return (slot);
 
@@ -90,10 +83,8 @@ int rpaphp_deregister_slot(struct slot *slot)
 		__func__, slot->name);
 
 	list_del(&slot->rpaphp_slot_list);
-
-	retval = pci_hp_deregister(php_slot);
-	if (retval)
-		err("Problem unregistering a slot %s\n", slot->name);
+	pci_hp_deregister(php_slot);
+	dealloc_slot_struct(slot);
 
 	dbg("%s - Exit: rc[%d]\n", __func__, retval);
 	return retval;
diff --git a/drivers/pci/hotplug/s390_pci_hpc.c b/drivers/pci/hotplug/s390_pci_hpc.c
index ffdc2977395d..93b5341d282c 100644
--- a/drivers/pci/hotplug/s390_pci_hpc.c
+++ b/drivers/pci/hotplug/s390_pci_hpc.c
@@ -130,15 +130,6 @@ static int get_adapter_status(struct hotplug_slot *hotplug_slot, u8 *value)
 	return 0;
 }
 
-static void release_slot(struct hotplug_slot *hotplug_slot)
-{
-	struct slot *slot = hotplug_slot->private;
-
-	kfree(slot->hotplug_slot->info);
-	kfree(slot->hotplug_slot);
-	kfree(slot);
-}
-
 static struct hotplug_slot_ops s390_hotplug_slot_ops = {
 	.enable_slot =		enable_slot,
 	.disable_slot =		disable_slot,
@@ -175,7 +166,6 @@ int zpci_init_slot(struct zpci_dev *zdev)
 	hotplug_slot->info = info;
 
 	hotplug_slot->ops = &s390_hotplug_slot_ops;
-	hotplug_slot->release = &release_slot;
 
 	get_power_status(hotplug_slot, &info->power_status);
 	get_adapter_status(hotplug_slot, &info->adapter_status);
@@ -209,5 +199,8 @@ void zpci_exit_slot(struct zpci_dev *zdev)
 			continue;
 		list_del(&slot->slot_list);
 		pci_hp_deregister(slot->hotplug_slot);
+		kfree(slot->hotplug_slot->info);
+		kfree(slot->hotplug_slot);
+		kfree(slot);
 	}
 }
diff --git a/drivers/pci/hotplug/sgi_hotplug.c b/drivers/pci/hotplug/sgi_hotplug.c
index 78b6bdbb3a39..babd23409f61 100644
--- a/drivers/pci/hotplug/sgi_hotplug.c
+++ b/drivers/pci/hotplug/sgi_hotplug.c
@@ -628,7 +628,6 @@ static int sn_hotplug_slot_register(struct pci_bus *pci_bus)
 			goto alloc_err;
 		}
 		bss_hotplug_slot->ops = &sn_hotplug_slot_ops;
-		bss_hotplug_slot->release = &sn_release_slot;
 
 		rc = pci_hp_register(bss_hotplug_slot, pci_bus, device, name);
 		if (rc)
@@ -656,8 +655,10 @@ alloc_err:
 		sn_release_slot(bss_hotplug_slot);
 
 	/* destroy anything else on the list */
-	while ((bss_hotplug_slot = sn_hp_destroy()))
+	while ((bss_hotplug_slot = sn_hp_destroy())) {
 		pci_hp_deregister(bss_hotplug_slot);
+		sn_release_slot(bss_hotplug_slot);
+	}
 
 	return rc;
 }
@@ -703,8 +704,10 @@ static void __exit sn_pci_hotplug_exit(void)
 {
 	struct hotplug_slot *bss_hotplug_slot;
 
-	while ((bss_hotplug_slot = sn_hp_destroy()))
+	while ((bss_hotplug_slot = sn_hp_destroy())) {
 		pci_hp_deregister(bss_hotplug_slot);
+		sn_release_slot(bss_hotplug_slot);
+	}
 
 	if (!list_empty(&sn_hp_list))
 		printk(KERN_ERR "%s: internal list is not empty\n", __FILE__);
diff --git a/drivers/pci/hotplug/shpchp_core.c b/drivers/pci/hotplug/shpchp_core.c
index 8e3c6ce12f31..97cee23f3d51 100644
--- a/drivers/pci/hotplug/shpchp_core.c
+++ b/drivers/pci/hotplug/shpchp_core.c
@@ -61,22 +61,6 @@ static struct hotplug_slot_ops shpchp_hotplug_slot_ops = {
 	.get_adapter_status =	get_adapter_status,
 };
 
-/**
- * release_slot - free up the memory used by a slot
- * @hotplug_slot: slot to free
- */
-static void release_slot(struct hotplug_slot *hotplug_slot)
-{
-	struct slot *slot = hotplug_slot->private;
-
-	ctrl_dbg(slot->ctrl, "%s: physical_slot = %s\n",
-		 __func__, slot_name(slot));
-
-	kfree(slot->hotplug_slot->info);
-	kfree(slot->hotplug_slot);
-	kfree(slot);
-}
-
 static int init_slots(struct controller *ctrl)
 {
 	struct slot *slot;
@@ -125,7 +109,6 @@ static int init_slots(struct controller *ctrl)
 
 		/* register this slot with the hotplug pci core */
 		hotplug_slot->private = slot;
-		hotplug_slot->release = &release_slot;
 		snprintf(name, SLOT_NAME_SIZE, "%d", slot->number);
 		hotplug_slot->ops = &shpchp_hotplug_slot_ops;
 
@@ -171,6 +154,9 @@ void cleanup_slots(struct controller *ctrl)
 		cancel_delayed_work(&slot->work);
 		destroy_workqueue(slot->wq);
 		pci_hp_deregister(slot->hotplug_slot);
+		kfree(slot->hotplug_slot->info);
+		kfree(slot->hotplug_slot);
+		kfree(slot);
 	}
 }
 
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 3d523ca64694..d67f32a29bb4 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -858,12 +858,6 @@ static int asus_get_adapter_status(struct hotplug_slot *hotplug_slot,
 	return 0;
 }
 
-static void asus_cleanup_pci_hotplug(struct hotplug_slot *hotplug_slot)
-{
-	kfree(hotplug_slot->info);
-	kfree(hotplug_slot);
-}
-
 static struct hotplug_slot_ops asus_hotplug_slot_ops = {
 	.owner = THIS_MODULE,
 	.get_adapter_status = asus_get_adapter_status,
@@ -905,7 +899,6 @@ static int asus_setup_pci_hotplug(struct asus_wmi *asus)
 		goto error_info;
 
 	asus->hotplug_slot->private = asus;
-	asus->hotplug_slot->release = &asus_cleanup_pci_hotplug;
 	asus->hotplug_slot->ops = &asus_hotplug_slot_ops;
 	asus_get_adapter_status(asus->hotplug_slot,
 				&asus->hotplug_slot->info->adapter_status);
@@ -1051,8 +1044,11 @@ static void asus_wmi_rfkill_exit(struct asus_wmi *asus)
 	 * asus_unregister_rfkill_notifier()
 	 */
 	asus_rfkill_hotplug(asus);
-	if (asus->hotplug_slot)
+	if (asus->hotplug_slot) {
 		pci_hp_deregister(asus->hotplug_slot);
+		kfree(asus->hotplug_slot->info);
+		kfree(asus->hotplug_slot);
+	}
 	if (asus->hotplug_workqueue)
 		destroy_workqueue(asus->hotplug_workqueue);
 
diff --git a/drivers/platform/x86/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c
index 4c38904a8a32..a4bbf6ecd1f0 100644
--- a/drivers/platform/x86/eeepc-laptop.c
+++ b/drivers/platform/x86/eeepc-laptop.c
@@ -726,12 +726,6 @@ static int eeepc_get_adapter_status(struct hotplug_slot *hotplug_slot,
 	return 0;
 }
 
-static void eeepc_cleanup_pci_hotplug(struct hotplug_slot *hotplug_slot)
-{
-	kfree(hotplug_slot->info);
-	kfree(hotplug_slot);
-}
-
 static struct hotplug_slot_ops eeepc_hotplug_slot_ops = {
 	.owner = THIS_MODULE,
 	.get_adapter_status = eeepc_get_adapter_status,
@@ -758,7 +752,6 @@ static int eeepc_setup_pci_hotplug(struct eeepc_laptop *eeepc)
 		goto error_info;
 
 	eeepc->hotplug_slot->private = eeepc;
-	eeepc->hotplug_slot->release = &eeepc_cleanup_pci_hotplug;
 	eeepc->hotplug_slot->ops = &eeepc_hotplug_slot_ops;
 	eeepc_get_adapter_status(eeepc->hotplug_slot,
 				 &eeepc->hotplug_slot->info->adapter_status);
@@ -837,8 +830,11 @@ static void eeepc_rfkill_exit(struct eeepc_laptop *eeepc)
 		eeepc->wlan_rfkill = NULL;
 	}
 
-	if (eeepc->hotplug_slot)
+	if (eeepc->hotplug_slot) {
 		pci_hp_deregister(eeepc->hotplug_slot);
+		kfree(eeepc->hotplug_slot->info);
+		kfree(eeepc->hotplug_slot);
+	}
 
 	if (eeepc->bluetooth_rfkill) {
 		rfkill_unregister(eeepc->bluetooth_rfkill);
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index cf5e22103f68..a6d6650a0490 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -80,15 +80,12 @@ struct hotplug_slot_info {
  * @ops: pointer to the &struct hotplug_slot_ops to be used for this slot
  * @info: pointer to the &struct hotplug_slot_info for the initial values for
  * this slot.
- * @release: called during pci_hp_deregister to free memory allocated in a
- * hotplug_slot structure.
  * @private: used by the hotplug pci controller driver to store whatever it
  * needs.
  */
 struct hotplug_slot {
 	struct hotplug_slot_ops		*ops;
 	struct hotplug_slot_info	*info;
-	void (*release) (struct hotplug_slot *slot);
 	void				*private;
 
 	/* Variables below this are for use only by the hotplug pci core. */
@@ -104,13 +101,23 @@ static inline const char *hotplug_slot_name(const struct hotplug_slot *slot)
 int __pci_hp_register(struct hotplug_slot *slot, struct pci_bus *pbus, int nr,
 		      const char *name, struct module *owner,
 		      const char *mod_name);
-int pci_hp_deregister(struct hotplug_slot *slot);
+int __pci_hp_initialize(struct hotplug_slot *slot, struct pci_bus *bus, int nr,
+			const char *name, struct module *owner,
+			const char *mod_name);
+int pci_hp_add(struct hotplug_slot *slot);
+
+void pci_hp_del(struct hotplug_slot *slot);
+void pci_hp_destroy(struct hotplug_slot *slot);
+void pci_hp_deregister(struct hotplug_slot *slot);
+
 int __must_check pci_hp_change_slot_info(struct hotplug_slot *slot,
 					 struct hotplug_slot_info *info);
 
 /* use a define to avoid include chaining to get THIS_MODULE & friends */
 #define pci_hp_register(slot, pbus, devnr, name) \
 	__pci_hp_register(slot, pbus, devnr, name, THIS_MODULE, KBUILD_MODNAME)
+#define pci_hp_initialize(slot, bus, nr, name) \
+	__pci_hp_initialize(slot, bus, nr, name, THIS_MODULE, KBUILD_MODNAME)
 
 /* PCI Setting Record (Type 0) */
 struct hpp_type0 {
-- 
cgit v1.2.3


From 3730cf4dd70b6a36e48d58a862120311411b77f5 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 24 Jul 2018 12:47:56 +0200
Subject: netlink: do not store start function in netlink_cb

->start() is called once when dump is being initialized, there is no
need to store it in netlink_cb.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h  | 1 -
 net/netlink/af_netlink.c | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index f3075d6c7e82..71f121b66ca8 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -170,7 +170,6 @@ netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 struct netlink_callback {
 	struct sk_buff		*skb;
 	const struct nlmsghdr	*nlh;
-	int			(*start)(struct netlink_callback *);
 	int			(*dump)(struct sk_buff * skb,
 					struct netlink_callback *cb);
 	int			(*done)(struct netlink_callback *cb);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 393573a99a5a..f6ac7693d2cc 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2300,7 +2300,6 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 
 	cb = &nlk->cb;
 	memset(cb, 0, sizeof(*cb));
-	cb->start = control->start;
 	cb->dump = control->dump;
 	cb->done = control->done;
 	cb->nlh = nlh;
@@ -2309,8 +2308,8 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	cb->min_dump_alloc = control->min_dump_alloc;
 	cb->skb = skb;
 
-	if (cb->start) {
-		ret = cb->start(cb);
+	if (control->start) {
+		ret = control->start(cb);
 		if (ret)
 			goto error_put;
 	}
-- 
cgit v1.2.3


From e5cda42c16d89720c29678f51d95a119490ef7d8 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Mon, 23 Jul 2018 19:52:57 +0200
Subject: ARM: exynos: Define EINT_WAKEUP_MASK registers for S5Pv210 and
 Exynos5433

S5Pv210 and Exynos5433/Exynos7 have different address of
EINT_WAKEUP_MASK register.  Rename existing S5P_EINT_WAKEUP_MASK to
avoid confusion and add new ones.

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Cc: Tomasz Figa <tomasz.figa@gmail.com>
Cc: Sylwester Nawrocki <snawrocki@kernel.org>
Acked-by: Tomasz Figa <tomasz.figa@gmail.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
---
 arch/arm/mach-exynos/suspend.c              | 2 +-
 include/linux/soc/samsung/exynos-regs-pmu.h | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-exynos/suspend.c b/arch/arm/mach-exynos/suspend.c
index d3db306a5a70..f3384e3a675d 100644
--- a/arch/arm/mach-exynos/suspend.c
+++ b/arch/arm/mach-exynos/suspend.c
@@ -272,7 +272,7 @@ static int exynos5420_cpu_suspend(unsigned long arg)
 static void exynos_pm_set_wakeup_mask(void)
 {
 	/* Set wake-up mask registers */
-	pmu_raw_writel(exynos_get_eint_wake_mask(), S5P_EINT_WAKEUP_MASK);
+	pmu_raw_writel(exynos_get_eint_wake_mask(), EXYNOS_EINT_WAKEUP_MASK);
 	pmu_raw_writel(exynos_irqwake_intmask & ~(1 << 31), S5P_WAKEUP_MASK);
 }
 
diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h
index 66dcb9ec273a..eb0d240df7a7 100644
--- a/include/linux/soc/samsung/exynos-regs-pmu.h
+++ b/include/linux/soc/samsung/exynos-regs-pmu.h
@@ -42,7 +42,7 @@
 #define EXYNOS_SWRESET				0x0400
 
 #define S5P_WAKEUP_STAT				0x0600
-#define S5P_EINT_WAKEUP_MASK			0x0604
+#define EXYNOS_EINT_WAKEUP_MASK			0x0604
 #define S5P_WAKEUP_MASK				0x0608
 #define S5P_WAKEUP_MASK2				0x0614
 
@@ -180,6 +180,9 @@
 #define S5P_CORE_WAKEUP_FROM_LOCAL_CFG		(0x3 << 8)
 #define S5P_CORE_AUTOWAKEUP_EN			(1 << 31)
 
+/* Only for S5Pv210 */
+#define S5PV210_EINT_WAKEUP_MASK	0xC004
+
 /* Only for EXYNOS4210 */
 #define S5P_CMU_CLKSTOP_LCD1_LOWPWR	0x1154
 #define S5P_CMU_RESET_LCD1_LOWPWR	0x1174
@@ -641,6 +644,7 @@
 					 | EXYNOS5420_KFC_USE_STANDBY_WFI3)
 
 /* For EXYNOS5433 */
+#define EXYNOS5433_EINT_WAKEUP_MASK				(0x060C)
 #define EXYNOS5433_USBHOST30_PHY_CONTROL			(0x0728)
 #define EXYNOS5433_PAD_RETENTION_AUD_OPTION			(0x3028)
 #define EXYNOS5433_PAD_RETENTION_MMC2_OPTION			(0x30C8)
-- 
cgit v1.2.3


From a8be2af0218cf037704dc2e733bf56d6560fa324 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Mon, 23 Jul 2018 19:52:58 +0200
Subject: pinctrl: samsung: Write external wakeup interrupt mask

The pinctrl driver defines an IRQ chip which handles external wakeup
interrupts, therefore from logical point of view, it is the owner of
external interrupt mask.  The register controlling the mask belongs to
Power Management Unit address space so it has to be accessed with PMU
syscon regmap handle.

This mask should be written to hardware during system suspend.  Till now
ARMv7 machine code was responsible for this which created a dependency
between pin controller driver and arch/arm/mach code.

Try to rework this dependency so the pinctrl driver will write external
wakeup interrupt mask during late suspend.

Impact on ARMv7 designs (S5Pv210 and Exynos)
============================================
This duplicates setting mask with existing machine code
arch/arm/mach-exynos/suspend.c and arch/arm/mach-s5pv210/pm.c but it is
not a problem - the wakeup mask register will be written twice.  The
machine code will be cleaned up later.

The difference between implementation here and ARMv7 machine code
(arch/arm/mach-*) is the time of writing the mask:
1. The machine code is writing the mask quite late during system suspend
   path, after offlining secondary CPUs and just before doing actual
   suspend.
2. The implementation in pinctrl driver uses late suspend ops, therefore it
   will write the mask much earlier.  Hopefully late enough, after all
   drivers will enable or disable their interrupt wakeups
   (enable_irq_wake() etc).

Impact on ARMv8 designs (Exynos5433 and Exynos7)
================================================
The Suspend to RAM was not supported and external wakeup interrupt mask
was not written to HW.  This change brings us one step closer to
supporting Suspend to RAM.

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Cc: Tomasz Figa <tomasz.figa@gmail.com>
Cc: Sylwester Nawrocki <snawrocki@kernel.org>
Acked-by: Tomasz Figa <tomasz.figa@gmail.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
---
 drivers/pinctrl/samsung/pinctrl-exynos.c    | 50 ++++++++++++++++++++++++++++-
 drivers/pinctrl/samsung/pinctrl-samsung.h   |  3 ++
 include/linux/soc/samsung/exynos-regs-pmu.h |  2 ++
 3 files changed, 54 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/samsung/pinctrl-exynos.c b/drivers/pinctrl/samsung/pinctrl-exynos.c
index 29d86d704b0c..40ef14956876 100644
--- a/drivers/pinctrl/samsung/pinctrl-exynos.c
+++ b/drivers/pinctrl/samsung/pinctrl-exynos.c
@@ -25,6 +25,7 @@
 #include <linux/regmap.h>
 #include <linux/err.h>
 #include <linux/soc/samsung/exynos-pmu.h>
+#include <linux/soc/samsung/exynos-regs-pmu.h>
 
 #include <dt-bindings/pinctrl/samsung.h>
 
@@ -37,6 +38,8 @@ struct exynos_irq_chip {
 	u32 eint_con;
 	u32 eint_mask;
 	u32 eint_pend;
+	u32 eint_wake_mask_value;
+	u32 eint_wake_mask_reg;
 };
 
 static inline struct exynos_irq_chip *to_exynos_irq_chip(struct irq_chip *chip)
@@ -215,6 +218,7 @@ static struct exynos_irq_chip exynos_gpio_irq_chip = {
 	.eint_con = EXYNOS_GPIO_ECON_OFFSET,
 	.eint_mask = EXYNOS_GPIO_EMASK_OFFSET,
 	.eint_pend = EXYNOS_GPIO_EPEND_OFFSET,
+	/* eint_wake_mask_value not used */
 };
 
 static int exynos_eint_irq_map(struct irq_domain *h, unsigned int virq,
@@ -330,6 +334,8 @@ u32 exynos_get_eint_wake_mask(void)
 
 static int exynos_wkup_irq_set_wake(struct irq_data *irqd, unsigned int on)
 {
+	struct irq_chip *chip = irq_data_get_irq_chip(irqd);
+	struct exynos_irq_chip *our_chip = to_exynos_irq_chip(chip);
 	struct samsung_pin_bank *bank = irq_data_get_irq_chip_data(irqd);
 	unsigned long bit = 1UL << (2 * bank->eint_offset + irqd->hwirq);
 
@@ -339,6 +345,7 @@ static int exynos_wkup_irq_set_wake(struct irq_data *irqd, unsigned int on)
 		exynos_eint_wake_mask |= bit;
 	else
 		exynos_eint_wake_mask &= ~bit;
+	our_chip->eint_wake_mask_value = exynos_eint_wake_mask;
 
 	return 0;
 }
@@ -360,6 +367,9 @@ static const struct exynos_irq_chip s5pv210_wkup_irq_chip __initconst = {
 	.eint_con = EXYNOS_WKUP_ECON_OFFSET,
 	.eint_mask = EXYNOS_WKUP_EMASK_OFFSET,
 	.eint_pend = EXYNOS_WKUP_EPEND_OFFSET,
+	.eint_wake_mask_value = EXYNOS_EINT_WAKEUP_MASK_DISABLED,
+	/* Only difference with exynos4210_wkup_irq_chip: */
+	.eint_wake_mask_reg = S5PV210_EINT_WAKEUP_MASK,
 };
 
 static const struct exynos_irq_chip exynos4210_wkup_irq_chip __initconst = {
@@ -376,6 +386,8 @@ static const struct exynos_irq_chip exynos4210_wkup_irq_chip __initconst = {
 	.eint_con = EXYNOS_WKUP_ECON_OFFSET,
 	.eint_mask = EXYNOS_WKUP_EMASK_OFFSET,
 	.eint_pend = EXYNOS_WKUP_EPEND_OFFSET,
+	.eint_wake_mask_value = EXYNOS_EINT_WAKEUP_MASK_DISABLED,
+	.eint_wake_mask_reg = EXYNOS_EINT_WAKEUP_MASK,
 };
 
 static const struct exynos_irq_chip exynos7_wkup_irq_chip __initconst = {
@@ -392,6 +404,8 @@ static const struct exynos_irq_chip exynos7_wkup_irq_chip __initconst = {
 	.eint_con = EXYNOS7_WKUP_ECON_OFFSET,
 	.eint_mask = EXYNOS7_WKUP_EMASK_OFFSET,
 	.eint_pend = EXYNOS7_WKUP_EPEND_OFFSET,
+	.eint_wake_mask_value = EXYNOS_EINT_WAKEUP_MASK_DISABLED,
+	.eint_wake_mask_reg = EXYNOS5433_EINT_WAKEUP_MASK,
 };
 
 /* list of external wakeup controllers supported */
@@ -560,6 +574,27 @@ int exynos_eint_wkup_init(struct samsung_pinctrl_drv_data *d)
 	return 0;
 }
 
+static void
+exynos_pinctrl_set_eint_wakeup_mask(struct samsung_pinctrl_drv_data *drvdata,
+				    struct exynos_irq_chip *irq_chip)
+{
+	struct regmap *pmu_regs;
+
+	if (!drvdata->retention_ctrl || !drvdata->retention_ctrl->priv) {
+		dev_warn(drvdata->dev,
+			 "No retention data configured bank with external wakeup interrupt. Wake-up mask will not be set.\n");
+		return;
+	}
+
+	pmu_regs = drvdata->retention_ctrl->priv;
+	dev_info(drvdata->dev,
+		 "Setting external wakeup interrupt wakeup mask: 0x%x\n",
+		 irq_chip->eint_wake_mask_value);
+
+	regmap_write(pmu_regs, irq_chip->eint_wake_mask_reg,
+		     irq_chip->eint_wake_mask_value);
+}
+
 static void exynos_pinctrl_suspend_bank(
 				struct samsung_pinctrl_drv_data *drvdata,
 				struct samsung_pin_bank *bank)
@@ -582,11 +617,24 @@ static void exynos_pinctrl_suspend_bank(
 void exynos_pinctrl_suspend(struct samsung_pinctrl_drv_data *drvdata)
 {
 	struct samsung_pin_bank *bank = drvdata->pin_banks;
+	struct exynos_irq_chip *irq_chip = NULL;
 	int i;
 
-	for (i = 0; i < drvdata->nr_banks; ++i, ++bank)
+	for (i = 0; i < drvdata->nr_banks; ++i, ++bank) {
 		if (bank->eint_type == EINT_TYPE_GPIO)
 			exynos_pinctrl_suspend_bank(drvdata, bank);
+		else if (bank->eint_type == EINT_TYPE_WKUP) {
+			if (!irq_chip) {
+				irq_chip = bank->irq_chip;
+				exynos_pinctrl_set_eint_wakeup_mask(drvdata,
+								    irq_chip);
+			} else if (bank->irq_chip != irq_chip) {
+				dev_warn(drvdata->dev,
+					 "More than one external wakeup interrupt chip configured (bank: %s). This is not supported by hardware nor by driver.\n",
+					 bank->name);
+			}
+		}
+	}
 }
 
 static void exynos_pinctrl_resume_bank(
diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.h b/drivers/pinctrl/samsung/pinctrl-samsung.h
index aac16cc8362a..e571bbd7139b 100644
--- a/drivers/pinctrl/samsung/pinctrl-samsung.h
+++ b/drivers/pinctrl/samsung/pinctrl-samsung.h
@@ -227,6 +227,9 @@ struct samsung_retention_data {
  *	device suspend, see samsung_pinctrl_suspend()
  * @resume: platform specific resume callback, executed during pin controller
  *	device suspend, see samsung_pinctrl_resume()
+ *
+ * External wakeup interrupts must define at least eint_wkup_init,
+ * retention_data and suspend in order for proper suspend/resume to work.
  */
 struct samsung_pin_ctrl {
 	const struct samsung_pin_bank_data *pin_banks;
diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h
index eb0d240df7a7..5addaf5ccbce 100644
--- a/include/linux/soc/samsung/exynos-regs-pmu.h
+++ b/include/linux/soc/samsung/exynos-regs-pmu.h
@@ -42,6 +42,8 @@
 #define EXYNOS_SWRESET				0x0400
 
 #define S5P_WAKEUP_STAT				0x0600
+/* Value for EXYNOS_EINT_WAKEUP_MASK disabling all external wakeup interrupts */
+#define EXYNOS_EINT_WAKEUP_MASK_DISABLED	0xffffffff
 #define EXYNOS_EINT_WAKEUP_MASK			0x0604
 #define S5P_WAKEUP_MASK				0x0608
 #define S5P_WAKEUP_MASK2				0x0614
-- 
cgit v1.2.3


From 071f52fbce6161706d070ceada5accb81630bf02 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Jul 2018 09:52:32 +0200
Subject: block: remove bio_clone_kmalloc

Unused now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index ab221c517f4e..b861baa59454 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -443,12 +443,6 @@ static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 	return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
 }
 
-static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
-{
-	return bio_clone_bioset(bio, gfp_mask, NULL);
-
-}
-
 extern blk_qc_t submit_bio(struct bio *);
 
 extern void bio_endio(struct bio *);
-- 
cgit v1.2.3


From c55183c9aaa00d2bbb578169a480e31aff3d397c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Jul 2018 09:52:34 +0200
Subject: block: unexport bio_clone_bioset

Now only used by the bounce code, so move it there and mark the function
static.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 77 -----------------------------------------------------
 block/bounce.c      | 69 ++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/bio.h |  1 -
 3 files changed, 68 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 07d3ffd95989..b832151cd0bf 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -646,83 +646,6 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
 }
 EXPORT_SYMBOL(bio_clone_fast);
 
-/**
- * 	bio_clone_bioset - clone a bio
- * 	@bio_src: bio to clone
- *	@gfp_mask: allocation priority
- *	@bs: bio_set to allocate from
- *
- *	Clone bio. Caller will own the returned bio, but not the actual data it
- *	points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-			     struct bio_set *bs)
-{
-	struct bvec_iter iter;
-	struct bio_vec bv;
-	struct bio *bio;
-
-	/*
-	 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
-	 * bio_src->bi_io_vec to bio->bi_io_vec.
-	 *
-	 * We can't do that anymore, because:
-	 *
-	 *  - The point of cloning the biovec is to produce a bio with a biovec
-	 *    the caller can modify: bi_idx and bi_bvec_done should be 0.
-	 *
-	 *  - The original bio could've had more than BIO_MAX_PAGES biovecs; if
-	 *    we tried to clone the whole thing bio_alloc_bioset() would fail.
-	 *    But the clone should succeed as long as the number of biovecs we
-	 *    actually need to allocate is fewer than BIO_MAX_PAGES.
-	 *
-	 *  - Lastly, bi_vcnt should not be looked at or relied upon by code
-	 *    that does not own the bio - reason being drivers don't use it for
-	 *    iterating over the biovec anymore, so expecting it to be kept up
-	 *    to date (i.e. for clones that share the parent biovec) is just
-	 *    asking for trouble and would force extra work on
-	 *    __bio_clone_fast() anyways.
-	 */
-
-	bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
-	if (!bio)
-		return NULL;
-	bio->bi_disk		= bio_src->bi_disk;
-	bio->bi_opf		= bio_src->bi_opf;
-	bio->bi_write_hint	= bio_src->bi_write_hint;
-	bio->bi_iter.bi_sector	= bio_src->bi_iter.bi_sector;
-	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size;
-
-	switch (bio_op(bio)) {
-	case REQ_OP_DISCARD:
-	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_ZEROES:
-		break;
-	case REQ_OP_WRITE_SAME:
-		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
-		break;
-	default:
-		bio_for_each_segment(bv, bio_src, iter)
-			bio->bi_io_vec[bio->bi_vcnt++] = bv;
-		break;
-	}
-
-	if (bio_integrity(bio_src)) {
-		int ret;
-
-		ret = bio_integrity_clone(bio, bio_src, gfp_mask);
-		if (ret < 0) {
-			bio_put(bio);
-			return NULL;
-		}
-	}
-
-	bio_clone_blkcg_association(bio, bio_src);
-
-	return bio;
-}
-EXPORT_SYMBOL(bio_clone_bioset);
-
 /**
  *	bio_add_pc_page	-	attempt to add page to bio
  *	@q: the target queue
diff --git a/block/bounce.c b/block/bounce.c
index fd31347b7836..bc63b3a2d18c 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -195,6 +195,73 @@ static void bounce_end_io_read_isa(struct bio *bio)
 	__bounce_end_io_read(bio, &isa_page_pool);
 }
 
+static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
+		struct bio_set *bs)
+{
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	struct bio *bio;
+
+	/*
+	 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
+	 * bio_src->bi_io_vec to bio->bi_io_vec.
+	 *
+	 * We can't do that anymore, because:
+	 *
+	 *  - The point of cloning the biovec is to produce a bio with a biovec
+	 *    the caller can modify: bi_idx and bi_bvec_done should be 0.
+	 *
+	 *  - The original bio could've had more than BIO_MAX_PAGES biovecs; if
+	 *    we tried to clone the whole thing bio_alloc_bioset() would fail.
+	 *    But the clone should succeed as long as the number of biovecs we
+	 *    actually need to allocate is fewer than BIO_MAX_PAGES.
+	 *
+	 *  - Lastly, bi_vcnt should not be looked at or relied upon by code
+	 *    that does not own the bio - reason being drivers don't use it for
+	 *    iterating over the biovec anymore, so expecting it to be kept up
+	 *    to date (i.e. for clones that share the parent biovec) is just
+	 *    asking for trouble and would force extra work on
+	 *    __bio_clone_fast() anyways.
+	 */
+
+	bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
+	if (!bio)
+		return NULL;
+	bio->bi_disk		= bio_src->bi_disk;
+	bio->bi_opf		= bio_src->bi_opf;
+	bio->bi_write_hint	= bio_src->bi_write_hint;
+	bio->bi_iter.bi_sector	= bio_src->bi_iter.bi_sector;
+	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size;
+
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_SECURE_ERASE:
+	case REQ_OP_WRITE_ZEROES:
+		break;
+	case REQ_OP_WRITE_SAME:
+		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
+		break;
+	default:
+		bio_for_each_segment(bv, bio_src, iter)
+			bio->bi_io_vec[bio->bi_vcnt++] = bv;
+		break;
+	}
+
+	if (bio_integrity(bio_src)) {
+		int ret;
+
+		ret = bio_integrity_clone(bio, bio_src, gfp_mask);
+		if (ret < 0) {
+			bio_put(bio);
+			return NULL;
+		}
+	}
+
+	bio_clone_blkcg_association(bio, bio_src);
+
+	return bio;
+}
+
 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 			       mempool_t *pool)
 {
@@ -222,7 +289,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 		generic_make_request(*bio_orig);
 		*bio_orig = bio;
 	}
-	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
+	bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
 			&bounce_bio_set);
 
 	bio_for_each_segment_all(to, bio, i) {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index b861baa59454..51371740d2a8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -429,7 +429,6 @@ extern void bio_put(struct bio *);
 
 extern void __bio_clone_fast(struct bio *, struct bio *);
 extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
-extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
 
 extern struct bio_set fs_bio_set;
 
-- 
cgit v1.2.3


From 6e30396767508101eacec8b93b068e8905e660dc Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Wed, 20 Jun 2018 22:32:42 +0530
Subject: sched/numa: Remove redundant field

'numa_entry' is a struct list_head defined in task_struct, but never used.

No functional change.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Rik van Riel <riel@surriel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1529514181-9842-2-git-send-email-srikar@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 43731fe51c97..e0f4f56c9310 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1017,7 +1017,6 @@ struct task_struct {
 	u64				last_sum_exec_runtime;
 	struct callback_head		numa_work;
 
-	struct list_head		numa_entry;
 	struct numa_group		*numa_group;
 
 	/*
-- 
cgit v1.2.3


From fd2efaa4eb5317c3a86357a83a7d456a1b86a0ac Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Jul 2018 12:30:11 +0100
Subject: locking/atomics: Rework ordering barriers

Currently architectures can override __atomic_op_*() to define the barriers
used before/after a relaxed atomic when used to build acquire/release/fence
variants.

This has the unfortunate property of requiring the architecture to define the
full wrapper for the atomics, rather than just the barriers they care about,
and gets in the way of generating atomics which can be easily read.

Instead, this patch has architectures define an optional set of barriers:

* __atomic_acquire_fence()
* __atomic_release_fence()
* __atomic_pre_full_fence()
* __atomic_post_full_fence()

... which <linux/atomic.h> uses to build the wrappers.

It would be nice if we could undef these, along with the __atomic_op_*()
wrappers, but that would break the cmpxchg() wrappers, which are written
in preprocessor. Undefs would have been nice, but alas.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Will Deacon <will.deacon@arm.com>
Cc: Andrea Parri <parri.andrea@gmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: andy.shevchenko@gmail.com
Cc: arnd@arndb.de
Cc: aryabinin@virtuozzo.com
Cc: catalin.marinas@arm.com
Cc: dvyukov@google.com
Cc: glider@google.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: peter@hurleysoftware.com
Link: http://lkml.kernel.org/r/20180716113017.3909-7-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/include/asm/atomic.h   |  8 ++++----
 arch/powerpc/include/asm/atomic.h | 17 +++++------------
 arch/riscv/include/asm/atomic.h   | 17 +++++------------
 include/linux/atomic.h            | 38 ++++++++++++++++++++++----------------
 4 files changed, 36 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index 4a6a8f58c9c9..150a1c5d6a2c 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -18,11 +18,11 @@
  * To ensure dependency ordering is preserved for the _relaxed and
  * _release atomics, an smp_read_barrier_depends() is unconditionally
  * inserted into the _relaxed variants, which are used to build the
- * barriered versions. To avoid redundant back-to-back fences, we can
- * define the _acquire and _fence versions explicitly.
+ * barriered versions. Avoid redundant back-to-back fences in the
+ * _acquire and _fence versions.
  */
-#define __atomic_op_acquire(op, args...)	op##_relaxed(args)
-#define __atomic_op_fence			__atomic_op_release
+#define __atomic_acquire_fence()
+#define __atomic_post_full_fence()
 
 #define ATOMIC_INIT(i)		{ (i) }
 #define ATOMIC64_INIT(i)	{ (i) }
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index a0156cb43d1f..963abf8bf1c0 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -18,18 +18,11 @@
  * a "bne-" instruction at the end, so an isync is enough as a acquire barrier
  * on the platform without lwsync.
  */
-#define __atomic_op_acquire(op, args...)				\
-({									\
-	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
-	__asm__ __volatile__(PPC_ACQUIRE_BARRIER "" : : : "memory");	\
-	__ret;								\
-})
-
-#define __atomic_op_release(op, args...)				\
-({									\
-	__asm__ __volatile__(PPC_RELEASE_BARRIER "" : : : "memory");	\
-	op##_relaxed(args);						\
-})
+#define __atomic_acquire_fence()					\
+	__asm__ __volatile__(PPC_ACQUIRE_BARRIER "" : : : "memory")
+
+#define __atomic_release_fence()					\
+	__asm__ __volatile__(PPC_RELEASE_BARRIER "" : : : "memory")
 
 static __inline__ int atomic_read(const atomic_t *v)
 {
diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index 512b89485790..c452359c9cb8 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -25,18 +25,11 @@
 
 #define ATOMIC_INIT(i)	{ (i) }
 
-#define __atomic_op_acquire(op, args...)				\
-({									\
-	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
-	__asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory");	\
-	__ret;								\
-})
-
-#define __atomic_op_release(op, args...)				\
-({									\
-	__asm__ __volatile__(RISCV_RELEASE_BARRIER "" ::: "memory");	\
-	op##_relaxed(args);						\
-})
+#define __atomic_acquire_fence()					\
+	__asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory")
+
+#define __atomic_release_fence()					\
+	__asm__ __volatile__(RISCV_RELEASE_BARRIER "" ::: "memory");
 
 static __always_inline int atomic_read(const atomic_t *v)
 {
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 8e04f1f69bd9..1e8e88bdaf09 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -38,40 +38,46 @@
  * barriers on top of the relaxed variant. In the case where the relaxed
  * variant is already fully ordered, no additional barriers are needed.
  *
- * Besides, if an arch has a special barrier for acquire/release, it could
- * implement its own __atomic_op_* and use the same framework for building
- * variants
- *
- * If an architecture overrides __atomic_op_acquire() it will probably want
- * to define smp_mb__after_spinlock().
+ * If an architecture overrides __atomic_acquire_fence() it will probably
+ * want to define smp_mb__after_spinlock().
  */
-#ifndef __atomic_op_acquire
+#ifndef __atomic_acquire_fence
+#define __atomic_acquire_fence		smp_mb__after_atomic
+#endif
+
+#ifndef __atomic_release_fence
+#define __atomic_release_fence		smp_mb__before_atomic
+#endif
+
+#ifndef __atomic_pre_full_fence
+#define __atomic_pre_full_fence		smp_mb__before_atomic
+#endif
+
+#ifndef __atomic_post_full_fence
+#define __atomic_post_full_fence	smp_mb__after_atomic
+#endif
+
 #define __atomic_op_acquire(op, args...)				\
 ({									\
 	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
-	smp_mb__after_atomic();						\
+	__atomic_acquire_fence();					\
 	__ret;								\
 })
-#endif
 
-#ifndef __atomic_op_release
 #define __atomic_op_release(op, args...)				\
 ({									\
-	smp_mb__before_atomic();					\
+	__atomic_release_fence();					\
 	op##_relaxed(args);						\
 })
-#endif
 
-#ifndef __atomic_op_fence
 #define __atomic_op_fence(op, args...)					\
 ({									\
 	typeof(op##_relaxed(args)) __ret;				\
-	smp_mb__before_atomic();					\
+	__atomic_pre_full_fence();					\
 	__ret = op##_relaxed(args);					\
-	smp_mb__after_atomic();						\
+	__atomic_post_full_fence();					\
 	__ret;								\
 })
-#endif
 
 /* atomic_add_return_relaxed */
 #ifndef atomic_add_return_relaxed
-- 
cgit v1.2.3


From d27fb99f62af7b79c542d161aa5155ed57271ddc Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 23 Jul 2018 22:42:48 +0100
Subject: dma-mapping: relax warning for per-device areas

The reasons why dma_free_attrs() should not be called from IRQ context
are not necessarily obvious and somewhat buried in the development
history, so let's start by documenting the warning itself to help anyone
who does happen to hit it and wonder what the deal is.

However, this check turns out to be slightly over-restrictive for the
way that per-device memory has been spliced into the general API, since
for that case we know that dma_declare_coherent_memory() has created an
appropriate CPU mapping for the entire area and nothing dynamic should
be happening. Given that the usage model for per-device memory is often
more akin to streaming DMA than 'real' coherent DMA (e.g. allocating and
freeing space to copy short-lived packets in and out), it is also
somewhat more reasonable for those operations to happen in IRQ handlers
for such devices.

Therefore, let's move the irqs_disabled() check down past the per-device
area hook, so that that gets a chance to resolve the request before we
reach definite "you're doing it wrong" territory.

Reported-by: Fredrik Noring <noring@nocrew.org>
Tested-by: Fredrik Noring <noring@nocrew.org>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f9cc309507d9..1db6a6b46d0d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -538,10 +538,17 @@ static inline void dma_free_attrs(struct device *dev, size_t size,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!ops);
-	WARN_ON(irqs_disabled());
 
 	if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr))
 		return;
+	/*
+	 * On non-coherent platforms which implement DMA-coherent buffers via
+	 * non-cacheable remaps, ops->free() may call vunmap(). Thus getting
+	 * this far in IRQ context is a) at risk of a BUG_ON() or trying to
+	 * sleep on some machines, and b) an indication that the driver is
+	 * probably misusing the coherent API anyway.
+	 */
+	WARN_ON(irqs_disabled());
 
 	if (!ops->free || !cpu_addr)
 		return;
-- 
cgit v1.2.3


From 3c507b8af638c67d4a80d70091d2057ecb01e8a6 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 23 Jul 2018 21:40:07 +0200
Subject: net: phy: add helper phy_polling_mode

Add a helper for checking whether polling is used to detect PHY status
changes.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c |  8 ++++----
 include/linux/phy.h   | 10 ++++++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 04780db4c963..1ee25877c4d1 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -525,7 +525,7 @@ static int phy_start_aneg_priv(struct phy_device *phydev, bool sync)
 	 * negotiation may already be done and aneg interrupt may not be
 	 * generated.
 	 */
-	if (phydev->irq != PHY_POLL && phydev->state == PHY_AN) {
+	if (!phy_polling_mode(phydev) && phydev->state == PHY_AN) {
 		err = phy_aneg_done(phydev);
 		if (err > 0) {
 			trigger = true;
@@ -983,7 +983,7 @@ void phy_state_machine(struct work_struct *work)
 			needs_aneg = true;
 		break;
 	case PHY_NOLINK:
-		if (phydev->irq != PHY_POLL)
+		if (!phy_polling_mode(phydev))
 			break;
 
 		err = phy_read_status(phydev);
@@ -1024,7 +1024,7 @@ void phy_state_machine(struct work_struct *work)
 		/* Only register a CHANGE if we are polling and link changed
 		 * since latest checking.
 		 */
-		if (phydev->irq == PHY_POLL) {
+		if (phy_polling_mode(phydev)) {
 			old_link = phydev->link;
 			err = phy_read_status(phydev);
 			if (err)
@@ -1123,7 +1123,7 @@ void phy_state_machine(struct work_struct *work)
 	 * PHY, if PHY_IGNORE_INTERRUPT is set, then we will be moving
 	 * between states from phy_mac_interrupt()
 	 */
-	if (phydev->irq == PHY_POLL)
+	if (phy_polling_mode(phydev))
 		queue_delayed_work(system_power_efficient_wq, &phydev->state_queue,
 				   PHY_STATE_TIME * HZ);
 }
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 075c2f770d3e..cd6f637cbbfb 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -824,6 +824,16 @@ static inline bool phy_interrupt_is_valid(struct phy_device *phydev)
 	return phydev->irq != PHY_POLL && phydev->irq != PHY_IGNORE_INTERRUPT;
 }
 
+/**
+ * phy_polling_mode - Convenience function for testing whether polling is
+ * used to detect PHY status changes
+ * @phydev: the phy_device struct
+ */
+static inline bool phy_polling_mode(struct phy_device *phydev)
+{
+	return phydev->irq == PHY_POLL;
+}
+
 /**
  * phy_is_internal - Convenience function for testing if a PHY is internal
  * @phydev: the phy_device struct
-- 
cgit v1.2.3


From 6f4ceee9305dc3fe74099159b460f4b56b506f1d Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 24 Jul 2018 14:26:04 -0400
Subject: cpu/hotplug: Add a cpus_read_trylock() function

There are use cases where it can be useful to have a cpus_read_trylock()
function to work around circular lock dependency problem involving
the cpu_hotplug_lock.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpu.h | 2 ++
 kernel/cpu.c        | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index a97a63eef59f..e850bfea3e84 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -103,6 +103,7 @@ extern void cpus_write_lock(void);
 extern void cpus_write_unlock(void);
 extern void cpus_read_lock(void);
 extern void cpus_read_unlock(void);
+extern int  cpus_read_trylock(void);
 extern void lockdep_assert_cpus_held(void);
 extern void cpu_hotplug_disable(void);
 extern void cpu_hotplug_enable(void);
@@ -115,6 +116,7 @@ static inline void cpus_write_lock(void) { }
 static inline void cpus_write_unlock(void) { }
 static inline void cpus_read_lock(void) { }
 static inline void cpus_read_unlock(void) { }
+static inline int  cpus_read_trylock(void) { return true; }
 static inline void lockdep_assert_cpus_held(void) { }
 static inline void cpu_hotplug_disable(void) { }
 static inline void cpu_hotplug_enable(void) { }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0db8938fbb23..307486baa477 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -290,6 +290,12 @@ void cpus_read_lock(void)
 }
 EXPORT_SYMBOL_GPL(cpus_read_lock);
 
+int cpus_read_trylock(void)
+{
+	return percpu_down_read_trylock(&cpu_hotplug_lock);
+}
+EXPORT_SYMBOL_GPL(cpus_read_trylock);
+
 void cpus_read_unlock(void)
 {
 	percpu_up_read(&cpu_hotplug_lock);
-- 
cgit v1.2.3


From 359f642700f2ff05d9c94cd9216c97af7b8e9553 Mon Sep 17 00:00:00 2001
From: Greg Edwards <gedwards@ddn.com>
Date: Wed, 25 Jul 2018 10:22:58 -0400
Subject: block: move bio_integrity_{intervals,bytes} into blkdev.h

This allows bio_integrity_bytes() to be called from drivers instead of
open coding it.

Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Greg Edwards <gedwards@ddn.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio-integrity.c  | 22 ----------------------
 include/linux/blkdev.h | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index add7c7c85335..67b5fb861a51 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -159,28 +159,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
-/**
- * bio_integrity_intervals - Return number of integrity intervals for a bio
- * @bi:		blk_integrity profile for device
- * @sectors:	Size of the bio in 512-byte sectors
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the data integrity
- * interval size of the storage device.  Convert the block layer sectors
- * to the appropriate number of integrity intervals.
- */
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
-						   unsigned int sectors)
-{
-	return sectors >> (bi->interval_exp - 9);
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
-					       unsigned int sectors)
-{
-	return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
-}
-
 /**
  * bio_integrity_process - Process integrity metadata for a bio
  * @bio:	bio to generate/verify integrity metadata for
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 331a6cb8805f..050d599f5ea9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1865,6 +1865,28 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
 				bip_next->bip_vec[0].bv_offset);
 }
 
+/**
+ * bio_integrity_intervals - Return number of integrity intervals for a bio
+ * @bi:		blk_integrity profile for device
+ * @sectors:	Size of the bio in 512-byte sectors
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the data integrity
+ * interval size of the storage device.  Convert the block layer sectors
+ * to the appropriate number of integrity intervals.
+ */
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+						   unsigned int sectors)
+{
+	return sectors >> (bi->interval_exp - 9);
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+					       unsigned int sectors)
+{
+	return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
+}
+
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 
 struct bio;
@@ -1938,6 +1960,18 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
 	return false;
 }
 
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+						   unsigned int sectors)
+{
+	return 0;
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+					       unsigned int sectors)
+{
+	return 0;
+}
+
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 struct block_device_operations {
-- 
cgit v1.2.3


From 038709071328ff68a936c6b8c33a24a805eea3c5 Mon Sep 17 00:00:00 2001
From: Zhu Yi <yi.zhu5@cn.bosch.com>
Date: Wed, 13 Jun 2018 16:37:17 +0200
Subject: can: dev: enable multi-queue for SocketCAN devices

The existing SocketCAN implementation provides alloc_candev() to
allocate a CAN device using a single Tx and Rx queue. This can lead to
priority inversion in case the single Tx queue is already full with low
priority messages and a high priority message needs to be sent while the
bus is fully loaded with medium priority messages.

This problem can be solved by using the existing multi-queue support of
the network subsytem. The commit makes it possible to use multi-queue in
the CAN subsystem in the same way it is used in the Ethernet subsystem
by adding an alloc_candev_mqs() call and accompanying macros. With this
support a CAN device can use multi-queue qdisc (e.g. mqprio) to avoid
the aforementioned priority inversion.

The exisiting functionality of alloc_candev() is the same as before.

CAN devices need to have prioritized multiple hardware queues or are
able to abort waiting for arbitration to make sensible use of
multi-queues.

Signed-off-by: Zhu Yi <yi.zhu5@cn.bosch.com>
Signed-off-by: Mark Jonas <mark.jonas@de.bosch.com>
Reviewed-by: Heiko Schocher <hs@denx.de>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev.c   | 8 +++++---
 include/linux/can/dev.h | 7 ++++++-
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 5bb6e8896601..49163570a63a 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -701,7 +701,8 @@ EXPORT_SYMBOL_GPL(alloc_can_err_skb);
 /*
  * Allocate and setup space for the CAN network device
  */
-struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max)
+struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
+				    unsigned int txqs, unsigned int rxqs)
 {
 	struct net_device *dev;
 	struct can_priv *priv;
@@ -713,7 +714,8 @@ struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max)
 	else
 		size = sizeof_priv;
 
-	dev = alloc_netdev(size, "can%d", NET_NAME_UNKNOWN, can_setup);
+	dev = alloc_netdev_mqs(size, "can%d", NET_NAME_UNKNOWN, can_setup,
+			       txqs, rxqs);
 	if (!dev)
 		return NULL;
 
@@ -732,7 +734,7 @@ struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max)
 
 	return dev;
 }
-EXPORT_SYMBOL_GPL(alloc_candev);
+EXPORT_SYMBOL_GPL(alloc_candev_mqs);
 
 /*
  * Free space of the CAN network device
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 055aaf5ed9af..a83e1f632eb7 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -143,7 +143,12 @@ u8 can_dlc2len(u8 can_dlc);
 /* map the sanitized data length to an appropriate data length code */
 u8 can_len2dlc(u8 len);
 
-struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max);
+struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
+				    unsigned int txqs, unsigned int rxqs);
+#define alloc_candev(sizeof_priv, echo_skb_max) \
+	alloc_candev_mqs(sizeof_priv, echo_skb_max, 1, 1)
+#define alloc_candev_mq(sizeof_priv, echo_skb_max, count) \
+	alloc_candev_mqs(sizeof_priv, echo_skb_max, count, count)
 void free_candev(struct net_device *dev);
 
 /* a candev safe wrapper around netdev_priv */
-- 
cgit v1.2.3


From 1fb2e3f276ddafee81073d884f599cd2574c31e2 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Tue, 17 Jul 2018 18:05:36 +0200
Subject: lib/crc: Move polynomial definition to separate header

Allow other drivers and parts of kernel to use the same define for
CRC32 polynomial, instead of duplicating it in many places.  This code
does not bring any functional changes, except moving existing code.

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/crc32poly.h | 20 ++++++++++++++++++++
 lib/crc32.c               |  1 +
 lib/crc32defs.h           | 14 --------------
 lib/gen_crc32table.c      |  1 +
 4 files changed, 22 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/crc32poly.h

(limited to 'include/linux')

diff --git a/include/linux/crc32poly.h b/include/linux/crc32poly.h
new file mode 100644
index 000000000000..7ad5aa92d3c7
--- /dev/null
+++ b/include/linux/crc32poly.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CRC32_POLY_H
+#define _LINUX_CRC32_POLY_H
+
+/*
+ * There are multiple 16-bit CRC polynomials in common use, but this is
+ * *the* standard CRC-32 polynomial, first popularized by Ethernet.
+ * x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+x^0
+ */
+#define CRCPOLY_LE 0xedb88320
+#define CRCPOLY_BE 0x04c11db7
+
+/*
+ * This is the CRC32c polynomial, as outlined by Castagnoli.
+ * x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+x^19+x^18+x^14+x^13+x^11+x^10+x^9+
+ * x^8+x^6+x^0
+ */
+#define CRC32C_POLY_LE 0x82F63B78
+
+#endif /* _LINUX_CRC32_POLY_H */
diff --git a/lib/crc32.c b/lib/crc32.c
index 2ef20fe84b69..341c54cb4edf 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -27,6 +27,7 @@
 /* see: Documentation/crc32.txt for a description of algorithms */
 
 #include <linux/crc32.h>
+#include <linux/crc32poly.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/sched.h>
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index cb275a28a750..0c8fb5923e7e 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -1,18 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*
- * There are multiple 16-bit CRC polynomials in common use, but this is
- * *the* standard CRC-32 polynomial, first popularized by Ethernet.
- * x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+x^0
- */
-#define CRCPOLY_LE 0xedb88320
-#define CRCPOLY_BE 0x04c11db7
-
-/*
- * This is the CRC32c polynomial, as outlined by Castagnoli.
- * x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+x^19+x^18+x^14+x^13+x^11+x^10+x^9+
- * x^8+x^6+x^0
- */
-#define CRC32C_POLY_LE 0x82F63B78
 
 /* Try to choose an implementation variant via Kconfig */
 #ifdef CONFIG_CRC32_SLICEBY8
diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c
index 8f26660ea10a..34c3bc826f45 100644
--- a/lib/gen_crc32table.c
+++ b/lib/gen_crc32table.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <stdio.h>
+#include "../include/linux/crc32poly.h"
 #include "../include/generated/autoconf.h"
 #include "crc32defs.h"
 #include <inttypes.h>
-- 
cgit v1.2.3


From e37f2f93afe594682702439ca34eb8130598cdf2 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Tue, 17 Jul 2018 18:05:37 +0200
Subject: lib/crc: Use consistent naming for CRC-32 polynomials

Header was defining CRCPOLY_LE/BE and CRC32C_POLY_LE but in fact all of
them are CRC-32 polynomials so use consistent naming.

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/crc32poly.h |  4 ++--
 lib/crc32.c               | 10 +++++-----
 lib/gen_crc32table.c      |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crc32poly.h b/include/linux/crc32poly.h
index 7ad5aa92d3c7..62c4b7790a28 100644
--- a/include/linux/crc32poly.h
+++ b/include/linux/crc32poly.h
@@ -7,8 +7,8 @@
  * *the* standard CRC-32 polynomial, first popularized by Ethernet.
  * x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+x^0
  */
-#define CRCPOLY_LE 0xedb88320
-#define CRCPOLY_BE 0x04c11db7
+#define CRC32_POLY_LE 0xedb88320
+#define CRC32_POLY_BE 0x04c11db7
 
 /*
  * This is the CRC32c polynomial, as outlined by Castagnoli.
diff --git a/lib/crc32.c b/lib/crc32.c
index 341c54cb4edf..a6c9afafc8c8 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -185,7 +185,7 @@ static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
 #if CRC_LE_BITS == 1
 u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
 {
-	return crc32_le_generic(crc, p, len, NULL, CRCPOLY_LE);
+	return crc32_le_generic(crc, p, len, NULL, CRC32_POLY_LE);
 }
 u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
 {
@@ -195,7 +195,7 @@ u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
 u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
 {
 	return crc32_le_generic(crc, p, len,
-			(const u32 (*)[256])crc32table_le, CRCPOLY_LE);
+			(const u32 (*)[256])crc32table_le, CRC32_POLY_LE);
 }
 u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
 {
@@ -269,7 +269,7 @@ static u32 __attribute_const__ crc32_generic_shift(u32 crc, size_t len,
 
 u32 __attribute_const__ crc32_le_shift(u32 crc, size_t len)
 {
-	return crc32_generic_shift(crc, len, CRCPOLY_LE);
+	return crc32_generic_shift(crc, len, CRC32_POLY_LE);
 }
 
 u32 __attribute_const__ __crc32c_le_shift(u32 crc, size_t len)
@@ -331,13 +331,13 @@ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
 #if CRC_LE_BITS == 1
 u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
 {
-	return crc32_be_generic(crc, p, len, NULL, CRCPOLY_BE);
+	return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE);
 }
 #else
 u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
 {
 	return crc32_be_generic(crc, p, len,
-			(const u32 (*)[256])crc32table_be, CRCPOLY_BE);
+			(const u32 (*)[256])crc32table_be, CRC32_POLY_BE);
 }
 #endif
 EXPORT_SYMBOL(crc32_be);
diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c
index 34c3bc826f45..f755b997b967 100644
--- a/lib/gen_crc32table.c
+++ b/lib/gen_crc32table.c
@@ -58,7 +58,7 @@ static void crc32init_le_generic(const uint32_t polynomial,
 
 static void crc32init_le(void)
 {
-	crc32init_le_generic(CRCPOLY_LE, crc32table_le);
+	crc32init_le_generic(CRC32_POLY_LE, crc32table_le);
 }
 
 static void crc32cinit_le(void)
@@ -77,7 +77,7 @@ static void crc32init_be(void)
 	crc32table_be[0][0] = 0;
 
 	for (i = 1; i < BE_TABLE_SIZE; i <<= 1) {
-		crc = (crc << 1) ^ ((crc & 0x80000000) ? CRCPOLY_BE : 0);
+		crc = (crc << 1) ^ ((crc & 0x80000000) ? CRC32_POLY_BE : 0);
 		for (j = 0; j < i; j++)
 			crc32table_be[0][i + j] = crc ^ crc32table_be[0][j];
 	}
-- 
cgit v1.2.3


From f07d141fe9430cdf9f8a65a87c4136bd83b8ab2e Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 23 Jul 2018 23:16:07 +0100
Subject: dma-mapping: Generalise dma_32bit_limit flag

Whilst the notion of an upstream DMA restriction is most commonly seen
in PCI host bridges saddled with a 32-bit native interface, a more
general version of the same issue can exist on complex SoCs where a bus
or point-to-point interconnect link from a device's DMA master interface
to another component along the path to memory (often an IOMMU) may carry
fewer address bits than the interfaces at both ends nominally support.
In order to properly deal with this, the first step is to expand the
dma_32bit_limit flag into an arbitrary mask.

To minimise the impact on existing code, we'll make sure to only
consider this new mask valid if set. That makes sense anyway, since a
mask of zero would represent DMA not being wired up at all, and that
would be better handled by not providing valid ops in the first place.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/x86/kernel/pci-dma.c | 2 +-
 include/linux/device.h    | 6 +++---
 kernel/dma/direct.c       | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index ab5d9dd668d2..80f9fe8d27d0 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -175,7 +175,7 @@ rootfs_initcall(pci_iommu_init);
 
 static int via_no_dac_cb(struct pci_dev *pdev, void *data)
 {
-	pdev->dev.dma_32bit_limit = true;
+	pdev->dev.bus_dma_mask = DMA_BIT_MASK(32);
 	return 0;
 }
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 055a69dbcd18..6d3b000be57e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -886,6 +886,8 @@ struct dev_links_info {
  * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all
  * 		hardware supports 64-bit addresses for consistent allocations
  * 		such descriptors.
+ * @bus_dma_mask: Mask of an upstream bridge or bus which imposes a smaller DMA
+ *		limit than the device itself supports.
  * @dma_pfn_offset: offset of DMA memory range relatively of RAM
  * @dma_parms:	A low level driver may set these to teach IOMMU code about
  * 		segment limitations.
@@ -912,8 +914,6 @@ struct dev_links_info {
  * @offline:	Set after successful invocation of bus type's .offline().
  * @of_node_reused: Set if the device-tree node is shared with an ancestor
  *              device.
- * @dma_32bit_limit: bridge limited to 32bit DMA even if the device itself
- *		indicates support for a higher limit in the dma_mask field.
  *
  * At the lowest level, every device in a Linux system is represented by an
  * instance of struct device. The device structure contains the information
@@ -967,6 +967,7 @@ struct device {
 					     not all hardware supports
 					     64 bit addresses for consistent
 					     allocations such descriptors. */
+	u64		bus_dma_mask;	/* upstream dma_mask constraint */
 	unsigned long	dma_pfn_offset;
 
 	struct device_dma_parameters *dma_parms;
@@ -1002,7 +1003,6 @@ struct device {
 	bool			offline_disabled:1;
 	bool			offline:1;
 	bool			of_node_reused:1;
-	bool			dma_32bit_limit:1;
 };
 
 static inline struct device *kobj_to_dev(struct kobject *kobj)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 8be8106270c2..c2860c5a9e96 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -180,10 +180,10 @@ int dma_direct_supported(struct device *dev, u64 mask)
 		return 0;
 #endif
 	/*
-	 * Various PCI/PCIe bridges have broken support for > 32bit DMA even
-	 * if the device itself might support it.
+	 * Upstream PCI/PCIe bridges or SoC interconnects may not carry
+	 * as many DMA address bits as the device itself supports.
 	 */
-	if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32))
+	if (dev->bus_dma_mask && mask > dev->bus_dma_mask)
 		return 0;
 	return 1;
 }
-- 
cgit v1.2.3


From 9b89bc3857a6c0dfda18ddae2a42c114ecc32753 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 12 May 2018 18:18:12 +0200
Subject: nvme.h: add support for the log specific field

NVMe 1.3 added a new log specific field to the get log page CQ
defintion, add it to our get_log_page SQ structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
---
 include/linux/nvme.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 80dfedcf0bf7..39f05b0b8c7f 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -885,7 +885,7 @@ struct nvme_get_log_page_command {
 	__u64			rsvd2[2];
 	union nvme_data_ptr	dptr;
 	__u8			lid;
-	__u8			rsvd10;
+	__u8			lsp; /* upper 4 bits reserved */
 	__le16			numdl;
 	__le16			numdu;
 	__u16			rsvd11;
-- 
cgit v1.2.3


From 1a37621658fe06b10cf8bac02c32304d2a1c888c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 13 May 2018 18:53:57 +0200
Subject: nvme.h: add ANA definitions

Add various defintions from NVMe 1.3 TP 4004.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
---
 include/linux/nvme.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 39f05b0b8c7f..64c9175723de 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -242,7 +242,12 @@ struct nvme_id_ctrl {
 	__le32			sanicap;
 	__le32			hmminds;
 	__le16			hmmaxd;
-	__u8			rsvd338[174];
+	__u8			rsvd338[4];
+	__u8			anatt;
+	__u8			anacap;
+	__le32			anagrpmax;
+	__le32			nanagrpid;
+	__u8			rsvd352[160];
 	__u8			sqes;
 	__u8			cqes;
 	__le16			maxcmd;
@@ -258,7 +263,8 @@ struct nvme_id_ctrl {
 	__le16			acwu;
 	__u8			rsvd534[2];
 	__le32			sgls;
-	__u8			rsvd540[228];
+	__le32			mnan;
+	__u8			rsvd544[224];
 	char			subnqn[256];
 	__u8			rsvd1024[768];
 	__le32			ioccsz;
@@ -312,7 +318,9 @@ struct nvme_id_ns {
 	__le16			nabspf;
 	__le16			noiob;
 	__u8			nvmcap[16];
-	__u8			rsvd64[40];
+	__u8			rsvd64[28];
+	__le32			anagrpid;
+	__u8			rsvd96[8];
 	__u8			nguid[16];
 	__u8			eui64[8];
 	struct nvme_lbaf	lbaf[16];
@@ -425,6 +433,32 @@ struct nvme_effects_log {
 	__u8   resv[2048];
 };
 
+enum nvme_ana_state {
+	NVME_ANA_OPTIMIZED		= 0x01,
+	NVME_ANA_NONOPTIMIZED		= 0x02,
+	NVME_ANA_INACCESSIBLE		= 0x03,
+	NVME_ANA_PERSISTENT_LOSS	= 0x04,
+	NVME_ANA_CHANGE			= 0x0f,
+};
+
+struct nvme_ana_group_desc {
+	__le32	grpid;
+	__le32	nnsids;
+	__le64	chgcnt;
+	__u8	state;
+	__u8	rsvd17[7];
+	__le32	nsids[];
+};
+
+/* flag for the log specific field of the ANA log */
+#define NVME_ANA_LOG_RGO	(1 << 0)
+
+struct nvme_ana_rsp_hdr {
+	__le64	chgcnt;
+	__le16	ngrps;
+	__le16	rsvd10[3];
+};
+
 enum {
 	NVME_SMART_CRIT_SPARE		= 1 << 0,
 	NVME_SMART_CRIT_TEMPERATURE	= 1 << 1,
@@ -444,11 +478,13 @@ enum {
 enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
+	NVME_AER_NOTICE_ANA		= 0x03,
 };
 
 enum {
 	NVME_AEN_CFG_NS_ATTR		= 1 << 8,
 	NVME_AEN_CFG_FW_ACT		= 1 << 9,
+	NVME_AEN_CFG_ANA_CHANGE		= 1 << 11,
 };
 
 struct nvme_lba_range_type {
@@ -763,6 +799,7 @@ enum {
 	NVME_LOG_FW_SLOT	= 0x03,
 	NVME_LOG_CHANGED_NS	= 0x04,
 	NVME_LOG_CMD_EFFECTS	= 0x05,
+	NVME_LOG_ANA		= 0x0c,
 	NVME_LOG_DISC		= 0x70,
 	NVME_LOG_RESERVATION	= 0x80,
 	NVME_FWACT_REPL		= (0 << 3),
@@ -1185,6 +1222,13 @@ enum {
 	NVME_SC_ACCESS_DENIED		= 0x286,
 	NVME_SC_UNWRITTEN_BLOCK		= 0x287,
 
+	/*
+	 * Path-related Errors:
+	 */
+	NVME_SC_ANA_PERSISTENT_LOSS	= 0x301,
+	NVME_SC_ANA_INACCESSIBLE	= 0x302,
+	NVME_SC_ANA_TRANSITION		= 0x303,
+
 	NVME_SC_DNR			= 0x4000,
 };
 
-- 
cgit v1.2.3


From 22a65aa8b1a84ca429c0fc8415dee5681ab36eb3 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Mon, 25 Dec 2017 18:40:52 +0200
Subject: net/mlx5e: Vxlan, check maximum number of UDP ports

The NIC has a limited number of offloaded VXLAN UDP ports (usually 4).
Instead of letting the firmware fail when trying to add more ports than
it can handle, let the driver check it on its own.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h    |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c | 14 ++++++++++++++
 include/linux/mlx5/mlx5_ifc.h                   |  4 +++-
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c41cfc2a4b70..c4d4db8722f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -657,6 +657,7 @@ enum {
 struct mlx5e_vxlan_db {
 	spinlock_t			lock; /* protect vxlan table */
 	struct radix_tree_root		tree;
+	int				num_ports;
 };
 
 struct mlx5e_l2_rule {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 2f699998d13e..e3af2efe18ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -52,6 +52,11 @@ void mlx5e_vxlan_init(struct mlx5e_priv *priv)
 		mlx5e_vxlan_add_port(priv, 4789);
 }
 
+static inline u8 mlx5e_vxlan_max_udp_ports(struct mlx5_core_dev *mdev)
+{
+	return MLX5_CAP_ETH(mdev, max_vxlan_udp_ports) ?: 4;
+}
+
 static int mlx5e_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port)
 {
 	u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)]   = {0};
@@ -98,6 +103,13 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port)
 		return;
 	}
 
+	if (vxlan_db->num_ports >= mlx5e_vxlan_max_udp_ports(priv->mdev)) {
+		netdev_info(priv->netdev,
+			    "UDP port (%d) not offloaded, max number of UDP ports (%d) are already offloaded\n",
+			    port, mlx5e_vxlan_max_udp_ports(priv->mdev));
+		return;
+	}
+
 	if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port))
 		return;
 
@@ -114,6 +126,7 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port)
 	if (err)
 		goto err_free;
 
+	vxlan_db->num_ports++;
 	return;
 
 err_free:
@@ -163,6 +176,7 @@ out_unlock:
 	if (remove) {
 		mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
 		kfree(vxlan);
+		vxlan_db->num_ports--;
 	}
 	mutex_unlock(&priv->state_lock);
 	kfree(vxlan_work);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 22f54bedfaae..60c2308fe062 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -668,7 +668,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 	u8         swp[0x1];
 	u8         swp_csum[0x1];
 	u8         swp_lso[0x1];
-	u8         reserved_at_23[0x1b];
+	u8         reserved_at_23[0xd];
+	u8         max_vxlan_udp_ports[0x8];
+	u8         reserved_at_38[0x6];
 	u8         max_geneve_opt_len[0x1];
 	u8         tunnel_stateless_geneve_rx[0x1];
 
-- 
cgit v1.2.3


From 358aa5ce288aa1085f0f3ef9f315119563fa6541 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Wed, 9 May 2018 13:28:00 -0700
Subject: net/mlx5e: Vxlan, move vxlan logic to core driver

Move vxlan logic and objects to mlx5 core dirver.
Since it going to be used from different mlx5 interfaces.
e.g. mlx5e PF NIC netdev and mlx5e E-Switch representors.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   2 -
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  21 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |   6 +-
 .../net/ethernet/mellanox/mlx5/core/lib/vxlan.c    | 230 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/lib/vxlan.h    |  64 ++++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   5 +
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c    | 230 ---------------------
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.h    |  64 ------
 include/linux/mlx5/driver.h                        |   2 +
 10 files changed, 315 insertions(+), 313 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/vxlan.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index ae2bdcb1647c..f20fda1ced4f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -14,8 +14,8 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \
 		fpga/ipsec.o fpga/tls.o
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
-		en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o vxlan.o \
-		en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o
+		en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
+		en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o lib/vxlan.o
 
 mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 1bd4536b9061..c7ed3d20fd54 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -52,7 +52,6 @@
 #include "wq.h"
 #include "mlx5_core.h"
 #include "en_stats.h"
-#include "vxlan.h"
 
 struct page_pool;
 
@@ -812,7 +811,6 @@ struct mlx5e_priv {
 	u32                        tx_rates[MLX5E_MAX_NUM_SQS];
 
 	struct mlx5e_flow_steering fs;
-	struct mlx5_vxlan          *vxlan;
 
 	struct workqueue_struct    *wq;
 	struct work_struct         update_carrier_work;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ef4b2f0c427c..fde35021a257 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -45,7 +45,7 @@
 #include "en_accel/tls.h"
 #include "accel/ipsec.h"
 #include "accel/tls.h"
-#include "vxlan.h"
+#include "lib/vxlan.h"
 #include "en/port.h"
 #include "en/xdp.h"
 
@@ -2974,7 +2974,7 @@ int mlx5e_open(struct net_device *netdev)
 		mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_UP);
 	mutex_unlock(&priv->state_lock);
 
-	if (mlx5_vxlan_allowed(priv->vxlan))
+	if (mlx5_vxlan_allowed(priv->mdev->vxlan))
 		udp_tunnel_get_rx_info(netdev);
 
 	return err;
@@ -3983,7 +3983,7 @@ static void mlx5e_vxlan_add_work(struct work_struct *work)
 	u16 port = vxlan_work->port;
 
 	mutex_lock(&priv->state_lock);
-	mlx5_vxlan_add_port(priv->vxlan, port);
+	mlx5_vxlan_add_port(priv->mdev->vxlan, port);
 	mutex_unlock(&priv->state_lock);
 
 	kfree(vxlan_work);
@@ -3997,7 +3997,7 @@ static void mlx5e_vxlan_del_work(struct work_struct *work)
 	u16 port = vxlan_work->port;
 
 	mutex_lock(&priv->state_lock);
-	mlx5_vxlan_del_port(priv->vxlan, port);
+	mlx5_vxlan_del_port(priv->mdev->vxlan, port);
 	mutex_unlock(&priv->state_lock);
 	kfree(vxlan_work);
 }
@@ -4028,7 +4028,7 @@ static void mlx5e_add_vxlan_port(struct net_device *netdev,
 	if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
 		return;
 
-	if (!mlx5_vxlan_allowed(priv->vxlan))
+	if (!mlx5_vxlan_allowed(priv->mdev->vxlan))
 		return;
 
 	mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 1);
@@ -4042,7 +4042,7 @@ static void mlx5e_del_vxlan_port(struct net_device *netdev,
 	if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
 		return;
 
-	if (!mlx5_vxlan_allowed(priv->vxlan))
+	if (!mlx5_vxlan_allowed(priv->mdev->vxlan))
 		return;
 
 	mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 0);
@@ -4076,7 +4076,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
 		port = be16_to_cpu(udph->dest);
 
 		/* Verify if UDP port is being offloaded by HW */
-		if (mlx5_vxlan_lookup_port(priv->vxlan, port))
+		if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, port))
 			return features;
 	}
 
@@ -4648,7 +4648,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 	netdev->hw_features      |= NETIF_F_HW_VLAN_CTAG_FILTER;
 	netdev->hw_features      |= NETIF_F_HW_VLAN_STAG_TX;
 
-	if (mlx5_vxlan_allowed(priv->vxlan) || MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) {
+	if (mlx5_vxlan_allowed(mdev->vxlan) || MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) {
 		netdev->hw_enc_features |= NETIF_F_IP_CSUM;
 		netdev->hw_enc_features |= NETIF_F_IPV6_CSUM;
 		netdev->hw_enc_features |= NETIF_F_TSO;
@@ -4656,7 +4656,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 		netdev->hw_enc_features |= NETIF_F_GSO_PARTIAL;
 	}
 
-	if (mlx5_vxlan_allowed(priv->vxlan)) {
+	if (mlx5_vxlan_allowed(mdev->vxlan)) {
 		netdev->hw_features     |= NETIF_F_GSO_UDP_TUNNEL |
 					   NETIF_F_GSO_UDP_TUNNEL_CSUM;
 		netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL |
@@ -4758,8 +4758,6 @@ static void mlx5e_nic_init(struct mlx5_core_dev *mdev,
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	int err;
 
-	priv->vxlan = mlx5_vxlan_create(mdev);
-
 	mlx5e_build_nic_netdev_priv(mdev, netdev, profile, ppriv);
 	err = mlx5e_ipsec_init(priv);
 	if (err)
@@ -4773,7 +4771,6 @@ static void mlx5e_nic_init(struct mlx5_core_dev *mdev,
 
 static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
 {
-	mlx5_vxlan_destroy(priv->vxlan);
 	mlx5e_tls_cleanup(priv);
 	mlx5e_ipsec_cleanup(priv);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 1b4931b62094..288a57f76e84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -50,7 +50,7 @@
 #include "en_rep.h"
 #include "en_tc.h"
 #include "eswitch.h"
-#include "vxlan.h"
+#include "lib/vxlan.h"
 #include "fs_core.h"
 #include "en/port.h"
 
@@ -1133,7 +1133,7 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv,
 		if (memchr_inv(&mask->dst, 0xff, sizeof(mask->dst)))
 			goto vxlan_match_offload_err;
 
-		if (mlx5_vxlan_lookup_port(up_priv->vxlan, be16_to_cpu(key->dst)) &&
+		if (mlx5_vxlan_lookup_port(up_priv->mdev->vxlan, be16_to_cpu(key->dst)) &&
 		    MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap))
 			parse_vxlan_attr(spec, f);
 		else {
@@ -2557,7 +2557,7 @@ vxlan_encap_offload_err:
 		return -EOPNOTSUPP;
 	}
 
-	if (mlx5_vxlan_lookup_port(up_priv->vxlan, be16_to_cpu(key->tp_dst)) &&
+	if (mlx5_vxlan_lookup_port(up_priv->mdev->vxlan, be16_to_cpu(key->tp_dst)) &&
 	    MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) {
 		tunnel_type = MLX5_HEADER_TYPE_VXLAN;
 	} else {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
new file mode 100644
index 000000000000..9a8fd762167b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+#include "vxlan.h"
+
+struct mlx5_vxlan {
+	struct mlx5_core_dev		*mdev;
+	spinlock_t			lock; /* protect vxlan table */
+	/* max_num_ports is usuallly 4, 16 buckets is more than enough */
+	DECLARE_HASHTABLE(htable, 4);
+	int				num_ports;
+	struct mutex                    sync_lock; /* sync add/del port HW operations */
+};
+
+struct mlx5_vxlan_port {
+	struct hlist_node hlist;
+	atomic_t refcount;
+	u16 udp_port;
+};
+
+static inline u8 mlx5_vxlan_max_udp_ports(struct mlx5_core_dev *mdev)
+{
+	return MLX5_CAP_ETH(mdev, max_vxlan_udp_ports) ?: 4;
+}
+
+static int mlx5_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port)
+{
+	u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(add_vxlan_udp_dport_out)] = {0};
+
+	MLX5_SET(add_vxlan_udp_dport_in, in, opcode,
+		 MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT);
+	MLX5_SET(add_vxlan_udp_dport_in, in, vxlan_udp_port, port);
+	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_vxlan_core_del_port_cmd(struct mlx5_core_dev *mdev, u16 port)
+{
+	u32 in[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_out)] = {0};
+
+	MLX5_SET(delete_vxlan_udp_dport_in, in, opcode,
+		 MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT);
+	MLX5_SET(delete_vxlan_udp_dport_in, in, vxlan_udp_port, port);
+	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static struct mlx5_vxlan_port*
+mlx5_vxlan_lookup_port_locked(struct mlx5_vxlan *vxlan, u16 port)
+{
+	struct mlx5_vxlan_port *vxlanp;
+
+	hash_for_each_possible(vxlan->htable, vxlanp, hlist, port) {
+		if (vxlanp->udp_port == port)
+			return vxlanp;
+	}
+
+	return NULL;
+}
+
+struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port)
+{
+	struct mlx5_vxlan_port *vxlanp;
+
+	if (!mlx5_vxlan_allowed(vxlan))
+		return NULL;
+
+	spin_lock_bh(&vxlan->lock);
+	vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port);
+	spin_unlock_bh(&vxlan->lock);
+
+	return vxlanp;
+}
+
+int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port)
+{
+	struct mlx5_vxlan_port *vxlanp;
+	int ret = -ENOSPC;
+
+	vxlanp = mlx5_vxlan_lookup_port(vxlan, port);
+	if (vxlanp) {
+		atomic_inc(&vxlanp->refcount);
+		return 0;
+	}
+
+	mutex_lock(&vxlan->sync_lock);
+	if (vxlan->num_ports >= mlx5_vxlan_max_udp_ports(vxlan->mdev)) {
+		mlx5_core_info(vxlan->mdev,
+			       "UDP port (%d) not offloaded, max number of UDP ports (%d) are already offloaded\n",
+			       port, mlx5_vxlan_max_udp_ports(vxlan->mdev));
+		ret = -ENOSPC;
+		goto unlock;
+	}
+
+	ret = mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port);
+	if (ret)
+		goto unlock;
+
+	vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL);
+	if (!vxlanp) {
+		ret = -ENOMEM;
+		goto err_delete_port;
+	}
+
+	vxlanp->udp_port = port;
+	atomic_set(&vxlanp->refcount, 1);
+
+	spin_lock_bh(&vxlan->lock);
+	hash_add(vxlan->htable, &vxlanp->hlist, port);
+	spin_unlock_bh(&vxlan->lock);
+
+	vxlan->num_ports++;
+	mutex_unlock(&vxlan->sync_lock);
+	return 0;
+
+err_delete_port:
+	mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port);
+
+unlock:
+	mutex_unlock(&vxlan->sync_lock);
+	return ret;
+}
+
+int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
+{
+	struct mlx5_vxlan_port *vxlanp;
+	bool remove = false;
+	int ret = 0;
+
+	mutex_lock(&vxlan->sync_lock);
+
+	spin_lock_bh(&vxlan->lock);
+	vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port);
+	if (!vxlanp) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	if (atomic_dec_and_test(&vxlanp->refcount)) {
+		hash_del(&vxlanp->hlist);
+		remove = true;
+	}
+
+out_unlock:
+	spin_unlock_bh(&vxlan->lock);
+
+	if (remove) {
+		mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port);
+		kfree(vxlanp);
+		vxlan->num_ports--;
+	}
+
+	mutex_unlock(&vxlan->sync_lock);
+
+	return ret;
+}
+
+struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_vxlan *vxlan;
+
+	if (!MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) || !mlx5_core_is_pf(mdev))
+		return ERR_PTR(-ENOTSUPP);
+
+	vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL);
+	if (!vxlan)
+		return ERR_PTR(-ENOMEM);
+
+	vxlan->mdev = mdev;
+	mutex_init(&vxlan->sync_lock);
+	spin_lock_init(&vxlan->lock);
+	hash_init(vxlan->htable);
+
+	/* Hardware adds 4789 by default */
+	mlx5_vxlan_add_port(vxlan, 4789);
+
+	return vxlan;
+}
+
+void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan)
+{
+	struct mlx5_vxlan_port *vxlanp;
+	struct hlist_node *tmp;
+	int bkt;
+
+	if (!mlx5_vxlan_allowed(vxlan))
+		return;
+
+	/* Lockless since we are the only hash table consumers*/
+	hash_for_each_safe(vxlan->htable, bkt, tmp, vxlanp, hlist) {
+		hash_del(&vxlanp->hlist);
+		mlx5_vxlan_core_del_port_cmd(vxlan->mdev, vxlanp->udp_port);
+		kfree(vxlanp);
+	}
+
+	kfree(vxlan);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h
new file mode 100644
index 000000000000..fd874a30c4d0
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __MLX5_VXLAN_H__
+#define __MLX5_VXLAN_H__
+
+#include <linux/mlx5/driver.h>
+
+struct mlx5_vxlan;
+struct mlx5_vxlan_port;
+
+#ifdef CONFIG_MLX5_CORE_EN
+
+static inline bool mlx5_vxlan_allowed(struct mlx5_vxlan *vxlan)
+{
+	/* not allowed reason is encoded in vxlan pointer as error,
+	 * on mlx5_vxlan_create
+	 */
+	return !IS_ERR_OR_NULL(vxlan);
+}
+
+struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev);
+void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan);
+int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port);
+int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port);
+struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port);
+
+#else
+
+static inline struct mlx5_vxlan*
+mlx5_vxlan_create(struct mlx5_core_dev *mdev) { return ERR_PTR(-ENOTSUPP); }
+static inline void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) { return; }
+
+#endif
+
+#endif /* __MLX5_VXLAN_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 6ddbb70e95de..03b9c6733eed 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -62,6 +62,7 @@
 #include "accel/ipsec.h"
 #include "accel/tls.h"
 #include "lib/clock.h"
+#include "lib/vxlan.h"
 #include "diag/fw_tracer.h"
 
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
@@ -961,6 +962,8 @@ static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 
 	mlx5_init_clock(dev);
 
+	dev->vxlan = mlx5_vxlan_create(dev);
+
 	err = mlx5_init_rl_table(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to init rate limiting\n");
@@ -1004,6 +1007,7 @@ err_mpfs_cleanup:
 err_rl_cleanup:
 	mlx5_cleanup_rl_table(dev);
 err_tables_cleanup:
+	mlx5_vxlan_destroy(dev->vxlan);
 	mlx5_cleanup_mkey_table(dev);
 	mlx5_cleanup_srq_table(dev);
 	mlx5_cleanup_qp_table(dev);
@@ -1024,6 +1028,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_eswitch_cleanup(dev->priv.eswitch);
 	mlx5_mpfs_cleanup(dev);
 	mlx5_cleanup_rl_table(dev);
+	mlx5_vxlan_destroy(dev->vxlan);
 	mlx5_cleanup_clock(dev);
 	mlx5_cleanup_reserved_gids(dev);
 	mlx5_cleanup_mkey_table(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
deleted file mode 100644
index 9a8fd762167b..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright (c) 2016, Mellanox Technologies, Ltd.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mlx5/driver.h>
-#include "mlx5_core.h"
-#include "vxlan.h"
-
-struct mlx5_vxlan {
-	struct mlx5_core_dev		*mdev;
-	spinlock_t			lock; /* protect vxlan table */
-	/* max_num_ports is usuallly 4, 16 buckets is more than enough */
-	DECLARE_HASHTABLE(htable, 4);
-	int				num_ports;
-	struct mutex                    sync_lock; /* sync add/del port HW operations */
-};
-
-struct mlx5_vxlan_port {
-	struct hlist_node hlist;
-	atomic_t refcount;
-	u16 udp_port;
-};
-
-static inline u8 mlx5_vxlan_max_udp_ports(struct mlx5_core_dev *mdev)
-{
-	return MLX5_CAP_ETH(mdev, max_vxlan_udp_ports) ?: 4;
-}
-
-static int mlx5_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port)
-{
-	u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)]   = {0};
-	u32 out[MLX5_ST_SZ_DW(add_vxlan_udp_dport_out)] = {0};
-
-	MLX5_SET(add_vxlan_udp_dport_in, in, opcode,
-		 MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT);
-	MLX5_SET(add_vxlan_udp_dport_in, in, vxlan_udp_port, port);
-	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
-}
-
-static int mlx5_vxlan_core_del_port_cmd(struct mlx5_core_dev *mdev, u16 port)
-{
-	u32 in[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_in)]   = {0};
-	u32 out[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_out)] = {0};
-
-	MLX5_SET(delete_vxlan_udp_dport_in, in, opcode,
-		 MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT);
-	MLX5_SET(delete_vxlan_udp_dport_in, in, vxlan_udp_port, port);
-	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
-}
-
-static struct mlx5_vxlan_port*
-mlx5_vxlan_lookup_port_locked(struct mlx5_vxlan *vxlan, u16 port)
-{
-	struct mlx5_vxlan_port *vxlanp;
-
-	hash_for_each_possible(vxlan->htable, vxlanp, hlist, port) {
-		if (vxlanp->udp_port == port)
-			return vxlanp;
-	}
-
-	return NULL;
-}
-
-struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port)
-{
-	struct mlx5_vxlan_port *vxlanp;
-
-	if (!mlx5_vxlan_allowed(vxlan))
-		return NULL;
-
-	spin_lock_bh(&vxlan->lock);
-	vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port);
-	spin_unlock_bh(&vxlan->lock);
-
-	return vxlanp;
-}
-
-int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port)
-{
-	struct mlx5_vxlan_port *vxlanp;
-	int ret = -ENOSPC;
-
-	vxlanp = mlx5_vxlan_lookup_port(vxlan, port);
-	if (vxlanp) {
-		atomic_inc(&vxlanp->refcount);
-		return 0;
-	}
-
-	mutex_lock(&vxlan->sync_lock);
-	if (vxlan->num_ports >= mlx5_vxlan_max_udp_ports(vxlan->mdev)) {
-		mlx5_core_info(vxlan->mdev,
-			       "UDP port (%d) not offloaded, max number of UDP ports (%d) are already offloaded\n",
-			       port, mlx5_vxlan_max_udp_ports(vxlan->mdev));
-		ret = -ENOSPC;
-		goto unlock;
-	}
-
-	ret = mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port);
-	if (ret)
-		goto unlock;
-
-	vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL);
-	if (!vxlanp) {
-		ret = -ENOMEM;
-		goto err_delete_port;
-	}
-
-	vxlanp->udp_port = port;
-	atomic_set(&vxlanp->refcount, 1);
-
-	spin_lock_bh(&vxlan->lock);
-	hash_add(vxlan->htable, &vxlanp->hlist, port);
-	spin_unlock_bh(&vxlan->lock);
-
-	vxlan->num_ports++;
-	mutex_unlock(&vxlan->sync_lock);
-	return 0;
-
-err_delete_port:
-	mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port);
-
-unlock:
-	mutex_unlock(&vxlan->sync_lock);
-	return ret;
-}
-
-int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
-{
-	struct mlx5_vxlan_port *vxlanp;
-	bool remove = false;
-	int ret = 0;
-
-	mutex_lock(&vxlan->sync_lock);
-
-	spin_lock_bh(&vxlan->lock);
-	vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port);
-	if (!vxlanp) {
-		ret = -ENOENT;
-		goto out_unlock;
-	}
-
-	if (atomic_dec_and_test(&vxlanp->refcount)) {
-		hash_del(&vxlanp->hlist);
-		remove = true;
-	}
-
-out_unlock:
-	spin_unlock_bh(&vxlan->lock);
-
-	if (remove) {
-		mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port);
-		kfree(vxlanp);
-		vxlan->num_ports--;
-	}
-
-	mutex_unlock(&vxlan->sync_lock);
-
-	return ret;
-}
-
-struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev)
-{
-	struct mlx5_vxlan *vxlan;
-
-	if (!MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) || !mlx5_core_is_pf(mdev))
-		return ERR_PTR(-ENOTSUPP);
-
-	vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL);
-	if (!vxlan)
-		return ERR_PTR(-ENOMEM);
-
-	vxlan->mdev = mdev;
-	mutex_init(&vxlan->sync_lock);
-	spin_lock_init(&vxlan->lock);
-	hash_init(vxlan->htable);
-
-	/* Hardware adds 4789 by default */
-	mlx5_vxlan_add_port(vxlan, 4789);
-
-	return vxlan;
-}
-
-void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan)
-{
-	struct mlx5_vxlan_port *vxlanp;
-	struct hlist_node *tmp;
-	int bkt;
-
-	if (!mlx5_vxlan_allowed(vxlan))
-		return;
-
-	/* Lockless since we are the only hash table consumers*/
-	hash_for_each_safe(vxlan->htable, bkt, tmp, vxlanp, hlist) {
-		hash_del(&vxlanp->hlist);
-		mlx5_vxlan_core_del_port_cmd(vxlan->mdev, vxlanp->udp_port);
-		kfree(vxlanp);
-	}
-
-	kfree(vxlan);
-}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
deleted file mode 100644
index fd874a30c4d0..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2016, Mellanox Technologies, Ltd.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __MLX5_VXLAN_H__
-#define __MLX5_VXLAN_H__
-
-#include <linux/mlx5/driver.h>
-
-struct mlx5_vxlan;
-struct mlx5_vxlan_port;
-
-#ifdef CONFIG_MLX5_CORE_EN
-
-static inline bool mlx5_vxlan_allowed(struct mlx5_vxlan *vxlan)
-{
-	/* not allowed reason is encoded in vxlan pointer as error,
-	 * on mlx5_vxlan_create
-	 */
-	return !IS_ERR_OR_NULL(vxlan);
-}
-
-struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev);
-void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan);
-int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port);
-int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port);
-struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port);
-
-#else
-
-static inline struct mlx5_vxlan*
-mlx5_vxlan_create(struct mlx5_core_dev *mdev) { return ERR_PTR(-ENOTSUPP); }
-static inline void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) { return; }
-
-#endif
-
-#endif /* __MLX5_VXLAN_H__ */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index fd0aaa5568fe..54f385cc8811 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -818,6 +818,7 @@ struct mlx5_clock {
 };
 
 struct mlx5_fw_tracer;
+struct mlx5_vxlan;
 
 struct mlx5_core_dev {
 	struct pci_dev	       *pdev;
@@ -850,6 +851,7 @@ struct mlx5_core_dev {
 	atomic_t		num_qps;
 	u32			issi;
 	struct mlx5e_resources  mlx5e_res;
+	struct mlx5_vxlan       *vxlan;
 	struct {
 		struct mlx5_rsvd_gids	reserved_gids;
 		u32			roce_en;
-- 
cgit v1.2.3


From 627448e85c766587f6fdde1ea3886d6615081c77 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Thu, 28 Jun 2018 18:13:33 +0300
Subject: tpm: separate cmd_ready/go_idle from runtime_pm

Fix tpm ptt initialization error:
tpm tpm0: A TPM error (378) occurred get tpm pcr allocation.

We cannot use go_idle cmd_ready commands via runtime_pm handles
as with the introduction of localities this is no longer an optional
feature, while runtime pm can be not enabled.
Though cmd_ready/go_idle provides a power saving, it's also a part of
TPM2 protocol and should be called explicitly.
This patch exposes cmd_read/go_idle via tpm class ops and removes
runtime pm support as it is not used by any driver.

When calling from nested context always use both flags:
TPM_TRANSMIT_UNLOCKED and TPM_TRANSMIT_RAW. Both are needed to resolve
tpm spaces and locality request recursive calls to tpm_transmit().
TPM_TRANSMIT_RAW should never be used standalone as it will fail
on double locking. While TPM_TRANSMIT_UNLOCKED standalone should be
called from non-recursive locked contexts.

New wrappers are added tpm_cmd_ready() and tpm_go_idle() to
streamline tpm_try_transmit code.

tpm_crb no longer needs own power saving functions and can drop using
tpm_pm_suspend/resume.

This patch cannot be really separated from the locality fix.
Fixes: 888d867df441 (tpm: cmd_ready command can be issued only after granting locality)

Cc: stable@vger.kernel.org
Fixes: 888d867df441 (tpm: cmd_ready command can be issued only after granting locality)
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Tested-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 drivers/char/tpm/tpm-interface.c |  51 ++++++++++++++++----
 drivers/char/tpm/tpm.h           |  12 ++++-
 drivers/char/tpm/tpm2-space.c    |  16 ++++---
 drivers/char/tpm/tpm_crb.c       | 101 +++++++++++----------------------------
 include/linux/tpm.h              |   2 +
 5 files changed, 90 insertions(+), 92 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index e32f6e85dc6d..31b86e027f9d 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -29,7 +29,6 @@
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
-#include <linux/pm_runtime.h>
 #include <linux/tpm_eventlog.h>
 
 #include "tpm.h"
@@ -369,10 +368,13 @@ err_len:
 	return -EINVAL;
 }
 
-static int tpm_request_locality(struct tpm_chip *chip)
+static int tpm_request_locality(struct tpm_chip *chip, unsigned int flags)
 {
 	int rc;
 
+	if (flags & TPM_TRANSMIT_RAW)
+		return 0;
+
 	if (!chip->ops->request_locality)
 		return 0;
 
@@ -385,10 +387,13 @@ static int tpm_request_locality(struct tpm_chip *chip)
 	return 0;
 }
 
-static void tpm_relinquish_locality(struct tpm_chip *chip)
+static void tpm_relinquish_locality(struct tpm_chip *chip, unsigned int flags)
 {
 	int rc;
 
+	if (flags & TPM_TRANSMIT_RAW)
+		return;
+
 	if (!chip->ops->relinquish_locality)
 		return;
 
@@ -399,6 +404,28 @@ static void tpm_relinquish_locality(struct tpm_chip *chip)
 	chip->locality = -1;
 }
 
+static int tpm_cmd_ready(struct tpm_chip *chip, unsigned int flags)
+{
+	if (flags & TPM_TRANSMIT_RAW)
+		return 0;
+
+	if (!chip->ops->cmd_ready)
+		return 0;
+
+	return chip->ops->cmd_ready(chip);
+}
+
+static int tpm_go_idle(struct tpm_chip *chip, unsigned int flags)
+{
+	if (flags & TPM_TRANSMIT_RAW)
+		return 0;
+
+	if (!chip->ops->go_idle)
+		return 0;
+
+	return chip->ops->go_idle(chip);
+}
+
 static ssize_t tpm_try_transmit(struct tpm_chip *chip,
 				struct tpm_space *space,
 				u8 *buf, size_t bufsiz,
@@ -449,14 +476,15 @@ static ssize_t tpm_try_transmit(struct tpm_chip *chip,
 	/* Store the decision as chip->locality will be changed. */
 	need_locality = chip->locality == -1;
 
-	if (!(flags & TPM_TRANSMIT_RAW) && need_locality) {
-		rc = tpm_request_locality(chip);
+	if (need_locality) {
+		rc = tpm_request_locality(chip, flags);
 		if (rc < 0)
 			goto out_no_locality;
 	}
 
-	if (chip->dev.parent)
-		pm_runtime_get_sync(chip->dev.parent);
+	rc = tpm_cmd_ready(chip, flags);
+	if (rc)
+		goto out;
 
 	rc = tpm2_prepare_space(chip, space, ordinal, buf);
 	if (rc)
@@ -516,13 +544,16 @@ out_recv:
 	}
 
 	rc = tpm2_commit_space(chip, space, ordinal, buf, &len);
+	if (rc)
+		dev_err(&chip->dev, "tpm2_commit_space: error %d\n", rc);
 
 out:
-	if (chip->dev.parent)
-		pm_runtime_put_sync(chip->dev.parent);
+	rc = tpm_go_idle(chip, flags);
+	if (rc)
+		goto out;
 
 	if (need_locality)
-		tpm_relinquish_locality(chip);
+		tpm_relinquish_locality(chip, flags);
 
 out_no_locality:
 	if (chip->ops->clk_enable != NULL)
diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index 9824cccb2c76..17833ef63f6c 100644
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -512,9 +512,17 @@ extern const struct file_operations tpm_fops;
 extern const struct file_operations tpmrm_fops;
 extern struct idr dev_nums_idr;
 
+/**
+ * enum tpm_transmit_flags
+ *
+ * @TPM_TRANSMIT_UNLOCKED: used to lock sequence of tpm_transmit calls.
+ * @TPM_TRANSMIT_RAW: prevent recursive calls into setup steps
+ *                    (go idle, locality,..). Always use with UNLOCKED
+ *                    as it will fail on double locking.
+ */
 enum tpm_transmit_flags {
-	TPM_TRANSMIT_UNLOCKED	= BIT(0),
-	TPM_TRANSMIT_RAW	= BIT(1),
+	TPM_TRANSMIT_UNLOCKED = BIT(0),
+	TPM_TRANSMIT_RAW      = BIT(1),
 };
 
 ssize_t tpm_transmit(struct tpm_chip *chip, struct tpm_space *space,
diff --git a/drivers/char/tpm/tpm2-space.c b/drivers/char/tpm/tpm2-space.c
index 6122d3276f72..11c85ed8c113 100644
--- a/drivers/char/tpm/tpm2-space.c
+++ b/drivers/char/tpm/tpm2-space.c
@@ -39,7 +39,8 @@ static void tpm2_flush_sessions(struct tpm_chip *chip, struct tpm_space *space)
 	for (i = 0; i < ARRAY_SIZE(space->session_tbl); i++) {
 		if (space->session_tbl[i])
 			tpm2_flush_context_cmd(chip, space->session_tbl[i],
-					       TPM_TRANSMIT_UNLOCKED);
+					       TPM_TRANSMIT_UNLOCKED |
+					       TPM_TRANSMIT_RAW);
 	}
 }
 
@@ -84,7 +85,7 @@ static int tpm2_load_context(struct tpm_chip *chip, u8 *buf,
 	tpm_buf_append(&tbuf, &buf[*offset], body_size);
 
 	rc = tpm_transmit_cmd(chip, NULL, tbuf.data, PAGE_SIZE, 4,
-			      TPM_TRANSMIT_UNLOCKED, NULL);
+			      TPM_TRANSMIT_UNLOCKED | TPM_TRANSMIT_RAW, NULL);
 	if (rc < 0) {
 		dev_warn(&chip->dev, "%s: failed with a system error %d\n",
 			 __func__, rc);
@@ -133,7 +134,7 @@ static int tpm2_save_context(struct tpm_chip *chip, u32 handle, u8 *buf,
 	tpm_buf_append_u32(&tbuf, handle);
 
 	rc = tpm_transmit_cmd(chip, NULL, tbuf.data, PAGE_SIZE, 0,
-			      TPM_TRANSMIT_UNLOCKED, NULL);
+			      TPM_TRANSMIT_UNLOCKED | TPM_TRANSMIT_RAW, NULL);
 	if (rc < 0) {
 		dev_warn(&chip->dev, "%s: failed with a system error %d\n",
 			 __func__, rc);
@@ -170,7 +171,8 @@ static void tpm2_flush_space(struct tpm_chip *chip)
 	for (i = 0; i < ARRAY_SIZE(space->context_tbl); i++)
 		if (space->context_tbl[i] && ~space->context_tbl[i])
 			tpm2_flush_context_cmd(chip, space->context_tbl[i],
-					       TPM_TRANSMIT_UNLOCKED);
+					       TPM_TRANSMIT_UNLOCKED |
+					       TPM_TRANSMIT_RAW);
 
 	tpm2_flush_sessions(chip, space);
 }
@@ -377,7 +379,8 @@ static int tpm2_map_response_header(struct tpm_chip *chip, u32 cc, u8 *rsp,
 
 	return 0;
 out_no_slots:
-	tpm2_flush_context_cmd(chip, phandle, TPM_TRANSMIT_UNLOCKED);
+	tpm2_flush_context_cmd(chip, phandle,
+			       TPM_TRANSMIT_UNLOCKED | TPM_TRANSMIT_RAW);
 	dev_warn(&chip->dev, "%s: out of slots for 0x%08X\n", __func__,
 		 phandle);
 	return -ENOMEM;
@@ -465,7 +468,8 @@ static int tpm2_save_space(struct tpm_chip *chip)
 			return rc;
 
 		tpm2_flush_context_cmd(chip, space->context_tbl[i],
-				       TPM_TRANSMIT_UNLOCKED);
+				       TPM_TRANSMIT_UNLOCKED |
+				       TPM_TRANSMIT_RAW);
 		space->context_tbl[i] = ~0;
 	}
 
diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c
index 34fbc6cb097b..36952ef98f90 100644
--- a/drivers/char/tpm/tpm_crb.c
+++ b/drivers/char/tpm/tpm_crb.c
@@ -132,7 +132,7 @@ static bool crb_wait_for_reg_32(u32 __iomem *reg, u32 mask, u32 value,
 }
 
 /**
- * crb_go_idle - request tpm crb device to go the idle state
+ * __crb_go_idle - request tpm crb device to go the idle state
  *
  * @dev:  crb device
  * @priv: crb private data
@@ -147,7 +147,7 @@ static bool crb_wait_for_reg_32(u32 __iomem *reg, u32 mask, u32 value,
  *
  * Return: 0 always
  */
-static int crb_go_idle(struct device *dev, struct crb_priv *priv)
+static int __crb_go_idle(struct device *dev, struct crb_priv *priv)
 {
 	if ((priv->sm == ACPI_TPM2_START_METHOD) ||
 	    (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_START_METHOD) ||
@@ -163,11 +163,20 @@ static int crb_go_idle(struct device *dev, struct crb_priv *priv)
 		dev_warn(dev, "goIdle timed out\n");
 		return -ETIME;
 	}
+
 	return 0;
 }
 
+static int crb_go_idle(struct tpm_chip *chip)
+{
+	struct device *dev = &chip->dev;
+	struct crb_priv *priv = dev_get_drvdata(dev);
+
+	return __crb_go_idle(dev, priv);
+}
+
 /**
- * crb_cmd_ready - request tpm crb device to enter ready state
+ * __crb_cmd_ready - request tpm crb device to enter ready state
  *
  * @dev:  crb device
  * @priv: crb private data
@@ -181,7 +190,7 @@ static int crb_go_idle(struct device *dev, struct crb_priv *priv)
  *
  * Return: 0 on success -ETIME on timeout;
  */
-static int crb_cmd_ready(struct device *dev, struct crb_priv *priv)
+static int __crb_cmd_ready(struct device *dev, struct crb_priv *priv)
 {
 	if ((priv->sm == ACPI_TPM2_START_METHOD) ||
 	    (priv->sm == ACPI_TPM2_COMMAND_BUFFER_WITH_START_METHOD) ||
@@ -200,6 +209,14 @@ static int crb_cmd_ready(struct device *dev, struct crb_priv *priv)
 	return 0;
 }
 
+static int crb_cmd_ready(struct tpm_chip *chip)
+{
+	struct device *dev = &chip->dev;
+	struct crb_priv *priv = dev_get_drvdata(dev);
+
+	return __crb_cmd_ready(dev, priv);
+}
+
 static int __crb_request_locality(struct device *dev,
 				  struct crb_priv *priv, int loc)
 {
@@ -401,6 +418,8 @@ static const struct tpm_class_ops tpm_crb = {
 	.send = crb_send,
 	.cancel = crb_cancel,
 	.req_canceled = crb_req_canceled,
+	.go_idle  = crb_go_idle,
+	.cmd_ready = crb_cmd_ready,
 	.request_locality = crb_request_locality,
 	.relinquish_locality = crb_relinquish_locality,
 	.req_complete_mask = CRB_DRV_STS_COMPLETE,
@@ -520,7 +539,7 @@ static int crb_map_io(struct acpi_device *device, struct crb_priv *priv,
 	 * PTT HW bug w/a: wake up the device to access
 	 * possibly not retained registers.
 	 */
-	ret = crb_cmd_ready(dev, priv);
+	ret = __crb_cmd_ready(dev, priv);
 	if (ret)
 		goto out_relinquish_locality;
 
@@ -565,7 +584,7 @@ out:
 	if (!ret)
 		priv->cmd_size = cmd_size;
 
-	crb_go_idle(dev, priv);
+	__crb_go_idle(dev, priv);
 
 out_relinquish_locality:
 
@@ -628,32 +647,7 @@ static int crb_acpi_add(struct acpi_device *device)
 	chip->acpi_dev_handle = device->handle;
 	chip->flags = TPM_CHIP_FLAG_TPM2;
 
-	rc = __crb_request_locality(dev, priv, 0);
-	if (rc)
-		return rc;
-
-	rc  = crb_cmd_ready(dev, priv);
-	if (rc)
-		goto out;
-
-	pm_runtime_get_noresume(dev);
-	pm_runtime_set_active(dev);
-	pm_runtime_enable(dev);
-
-	rc = tpm_chip_register(chip);
-	if (rc) {
-		crb_go_idle(dev, priv);
-		pm_runtime_put_noidle(dev);
-		pm_runtime_disable(dev);
-		goto out;
-	}
-
-	pm_runtime_put_sync(dev);
-
-out:
-	__crb_relinquish_locality(dev, priv, 0);
-
-	return rc;
+	return tpm_chip_register(chip);
 }
 
 static int crb_acpi_remove(struct acpi_device *device)
@@ -663,52 +657,11 @@ static int crb_acpi_remove(struct acpi_device *device)
 
 	tpm_chip_unregister(chip);
 
-	pm_runtime_disable(dev);
-
 	return 0;
 }
 
-static int __maybe_unused crb_pm_runtime_suspend(struct device *dev)
-{
-	struct tpm_chip *chip = dev_get_drvdata(dev);
-	struct crb_priv *priv = dev_get_drvdata(&chip->dev);
-
-	return crb_go_idle(dev, priv);
-}
-
-static int __maybe_unused crb_pm_runtime_resume(struct device *dev)
-{
-	struct tpm_chip *chip = dev_get_drvdata(dev);
-	struct crb_priv *priv = dev_get_drvdata(&chip->dev);
-
-	return crb_cmd_ready(dev, priv);
-}
-
-static int __maybe_unused crb_pm_suspend(struct device *dev)
-{
-	int ret;
-
-	ret = tpm_pm_suspend(dev);
-	if (ret)
-		return ret;
-
-	return crb_pm_runtime_suspend(dev);
-}
-
-static int __maybe_unused crb_pm_resume(struct device *dev)
-{
-	int ret;
-
-	ret = crb_pm_runtime_resume(dev);
-	if (ret)
-		return ret;
-
-	return tpm_pm_resume(dev);
-}
-
 static const struct dev_pm_ops crb_pm = {
-	SET_SYSTEM_SLEEP_PM_OPS(crb_pm_suspend, crb_pm_resume)
-	SET_RUNTIME_PM_OPS(crb_pm_runtime_suspend, crb_pm_runtime_resume, NULL)
+	SET_SYSTEM_SLEEP_PM_OPS(tpm_pm_suspend, tpm_pm_resume)
 };
 
 static const struct acpi_device_id crb_device_ids[] = {
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 06639fb6ab85..8eb5e5ebe136 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -43,6 +43,8 @@ struct tpm_class_ops {
 	u8 (*status) (struct tpm_chip *chip);
 	bool (*update_timeouts)(struct tpm_chip *chip,
 				unsigned long *timeout_cap);
+	int (*go_idle)(struct tpm_chip *chip);
+	int (*cmd_ready)(struct tpm_chip *chip);
 	int (*request_locality)(struct tpm_chip *chip, int loc);
 	int (*relinquish_locality)(struct tpm_chip *chip, int loc);
 	void (*clk_enable)(struct tpm_chip *chip, bool value);
-- 
cgit v1.2.3


From aaae81536351f831fe6719e5b4e6afbadc5e5f85 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.vnet.ibm.com>
Date: Tue, 26 Jun 2018 15:09:30 -0400
Subject: tpm: Implement tpm_default_chip() to find a TPM chip

Implement tpm_default_chip() to find the first TPM chip and return it to
the caller while increasing the reference count on its device. This
function can be used by other subsystems, such as IMA, to find the system's
default TPM chip and use it for all subsequent TPM operations.

Signed-off-by: Stefan Berger <stefanb@linux.vnet.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 drivers/char/tpm/tpm-chip.c | 27 +++++++++++++++++++++++++++
 include/linux/tpm.h         |  5 +++++
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index 242b716aed5e..f551061262c9 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
@@ -80,6 +80,33 @@ void tpm_put_ops(struct tpm_chip *chip)
 }
 EXPORT_SYMBOL_GPL(tpm_put_ops);
 
+/**
+ * tpm_default_chip() - find a TPM chip and get a reference to it
+ */
+struct tpm_chip *tpm_default_chip(void)
+{
+	struct tpm_chip *chip, *res = NULL;
+	int chip_num = 0;
+	int chip_prev;
+
+	mutex_lock(&idr_lock);
+
+	do {
+		chip_prev = chip_num;
+		chip = idr_get_next(&dev_nums_idr, &chip_num);
+		if (chip) {
+			get_device(&chip->dev);
+			res = chip;
+			break;
+		}
+	} while (chip_prev != chip_num);
+
+	mutex_unlock(&idr_lock);
+
+	return res;
+}
+EXPORT_SYMBOL_GPL(tpm_default_chip);
+
 /**
  * tpm_find_get_ops() - find and reserve a TPM chip
  * @chip:	a &struct tpm_chip instance, %NULL for the default chip
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 8eb5e5ebe136..4609b94142d4 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -63,6 +63,7 @@ extern int tpm_seal_trusted(struct tpm_chip *chip,
 extern int tpm_unseal_trusted(struct tpm_chip *chip,
 			      struct trusted_key_payload *payload,
 			      struct trusted_key_options *options);
+extern struct tpm_chip *tpm_default_chip(void);
 #else
 static inline int tpm_is_tpm2(struct tpm_chip *chip)
 {
@@ -98,5 +99,9 @@ static inline int tpm_unseal_trusted(struct tpm_chip *chip,
 {
 	return -ENODEV;
 }
+static inline struct tpm_chip *tpm_default_chip(void)
+{
+	return NULL;
+}
 #endif
 #endif
-- 
cgit v1.2.3


From 5cbf777cfdf6e5a7b7149006e4881a255da78fdd Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 27 Jul 2018 16:37:28 +0800
Subject: route: add support for directed broadcast forwarding

This patch implements the feature described in rfc1812#section-5.3.5.2
and rfc2644. It allows the router to forward directed broadcast when
sysctl bc_forwarding is enabled.

Note that this feature could be done by iptables -j TEE, but it would
cause some problems:
  - target TEE's gateway param has to be set with a specific address,
    and it's not flexible especially when the route wants forward all
    directed broadcasts.
  - this duplicates the directed broadcasts so this may cause side
    effects to applications.

Besides, to keep consistent with other os router like BSD, it's also
necessary to implement it in the route rx path.

Note that route cache needs to be flushed when bc_forwarding is
changed.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h   |  1 +
 include/uapi/linux/ip.h      |  1 +
 include/uapi/linux/netconf.h |  1 +
 net/ipv4/devinet.c           | 11 +++++++++++
 net/ipv4/route.c             |  6 +++++-
 5 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 27650f1bff3d..c759d1cbcedd 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 
 #define IN_DEV_FORWARD(in_dev)		IN_DEV_CONF_GET((in_dev), FORWARDING)
 #define IN_DEV_MFORWARD(in_dev)		IN_DEV_ANDCONF((in_dev), MC_FORWARDING)
+#define IN_DEV_BFORWARD(in_dev)		IN_DEV_ANDCONF((in_dev), BC_FORWARDING)
 #define IN_DEV_RPFILTER(in_dev)		IN_DEV_MAXCONF((in_dev), RP_FILTER)
 #define IN_DEV_SRC_VMARK(in_dev)    	IN_DEV_ORCONF((in_dev), SRC_VMARK)
 #define IN_DEV_SOURCE_ROUTE(in_dev)	IN_DEV_ANDCONF((in_dev), \
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index b24a742beae5..e42d13b55cf3 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -168,6 +168,7 @@ enum
 	IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	IPV4_DEVCONF_DROP_GRATUITOUS_ARP,
+	IPV4_DEVCONF_BC_FORWARDING,
 	__IPV4_DEVCONF_MAX
 };
 
diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index c84fcdfca862..fac4edd55379 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -18,6 +18,7 @@ enum {
 	NETCONFA_PROXY_NEIGH,
 	NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 	NETCONFA_INPUT,
+	NETCONFA_BC_FORWARDING,
 	__NETCONFA_MAX
 };
 #define NETCONFA_MAX	(__NETCONFA_MAX - 1)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab1a77a..ea4bd8a52422 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
 		size += nla_total_size(4);
 	if (all || type == NETCONFA_MC_FORWARDING)
 		size += nla_total_size(4);
+	if (all || type == NETCONFA_BC_FORWARDING)
+		size += nla_total_size(4);
 	if (all || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
 	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
@@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
 			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
 		goto nla_put_failure;
+	if ((all || type == NETCONFA_BC_FORWARDING) &&
+	    nla_put_s32(skb, NETCONFA_BC_FORWARDING,
+			IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+		goto nla_put_failure;
 	if ((all || type == NETCONFA_PROXY_NEIGH) &&
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
 			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
@@ -2143,6 +2149,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
 			if ((new_value == 0) && (old_value != 0))
 				rt_cache_flush(net);
 
+		if (i == IPV4_DEVCONF_BC_FORWARDING - 1 &&
+		    new_value != old_value)
+			rt_cache_flush(net);
+
 		if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
 		    new_value != old_value) {
 			ifindex = devinet_conf_ifindex(net, cnf);
@@ -2259,6 +2269,7 @@ static struct devinet_sysctl_table {
 		DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
 					     devinet_sysctl_forward),
 		DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+		DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
 
 		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
 		DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1df6e97106d7..b678466da451 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		goto no_route;
 	}
 
-	if (res->type == RTN_BROADCAST)
+	if (res->type == RTN_BROADCAST) {
+		if (IN_DEV_BFORWARD(in_dev))
+			goto make_route;
 		goto brd_input;
+	}
 
 	if (res->type == RTN_LOCAL) {
 		err = fib_validate_source(skb, saddr, daddr, tos,
@@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (res->type != RTN_UNICAST)
 		goto martian_destination;
 
+make_route:
 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
 out:	return err;
 
-- 
cgit v1.2.3


From 7a4c53bee3324ac00bf964aa2f82d15d279e86e4 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Fri, 27 Jul 2018 13:43:23 -0700
Subject: net: report invalid mtu value via netlink extack

If an invalid MTU value is set through rtnetlink return extra error
information instead of putting message in kernel log. For other cases
where there is no visible API, keep the error report in the log.

Example:
	# ip li set dev enp12s0 mtu 10000
	Error: mtu greater than device maximum.

	# ifconfig enp12s0 mtu 10000
	SIOCSIFMTU: Invalid argument
	# dmesg | tail -1
	[ 2047.795467] enp12s0: mtu greater than device maximum

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 23 +++++++++++++++++------
 net/core/rtnetlink.c      |  2 +-
 3 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c1295c7a452e..9c917467a2c7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3546,6 +3546,8 @@ int dev_set_alias(struct net_device *, const char *, size_t);
 int dev_get_alias(const struct net_device *, char *, size_t);
 int dev_change_net_namespace(struct net_device *, struct net *, const char *);
 int __dev_set_mtu(struct net_device *, int);
+int dev_set_mtu_ext(struct net_device *dev, int mtu,
+		    struct netlink_ext_ack *extack);
 int dev_set_mtu(struct net_device *, int);
 int dev_change_tx_queue_len(struct net_device *, unsigned long);
 void dev_set_group(struct net_device *, int);
diff --git a/net/core/dev.c b/net/core/dev.c
index 87c42c8249ae..89031b5fef9f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7523,13 +7523,15 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu)
 EXPORT_SYMBOL(__dev_set_mtu);
 
 /**
- *	dev_set_mtu - Change maximum transfer unit
+ *	dev_set_mtu_ext - Change maximum transfer unit
  *	@dev: device
  *	@new_mtu: new transfer unit
+ *	@extack: netlink extended ack
  *
  *	Change the maximum transfer size of the network device.
  */
-int dev_set_mtu(struct net_device *dev, int new_mtu)
+int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
+		    struct netlink_ext_ack *extack)
 {
 	int err, orig_mtu;
 
@@ -7538,14 +7540,12 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 
 	/* MTU must be positive, and in range */
 	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
-		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
-				    dev->name, new_mtu, dev->min_mtu);
+		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 		return -EINVAL;
 	}
 
 	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
-		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
-				    dev->name, new_mtu, dev->max_mtu);
+		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 		return -EINVAL;
 	}
 
@@ -7573,6 +7573,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 	}
 	return err;
 }
+
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+	struct netlink_ext_ack extack;
+	int err;
+
+	err = dev_set_mtu_ext(dev, new_mtu, &extack);
+	if (err)
+		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
+	return err;
+}
 EXPORT_SYMBOL(dev_set_mtu);
 
 /**
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 510d4f765a13..24431e578310 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2382,7 +2382,7 @@ static int do_setlink(const struct sk_buff *skb,
 	}
 
 	if (tb[IFLA_MTU]) {
-		err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+		err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
 		if (err < 0)
 			goto errout;
 		status |= DO_SETLINK_MODIFIED;
-- 
cgit v1.2.3


From 0969a204bfdaf7470d2666736b90a8595ae671e9 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 27 Jul 2018 17:47:01 +0300
Subject: gpiolib: Use GPIOD_OUT_{LOW,HIGH} macros in open drain ones

There should not be anything more than stated by the name of newly
introduced constants, i.e.
	GPIOD_OUT_LOW_OPEN_DRAIN == GPIOD_OUT_LOW + open drain
and nothing more.

Make it better to read and slightly more robust by using GPIOD_OUT_LOW
and GPIOD_OUT_HIGH constants with open drain flag.

No functional change intended.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/consumer.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index e8aaf34dd65d..21ddbe440030 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -41,11 +41,8 @@ enum gpiod_flags {
 	GPIOD_OUT_LOW	= GPIOD_FLAGS_BIT_DIR_SET | GPIOD_FLAGS_BIT_DIR_OUT,
 	GPIOD_OUT_HIGH	= GPIOD_FLAGS_BIT_DIR_SET | GPIOD_FLAGS_BIT_DIR_OUT |
 			  GPIOD_FLAGS_BIT_DIR_VAL,
-	GPIOD_OUT_LOW_OPEN_DRAIN = GPIOD_FLAGS_BIT_DIR_SET |
-			  GPIOD_FLAGS_BIT_DIR_OUT | GPIOD_FLAGS_BIT_OPEN_DRAIN,
-	GPIOD_OUT_HIGH_OPEN_DRAIN = GPIOD_FLAGS_BIT_DIR_SET |
-			  GPIOD_FLAGS_BIT_DIR_OUT | GPIOD_FLAGS_BIT_DIR_VAL |
-			  GPIOD_FLAGS_BIT_OPEN_DRAIN,
+	GPIOD_OUT_LOW_OPEN_DRAIN = GPIOD_OUT_LOW | GPIOD_FLAGS_BIT_OPEN_DRAIN,
+	GPIOD_OUT_HIGH_OPEN_DRAIN = GPIOD_OUT_HIGH | GPIOD_FLAGS_BIT_OPEN_DRAIN,
 };
 
 #ifdef CONFIG_GPIOLIB
-- 
cgit v1.2.3


From 51c23b47e6b8590ea7a6a6776ffb21810ece73bf Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 13 Jul 2018 14:54:45 +0200
Subject: netfilter: nf_osf: add nf_osf_find()

This new function returns the OS genre as a string. Plan is to use to
from the new nft_osf extension.

Note that this doesn't yet support ttl options, but it could be easily
extended to do so.

Tested-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_osf.h |  9 +++++++++
 net/netfilter/nf_osf.c           | 30 ++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h
index 0e114c492fb8..aee460fcbd31 100644
--- a/include/linux/netfilter/nf_osf.h
+++ b/include/linux/netfilter/nf_osf.h
@@ -1,3 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NFOSF_H
+#define _NFOSF_H
+
 #include <uapi/linux/netfilter/nf_osf.h>
 
 /* Initial window size option state machine: multiple of mss, mtu or
@@ -31,3 +35,8 @@ bool nf_osf_match(const struct sk_buff *skb, u_int8_t family,
 		  int hooknum, struct net_device *in, struct net_device *out,
 		  const struct nf_osf_info *info, struct net *net,
 		  const struct list_head *nf_osf_fingers);
+
+const char *nf_osf_find(const struct sk_buff *skb,
+                        const struct list_head *nf_osf_fingers);
+
+#endif /* _NFOSF_H */
diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c
index b44d62d5d9a9..f4c75e982902 100644
--- a/net/netfilter/nf_osf.c
+++ b/net/netfilter/nf_osf.c
@@ -249,4 +249,34 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family,
 }
 EXPORT_SYMBOL_GPL(nf_osf_match);
 
+const char *nf_osf_find(const struct sk_buff *skb,
+			const struct list_head *nf_osf_fingers)
+{
+	const struct iphdr *ip = ip_hdr(skb);
+	const struct nf_osf_user_finger *f;
+	unsigned char opts[MAX_IPOPTLEN];
+	const struct nf_osf_finger *kf;
+	struct nf_osf_hdr_ctx ctx;
+	const struct tcphdr *tcp;
+	const char *genre = NULL;
+
+	memset(&ctx, 0, sizeof(ctx));
+
+	tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
+	if (!tcp)
+		return false;
+
+	list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
+		f = &kf->finger;
+		if (!nf_osf_match_one(skb, f, -1, &ctx))
+			continue;
+
+		genre = f->genre;
+		break;
+	}
+
+	return genre;
+}
+EXPORT_SYMBOL_GPL(nf_osf_find);
+
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From f609e7a0f3956e3b03af735b83600a4eef2a04ec Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 7 Jun 2018 15:55:25 -0400
Subject: media: sii9234: remove unused header

The include/linux/platform_data/media/sii9234.h header file is unused,
let's remove it.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
---
 include/linux/platform_data/media/sii9234.h | 24 ------------------------
 1 file changed, 24 deletions(-)
 delete mode 100644 include/linux/platform_data/media/sii9234.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/media/sii9234.h b/include/linux/platform_data/media/sii9234.h
deleted file mode 100644
index 6a4a809fe9a3..000000000000
--- a/include/linux/platform_data/media/sii9234.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Driver header for SII9234 MHL converter chip.
- *
- * Copyright (c) 2011 Samsung Electronics, Co. Ltd
- * Contact: Tomasz Stanislawski <t.stanislaws@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef SII9234_H
-#define SII9234_H
-
-/**
- * @gpio_n_reset: GPIO driving nRESET pin
- */
-
-struct sii9234_platform_data {
-	int gpio_n_reset;
-};
-
-#endif /* SII9234_H */
-- 
cgit v1.2.3


From 489cae632fc04927e8ef36ac8d8847193a41df3b Mon Sep 17 00:00:00 2001
From: Jordan Crouse <jcrouse@codeaurora.org>
Date: Tue, 24 Jul 2018 10:33:19 -0600
Subject: include: Move ascii85 functions from i915 to linux/ascii85.h

The i915 DRM driver very cleverly used ascii85 encoding for their
GPU state file. Move the encode functions to a general header file to
support other drivers that might be interested in the same
functionality.

v4: Make the return value const char * as suggested by Chris Wilson
v3: Fix error_puts -> err_puts pointed out by the 01.org bot
v2: Update API to be cleaner for the caller as suggested by Chris Wilson

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Jordan Crouse <jcrouse@codeaurora.org>
Signed-off-by: Rob Clark <robdclark@gmail.com>
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 34 ++++---------------------------
 include/linux/ascii85.h               | 38 +++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 30 deletions(-)
 create mode 100644 include/linux/ascii85.h

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 8c81cf3aa182..f7f2aa71d8d9 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -31,6 +31,7 @@
 #include <linux/stop_machine.h>
 #include <linux/zlib.h>
 #include <drm/drm_print.h>
+#include <linux/ascii85.h>
 
 #include "i915_gpu_error.h"
 #include "i915_drv.h"
@@ -517,35 +518,12 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
 	va_end(args);
 }
 
-static int
-ascii85_encode_len(int len)
-{
-	return DIV_ROUND_UP(len, 4);
-}
-
-static bool
-ascii85_encode(u32 in, char *out)
-{
-	int i;
-
-	if (in == 0)
-		return false;
-
-	out[5] = '\0';
-	for (i = 5; i--; ) {
-		out[i] = '!' + in % 85;
-		in /= 85;
-	}
-
-	return true;
-}
-
 static void print_error_obj(struct drm_i915_error_state_buf *m,
 			    struct intel_engine_cs *engine,
 			    const char *name,
 			    struct drm_i915_error_object *obj)
 {
-	char out[6];
+	char out[ASCII85_BUFSZ];
 	int page;
 
 	if (!obj)
@@ -567,12 +545,8 @@ static void print_error_obj(struct drm_i915_error_state_buf *m,
 			len -= obj->unused;
 		len = ascii85_encode_len(len);
 
-		for (i = 0; i < len; i++) {
-			if (ascii85_encode(obj->pages[page][i], out))
-				err_puts(m, out);
-			else
-				err_puts(m, "z");
-		}
+		for (i = 0; i < len; i++)
+			err_puts(m, ascii85_encode(obj->pages[page][i], out));
 	}
 	err_puts(m, "\n");
 }
diff --git a/include/linux/ascii85.h b/include/linux/ascii85.h
new file mode 100644
index 000000000000..4cc40201273e
--- /dev/null
+++ b/include/linux/ascii85.h
@@ -0,0 +1,38 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (c) 2008 Intel Corporation
+ * Copyright (c) 2018 The Linux Foundation. All rights reserved.
+ */
+
+#ifndef _ASCII85_H_
+#define _ASCII85_H_
+
+#include <linux/kernel.h>
+
+#define ASCII85_BUFSZ 6
+
+static inline long
+ascii85_encode_len(long len)
+{
+	return DIV_ROUND_UP(len, 4);
+}
+
+static inline const char *
+ascii85_encode(u32 in, char *out)
+{
+	int i;
+
+	if (in == 0)
+		return "z";
+
+	out[5] = '\0';
+	for (i = 5; i--; ) {
+		out[i] = '!' + in % 85;
+		in /= 85;
+	}
+
+	return out;
+}
+
+#endif
-- 
cgit v1.2.3


From c454edc21b12dd7d416de6c81555e87aaec9685c Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 30 Jul 2018 10:10:01 -0400
Subject: block: don't account for split bio's size in cgroup stats

We need to check in blkcg_bio_issue_check if the bio is flagged as
QUEUE_ENTERED, because if it is then we've already accounted for the
size of the IO in the cgroup stats.  We can still however account for
the extra IO since it'll be another request.

Reported-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 3bed5e02a873..f7b910768306 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -769,8 +769,14 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 
 	if (!throtl) {
 		blkg = blkg ?: q->root_blkg;
-		blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
-				bio->bi_iter.bi_size);
+		/*
+		 * If the bio is flagged with BIO_QUEUE_ENTERED it means this
+		 * is a split bio and we would have already accounted for the
+		 * size of the bio.
+		 */
+		if (!bio_flagged(bio, BIO_QUEUE_ENTERED))
+			blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
+					bio->bi_iter.bi_size);
 		blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
 	}
 
-- 
cgit v1.2.3


From ddd0bc756983dc4d19000a4fe021b4c7f9d59aab Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Mon, 30 Jul 2018 00:15:31 +0300
Subject: block: move ref_tag calculation func to the block layer

Currently this function is implemented in the scsi layer, but it's
actual place should be the block layer since T10-PI is a general
data integrity feature that is used in the nvme protocol as well.

Suggested-by: Christoph Hellwig <hch@lst.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/infiniband/ulp/iser/iser_memory.c |  2 +-
 drivers/nvme/host/core.c                  |  3 +--
 drivers/scsi/mpt3sas/mpt3sas_scsih.c      |  2 +-
 drivers/scsi/sd_dif.c                     |  4 ++--
 include/linux/t10-pi.h                    | 10 ++++++++++
 include/scsi/scsi_cmnd.h                  |  7 +------
 6 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index ca844a926e6a..130bf163f066 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -311,7 +311,7 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
 {
 	domain->sig_type = IB_SIG_TYPE_T10_DIF;
 	domain->sig.dif.pi_interval = scsi_prot_interval(sc);
-	domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc);
+	domain->sig.dif.ref_tag = t10_pi_ref_tag(sc->request);
 	/*
 	 * At the moment we hard code those, but in the future
 	 * we will take them from sc.
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e77e6418a21c..16c8b86fe95d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -611,8 +611,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 		case NVME_NS_DPS_PI_TYPE2:
 			control |= NVME_RW_PRINFO_PRCHK_GUARD |
 					NVME_RW_PRINFO_PRCHK_REF;
-			cmnd->rw.reftag = cpu_to_le32(
-					nvme_block_nr(ns, blk_rq_pos(req)));
+			cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
 			break;
 		}
 	}
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index b8d131a455d0..dd738ae5c75b 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -4568,7 +4568,7 @@ _scsih_setup_eedp(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
 		    MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG |
 		    MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD;
 		mpi_request->CDB.EEDP32.PrimaryReferenceTag =
-		    cpu_to_be32(scsi_prot_ref_tag(scmd));
+		    cpu_to_be32(t10_pi_ref_tag(scmd->request));
 		break;
 
 	case SCSI_PROT_DIF_TYPE3:
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index 9035380c0dda..d8de43d359ac 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -124,7 +124,7 @@ void sd_dif_prepare(struct scsi_cmnd *scmd)
 	if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION)
 		return;
 
-	phys = scsi_prot_ref_tag(scmd);
+	phys = t10_pi_ref_tag(scmd->request);
 
 	__rq_for_each_bio(bio, scmd->request) {
 		struct bio_integrity_payload *bip = bio_integrity(bio);
@@ -176,7 +176,7 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes)
 		return;
 
 	intervals = good_bytes / scsi_prot_interval(scmd);
-	phys = scsi_prot_ref_tag(scmd);
+	phys = t10_pi_ref_tag(scmd->request);
 
 	__rq_for_each_bio(bio, scmd->request) {
 		struct bio_integrity_payload *bip = bio_integrity(bio);
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index c6aa8a3c42ed..c40511f4e63d 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -37,6 +37,16 @@ struct t10_pi_tuple {
 #define T10_PI_APP_ESCAPE cpu_to_be16(0xffff)
 #define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff)
 
+static inline u32 t10_pi_ref_tag(struct request *rq)
+{
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+	return blk_rq_pos(rq) >>
+		(rq->q->integrity.interval_exp - 9) & 0xffffffff;
+#else
+	return -1U;
+#endif
+}
+
 extern const struct blk_integrity_profile t10_pi_type1_crc;
 extern const struct blk_integrity_profile t10_pi_type1_ip;
 extern const struct blk_integrity_profile t10_pi_type3_crc;
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index aaf1e971c6a3..cae229b5395c 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -4,6 +4,7 @@
 
 #include <linux/dma-mapping.h>
 #include <linux/blkdev.h>
+#include <linux/t10-pi.h>
 #include <linux/list.h>
 #include <linux/types.h>
 #include <linux/timer.h>
@@ -313,12 +314,6 @@ static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd)
 	return scmd->device->sector_size;
 }
 
-static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd)
-{
-	return blk_rq_pos(scmd->request) >>
-		(ilog2(scsi_prot_interval(scmd)) - 9) & 0xffffffff;
-}
-
 static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd)
 {
 	return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0;
-- 
cgit v1.2.3


From 10c41ddd61323b27b447bc8e18296ac6c06107ad Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Mon, 30 Jul 2018 00:15:32 +0300
Subject: block: move dif_prepare/dif_complete functions to block layer

Currently these functions are implemented in the scsi layer, but their
actual place should be the block layer since T10-PI is a general data
integrity feature that is used in the nvme protocol as well. Also, use
the tuple size from the integrity profile since it may vary between
integrity types.

Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/t10-pi.c         | 110 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/scsi/sd.c      |   8 ++--
 drivers/scsi/sd.h      |   9 ----
 drivers/scsi/sd_dif.c  | 113 -------------------------------------------------
 include/linux/t10-pi.h |   3 ++
 5 files changed, 118 insertions(+), 125 deletions(-)

(limited to 'include/linux')

diff --git a/block/t10-pi.c b/block/t10-pi.c
index a98db384048f..62aed77d0bb9 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -184,3 +184,113 @@ const struct blk_integrity_profile t10_pi_type3_ip = {
 	.verify_fn		= t10_pi_type3_verify_ip,
 };
 EXPORT_SYMBOL(t10_pi_type3_ip);
+
+/**
+ * t10_pi_prepare - prepare PI prior submitting request to device
+ * @rq:              request with PI that should be prepared
+ * @protection_type: PI type (Type 1/Type 2/Type 3)
+ *
+ * For Type 1/Type 2, the virtual start sector is the one that was
+ * originally submitted by the block layer for the ref_tag usage. Due to
+ * partitioning, MD/DM cloning, etc. the actual physical start sector is
+ * likely to be different. Remap protection information to match the
+ * physical LBA.
+ *
+ * Type 3 does not have a reference tag so no remapping is required.
+ */
+void t10_pi_prepare(struct request *rq, u8 protection_type)
+{
+	const int tuple_sz = rq->q->integrity.tuple_size;
+	u32 ref_tag = t10_pi_ref_tag(rq);
+	struct bio *bio;
+
+	if (protection_type == T10_PI_TYPE3_PROTECTION)
+		return;
+
+	__rq_for_each_bio(bio, rq) {
+		struct bio_integrity_payload *bip = bio_integrity(bio);
+		u32 virt = bip_get_seed(bip) & 0xffffffff;
+		struct bio_vec iv;
+		struct bvec_iter iter;
+
+		/* Already remapped? */
+		if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
+			break;
+
+		bip_for_each_vec(iv, bip, iter) {
+			void *p, *pmap;
+			unsigned int j;
+
+			pmap = kmap_atomic(iv.bv_page);
+			p = pmap + iv.bv_offset;
+			for (j = 0; j < iv.bv_len; j += tuple_sz) {
+				struct t10_pi_tuple *pi = p;
+
+				if (be32_to_cpu(pi->ref_tag) == virt)
+					pi->ref_tag = cpu_to_be32(ref_tag);
+				virt++;
+				ref_tag++;
+				p += tuple_sz;
+			}
+
+			kunmap_atomic(pmap);
+		}
+
+		bip->bip_flags |= BIP_MAPPED_INTEGRITY;
+	}
+}
+EXPORT_SYMBOL(t10_pi_prepare);
+
+/**
+ * t10_pi_complete - prepare PI prior returning request to the block layer
+ * @rq:              request with PI that should be prepared
+ * @protection_type: PI type (Type 1/Type 2/Type 3)
+ * @intervals:       total elements to prepare
+ *
+ * For Type 1/Type 2, the virtual start sector is the one that was
+ * originally submitted by the block layer for the ref_tag usage. Due to
+ * partitioning, MD/DM cloning, etc. the actual physical start sector is
+ * likely to be different. Since the physical start sector was submitted
+ * to the device, we should remap it back to virtual values expected by the
+ * block layer.
+ *
+ * Type 3 does not have a reference tag so no remapping is required.
+ */
+void t10_pi_complete(struct request *rq, u8 protection_type,
+		     unsigned int intervals)
+{
+	const int tuple_sz = rq->q->integrity.tuple_size;
+	u32 ref_tag = t10_pi_ref_tag(rq);
+	struct bio *bio;
+
+	if (protection_type == T10_PI_TYPE3_PROTECTION)
+		return;
+
+	__rq_for_each_bio(bio, rq) {
+		struct bio_integrity_payload *bip = bio_integrity(bio);
+		u32 virt = bip_get_seed(bip) & 0xffffffff;
+		struct bio_vec iv;
+		struct bvec_iter iter;
+
+		bip_for_each_vec(iv, bip, iter) {
+			void *p, *pmap;
+			unsigned int j;
+
+			pmap = kmap_atomic(iv.bv_page);
+			p = pmap + iv.bv_offset;
+			for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
+				struct t10_pi_tuple *pi = p;
+
+				if (be32_to_cpu(pi->ref_tag) == ref_tag)
+					pi->ref_tag = cpu_to_be32(virt);
+				virt++;
+				ref_tag++;
+				intervals--;
+				p += tuple_sz;
+			}
+
+			kunmap_atomic(pmap);
+		}
+	}
+}
+EXPORT_SYMBOL(t10_pi_complete);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 9421d9877730..bbebdc3769b0 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1119,7 +1119,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 		SCpnt->cmnd[0] = WRITE_6;
 
 		if (blk_integrity_rq(rq))
-			sd_dif_prepare(SCpnt);
+			t10_pi_prepare(SCpnt->request, sdkp->protection_type);
 
 	} else if (rq_data_dir(rq) == READ) {
 		SCpnt->cmnd[0] = READ_6;
@@ -2047,8 +2047,10 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 					   "sd_done: completed %d of %d bytes\n",
 					   good_bytes, scsi_bufflen(SCpnt)));
 
-	if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt))
-		sd_dif_complete(SCpnt, good_bytes);
+	if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt) &&
+	    good_bytes)
+		t10_pi_complete(SCpnt->request, sdkp->protection_type,
+				good_bytes / scsi_prot_interval(SCpnt));
 
 	return good_bytes;
 }
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 392c7d078ae3..a7d4f50b67d4 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -254,21 +254,12 @@ static inline unsigned int sd_prot_flag_mask(unsigned int prot_op)
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 
 extern void sd_dif_config_host(struct scsi_disk *);
-extern void sd_dif_prepare(struct scsi_cmnd *scmd);
-extern void sd_dif_complete(struct scsi_cmnd *, unsigned int);
 
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 
 static inline void sd_dif_config_host(struct scsi_disk *disk)
 {
 }
-static inline int sd_dif_prepare(struct scsi_cmnd *scmd)
-{
-	return 0;
-}
-static inline void sd_dif_complete(struct scsi_cmnd *cmd, unsigned int a)
-{
-}
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index d8de43d359ac..db72c82486e3 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -95,116 +95,3 @@ out:
 	blk_integrity_register(disk, &bi);
 }
 
-/*
- * The virtual start sector is the one that was originally submitted
- * by the block layer.	Due to partitioning, MD/DM cloning, etc. the
- * actual physical start sector is likely to be different.  Remap
- * protection information to match the physical LBA.
- *
- * From a protocol perspective there's a slight difference between
- * Type 1 and 2.  The latter uses 32-byte CDBs exclusively, and the
- * reference tag is seeded in the CDB.  This gives us the potential to
- * avoid virt->phys remapping during write.  However, at read time we
- * don't know whether the virt sector is the same as when we wrote it
- * (we could be reading from real disk as opposed to MD/DM device.  So
- * we always remap Type 2 making it identical to Type 1.
- *
- * Type 3 does not have a reference tag so no remapping is required.
- */
-void sd_dif_prepare(struct scsi_cmnd *scmd)
-{
-	const int tuple_sz = sizeof(struct t10_pi_tuple);
-	struct bio *bio;
-	struct scsi_disk *sdkp;
-	struct t10_pi_tuple *pi;
-	u32 phys, virt;
-
-	sdkp = scsi_disk(scmd->request->rq_disk);
-
-	if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION)
-		return;
-
-	phys = t10_pi_ref_tag(scmd->request);
-
-	__rq_for_each_bio(bio, scmd->request) {
-		struct bio_integrity_payload *bip = bio_integrity(bio);
-		struct bio_vec iv;
-		struct bvec_iter iter;
-		unsigned int j;
-
-		/* Already remapped? */
-		if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
-			break;
-
-		virt = bip_get_seed(bip) & 0xffffffff;
-
-		bip_for_each_vec(iv, bip, iter) {
-			pi = kmap_atomic(iv.bv_page) + iv.bv_offset;
-
-			for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) {
-
-				if (be32_to_cpu(pi->ref_tag) == virt)
-					pi->ref_tag = cpu_to_be32(phys);
-
-				virt++;
-				phys++;
-			}
-
-			kunmap_atomic(pi);
-		}
-
-		bip->bip_flags |= BIP_MAPPED_INTEGRITY;
-	}
-}
-
-/*
- * Remap physical sector values in the reference tag to the virtual
- * values expected by the block layer.
- */
-void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes)
-{
-	const int tuple_sz = sizeof(struct t10_pi_tuple);
-	struct scsi_disk *sdkp;
-	struct bio *bio;
-	struct t10_pi_tuple *pi;
-	unsigned int j, intervals;
-	u32 phys, virt;
-
-	sdkp = scsi_disk(scmd->request->rq_disk);
-
-	if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION || good_bytes == 0)
-		return;
-
-	intervals = good_bytes / scsi_prot_interval(scmd);
-	phys = t10_pi_ref_tag(scmd->request);
-
-	__rq_for_each_bio(bio, scmd->request) {
-		struct bio_integrity_payload *bip = bio_integrity(bio);
-		struct bio_vec iv;
-		struct bvec_iter iter;
-
-		virt = bip_get_seed(bip) & 0xffffffff;
-
-		bip_for_each_vec(iv, bip, iter) {
-			pi = kmap_atomic(iv.bv_page) + iv.bv_offset;
-
-			for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) {
-
-				if (intervals == 0) {
-					kunmap_atomic(pi);
-					return;
-				}
-
-				if (be32_to_cpu(pi->ref_tag) == phys)
-					pi->ref_tag = cpu_to_be32(virt);
-
-				virt++;
-				phys++;
-				intervals--;
-			}
-
-			kunmap_atomic(pi);
-		}
-	}
-}
-
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index c40511f4e63d..5a427c289f58 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -51,5 +51,8 @@ extern const struct blk_integrity_profile t10_pi_type1_crc;
 extern const struct blk_integrity_profile t10_pi_type1_ip;
 extern const struct blk_integrity_profile t10_pi_type3_crc;
 extern const struct blk_integrity_profile t10_pi_type3_ip;
+extern void t10_pi_prepare(struct request *rq, u8 protection_type);
+extern void t10_pi_complete(struct request *rq, u8 protection_type,
+			    unsigned int intervals);
 
 #endif
-- 
cgit v1.2.3


From 760c435e0f85ed19e48a90d746ce1de2cd02def7 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Thu, 19 Jul 2018 00:09:12 +0200
Subject: mtd: rawnand: make subop helpers return unsigned values

A report from Colin Ian King pointed a CoverityScan issue where error
values on these helpers where not checked in the drivers. These
helpers can error out only in case of a software bug in driver code,
not because of a runtime/hardware error. Hence, let's WARN_ON() in this
case and return 0 which is harmless anyway.

Fixes: 8878b126df76 ("mtd: nand: add ->exec_op() implementation")
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c | 44 ++++++++++++++++++++--------------------
 include/linux/mtd/rawnand.h      | 16 +++++++--------
 2 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index c4b74630f4c5..ef10beab99f5 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -2668,8 +2668,8 @@ static bool nand_subop_instr_is_valid(const struct nand_subop *subop,
 	return subop && instr_idx < subop->ninstrs;
 }
 
-static int nand_subop_get_start_off(const struct nand_subop *subop,
-				    unsigned int instr_idx)
+static unsigned int nand_subop_get_start_off(const struct nand_subop *subop,
+					     unsigned int instr_idx)
 {
 	if (instr_idx)
 		return 0;
@@ -2688,12 +2688,12 @@ static int nand_subop_get_start_off(const struct nand_subop *subop,
  *
  * Given an address instruction, returns the offset of the first cycle to issue.
  */
-int nand_subop_get_addr_start_off(const struct nand_subop *subop,
-				  unsigned int instr_idx)
+unsigned int nand_subop_get_addr_start_off(const struct nand_subop *subop,
+					   unsigned int instr_idx)
 {
-	if (!nand_subop_instr_is_valid(subop, instr_idx) ||
-	    subop->instrs[instr_idx].type != NAND_OP_ADDR_INSTR)
-		return -EINVAL;
+	if (WARN_ON(!nand_subop_instr_is_valid(subop, instr_idx) ||
+		    subop->instrs[instr_idx].type != NAND_OP_ADDR_INSTR))
+		return 0;
 
 	return nand_subop_get_start_off(subop, instr_idx);
 }
@@ -2710,14 +2710,14 @@ EXPORT_SYMBOL_GPL(nand_subop_get_addr_start_off);
  *
  * Given an address instruction, returns the number of address cycle to issue.
  */
-int nand_subop_get_num_addr_cyc(const struct nand_subop *subop,
-				unsigned int instr_idx)
+unsigned int nand_subop_get_num_addr_cyc(const struct nand_subop *subop,
+					 unsigned int instr_idx)
 {
 	int start_off, end_off;
 
-	if (!nand_subop_instr_is_valid(subop, instr_idx) ||
-	    subop->instrs[instr_idx].type != NAND_OP_ADDR_INSTR)
-		return -EINVAL;
+	if (WARN_ON(!nand_subop_instr_is_valid(subop, instr_idx) ||
+		    subop->instrs[instr_idx].type != NAND_OP_ADDR_INSTR))
+		return 0;
 
 	start_off = nand_subop_get_addr_start_off(subop, instr_idx);
 
@@ -2742,12 +2742,12 @@ EXPORT_SYMBOL_GPL(nand_subop_get_num_addr_cyc);
  *
  * Given a data instruction, returns the offset to start from.
  */
-int nand_subop_get_data_start_off(const struct nand_subop *subop,
-				  unsigned int instr_idx)
+unsigned int nand_subop_get_data_start_off(const struct nand_subop *subop,
+					   unsigned int instr_idx)
 {
-	if (!nand_subop_instr_is_valid(subop, instr_idx) ||
-	    !nand_instr_is_data(&subop->instrs[instr_idx]))
-		return -EINVAL;
+	if (WARN_ON(!nand_subop_instr_is_valid(subop, instr_idx) ||
+		    !nand_instr_is_data(&subop->instrs[instr_idx])))
+		return 0;
 
 	return nand_subop_get_start_off(subop, instr_idx);
 }
@@ -2764,14 +2764,14 @@ EXPORT_SYMBOL_GPL(nand_subop_get_data_start_off);
  *
  * Returns the length of the chunk of data to send/receive.
  */
-int nand_subop_get_data_len(const struct nand_subop *subop,
-			    unsigned int instr_idx)
+unsigned int nand_subop_get_data_len(const struct nand_subop *subop,
+				     unsigned int instr_idx)
 {
 	int start_off = 0, end_off;
 
-	if (!nand_subop_instr_is_valid(subop, instr_idx) ||
-	    !nand_instr_is_data(&subop->instrs[instr_idx]))
-		return -EINVAL;
+	if (WARN_ON(!nand_subop_instr_is_valid(subop, instr_idx) ||
+		    !nand_instr_is_data(&subop->instrs[instr_idx])))
+		return 0;
 
 	start_off = nand_subop_get_data_start_off(subop, instr_idx);
 
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index f60fad29eae6..598d356de83f 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1007,14 +1007,14 @@ struct nand_subop {
 	unsigned int last_instr_end_off;
 };
 
-int nand_subop_get_addr_start_off(const struct nand_subop *subop,
-				  unsigned int op_id);
-int nand_subop_get_num_addr_cyc(const struct nand_subop *subop,
-				unsigned int op_id);
-int nand_subop_get_data_start_off(const struct nand_subop *subop,
-				  unsigned int op_id);
-int nand_subop_get_data_len(const struct nand_subop *subop,
-			    unsigned int op_id);
+unsigned int nand_subop_get_addr_start_off(const struct nand_subop *subop,
+					   unsigned int op_id);
+unsigned int nand_subop_get_num_addr_cyc(const struct nand_subop *subop,
+					 unsigned int op_id);
+unsigned int nand_subop_get_data_start_off(const struct nand_subop *subop,
+					   unsigned int op_id);
+unsigned int nand_subop_get_data_len(const struct nand_subop *subop,
+				     unsigned int op_id);
 
 /**
  * struct nand_op_parser_addr_constraints - Constraints for address instructions
-- 
cgit v1.2.3


From 7da45139d264f3b7ead04e00ebb29b189cf9826e Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Tue, 17 Jul 2018 09:08:02 +0200
Subject: mtd: rawnand: better name for the controller structure

In the raw NAND core, a NAND chip is described by a nand_chip structure,
while a NAND controller is described with a nand_hw_control structure
which is not very meaningful.

Rename this structure nand_controller.

As the structure gets renamed, it is logical to also rename the core
function initializing it from nand_hw_control_init() to
nand_controller_init().

Lastly, the 'hwcontrol' entry of the nand_chip structure is not
meaningful neither while it has the role of fallback when no controller
structure is provided by the driver (the controller driver is dumb and
can only control a single chip). Thus, it is renamed dummy_controller.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 drivers/mtd/nand/raw/atmel/nand-controller.c | 10 +++++-----
 drivers/mtd/nand/raw/brcmnand/brcmnand.c     |  4 ++--
 drivers/mtd/nand/raw/docg4.c                 |  4 ++--
 drivers/mtd/nand/raw/fsl_elbc_nand.c         |  4 ++--
 drivers/mtd/nand/raw/fsl_ifc_nand.c          |  4 ++--
 drivers/mtd/nand/raw/jz4780_nand.c           |  7 ++++---
 drivers/mtd/nand/raw/marvell_nand.c          |  6 +++---
 drivers/mtd/nand/raw/mtk_nand.c              |  2 +-
 drivers/mtd/nand/raw/nand_base.c             |  4 ++--
 drivers/mtd/nand/raw/ndfc.c                  |  4 ++--
 drivers/mtd/nand/raw/omap2.c                 |  2 +-
 drivers/mtd/nand/raw/oxnas_nand.c            |  4 ++--
 drivers/mtd/nand/raw/qcom_nandc.c            |  4 ++--
 drivers/mtd/nand/raw/s3c2410.c               |  4 ++--
 drivers/mtd/nand/raw/sunxi_nand.c            |  6 +++---
 drivers/mtd/nand/raw/tango_nand.c            |  4 ++--
 drivers/mtd/nand/raw/tegra_nand.c            |  6 +++---
 drivers/mtd/nand/raw/txx9ndfmc.c             |  4 ++--
 include/linux/mtd/rawnand.h                  | 14 ++++++++------
 19 files changed, 50 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/atmel/nand-controller.c b/drivers/mtd/nand/raw/atmel/nand-controller.c
index 30dae4c9d439..855cc7729c43 100644
--- a/drivers/mtd/nand/raw/atmel/nand-controller.c
+++ b/drivers/mtd/nand/raw/atmel/nand-controller.c
@@ -216,7 +216,7 @@ struct atmel_nand_controller_caps {
 };
 
 struct atmel_nand_controller {
-	struct nand_hw_control base;
+	struct nand_controller base;
 	const struct atmel_nand_controller_caps *caps;
 	struct device *dev;
 	struct regmap *smc;
@@ -227,7 +227,7 @@ struct atmel_nand_controller {
 };
 
 static inline struct atmel_nand_controller *
-to_nand_controller(struct nand_hw_control *ctl)
+to_nand_controller(struct nand_controller *ctl)
 {
 	return container_of(ctl, struct atmel_nand_controller, base);
 }
@@ -239,7 +239,7 @@ struct atmel_smc_nand_controller {
 };
 
 static inline struct atmel_smc_nand_controller *
-to_smc_nand_controller(struct nand_hw_control *ctl)
+to_smc_nand_controller(struct nand_controller *ctl)
 {
 	return container_of(to_nand_controller(ctl),
 			    struct atmel_smc_nand_controller, base);
@@ -263,7 +263,7 @@ struct atmel_hsmc_nand_controller {
 };
 
 static inline struct atmel_hsmc_nand_controller *
-to_hsmc_nand_controller(struct nand_hw_control *ctl)
+to_hsmc_nand_controller(struct nand_controller *ctl)
 {
 	return container_of(to_nand_controller(ctl),
 			    struct atmel_hsmc_nand_controller, base);
@@ -1966,7 +1966,7 @@ static int atmel_nand_controller_init(struct atmel_nand_controller *nc,
 	struct device_node *np = dev->of_node;
 	int ret;
 
-	nand_hw_control_init(&nc->base);
+	nand_controller_init(&nc->base);
 	INIT_LIST_HEAD(&nc->chips);
 	nc->dev = dev;
 	nc->caps = caps;
diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
index 1306aaa7a8bf..2e5efa0f9ea2 100644
--- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
+++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
@@ -114,7 +114,7 @@ enum {
 
 struct brcmnand_controller {
 	struct device		*dev;
-	struct nand_hw_control	controller;
+	struct nand_controller	controller;
 	void __iomem		*nand_base;
 	void __iomem		*nand_fc; /* flash cache */
 	void __iomem		*flash_dma_base;
@@ -2433,7 +2433,7 @@ int brcmnand_probe(struct platform_device *pdev, struct brcmnand_soc *soc)
 
 	init_completion(&ctrl->done);
 	init_completion(&ctrl->dma_done);
-	nand_hw_control_init(&ctrl->controller);
+	nand_controller_init(&ctrl->controller);
 	INIT_LIST_HEAD(&ctrl->host_list);
 
 	/* NAND register range */
diff --git a/drivers/mtd/nand/raw/docg4.c b/drivers/mtd/nand/raw/docg4.c
index bb96cb33cd6b..4dccdfba6140 100644
--- a/drivers/mtd/nand/raw/docg4.c
+++ b/drivers/mtd/nand/raw/docg4.c
@@ -1257,8 +1257,8 @@ static void __init init_mtd_structs(struct mtd_info *mtd)
 	nand->ecc.strength = DOCG4_T;
 	nand->options = NAND_BUSWIDTH_16 | NAND_NO_SUBPAGE_WRITE;
 	nand->IO_ADDR_R = nand->IO_ADDR_W = doc->virtadr + DOC_IOSPACE_DATA;
-	nand->controller = &nand->hwcontrol;
-	nand_hw_control_init(nand->controller);
+	nand->controller = &nand->dummy_controller;
+	nand_controller_init(nand->controller);
 
 	/* methods */
 	nand->cmdfunc = docg4_command;
diff --git a/drivers/mtd/nand/raw/fsl_elbc_nand.c b/drivers/mtd/nand/raw/fsl_elbc_nand.c
index 51f0b340bc0d..0aa54a949653 100644
--- a/drivers/mtd/nand/raw/fsl_elbc_nand.c
+++ b/drivers/mtd/nand/raw/fsl_elbc_nand.c
@@ -61,7 +61,7 @@ struct fsl_elbc_mtd {
 /* Freescale eLBC FCM controller information */
 
 struct fsl_elbc_fcm_ctrl {
-	struct nand_hw_control controller;
+	struct nand_controller controller;
 	struct fsl_elbc_mtd *chips[MAX_BANKS];
 
 	u8 __iomem *addr;        /* Address of assigned FCM buffer        */
@@ -879,7 +879,7 @@ static int fsl_elbc_nand_probe(struct platform_device *pdev)
 		}
 		elbc_fcm_ctrl->counter++;
 
-		nand_hw_control_init(&elbc_fcm_ctrl->controller);
+		nand_controller_init(&elbc_fcm_ctrl->controller);
 		fsl_lbc_ctrl_dev->nand = elbc_fcm_ctrl;
 	} else {
 		elbc_fcm_ctrl = fsl_lbc_ctrl_dev->nand;
diff --git a/drivers/mtd/nand/raw/fsl_ifc_nand.c b/drivers/mtd/nand/raw/fsl_ifc_nand.c
index 75d3c951f61a..96130d91e32c 100644
--- a/drivers/mtd/nand/raw/fsl_ifc_nand.c
+++ b/drivers/mtd/nand/raw/fsl_ifc_nand.c
@@ -51,7 +51,7 @@ struct fsl_ifc_mtd {
 
 /* overview of the fsl ifc controller */
 struct fsl_ifc_nand_ctrl {
-	struct nand_hw_control controller;
+	struct nand_controller controller;
 	struct fsl_ifc_mtd *chips[FSL_IFC_BANK_COUNT];
 
 	void __iomem *addr;	/* Address of assigned IFC buffer	*/
@@ -1004,7 +1004,7 @@ static int fsl_ifc_nand_probe(struct platform_device *dev)
 		ifc_nand_ctrl->addr = NULL;
 		fsl_ifc_ctrl_dev->nand = ifc_nand_ctrl;
 
-		nand_hw_control_init(&ifc_nand_ctrl->controller);
+		nand_controller_init(&ifc_nand_ctrl->controller);
 	} else {
 		ifc_nand_ctrl = fsl_ifc_ctrl_dev->nand;
 	}
diff --git a/drivers/mtd/nand/raw/jz4780_nand.c b/drivers/mtd/nand/raw/jz4780_nand.c
index e69f6ae4c539..49841dad347c 100644
--- a/drivers/mtd/nand/raw/jz4780_nand.c
+++ b/drivers/mtd/nand/raw/jz4780_nand.c
@@ -44,7 +44,7 @@ struct jz4780_nand_cs {
 struct jz4780_nand_controller {
 	struct device *dev;
 	struct jz4780_bch *bch;
-	struct nand_hw_control controller;
+	struct nand_controller controller;
 	unsigned int num_banks;
 	struct list_head chips;
 	int selected;
@@ -65,7 +65,8 @@ static inline struct jz4780_nand_chip *to_jz4780_nand_chip(struct mtd_info *mtd)
 	return container_of(mtd_to_nand(mtd), struct jz4780_nand_chip, chip);
 }
 
-static inline struct jz4780_nand_controller *to_jz4780_nand_controller(struct nand_hw_control *ctrl)
+static inline struct jz4780_nand_controller
+*to_jz4780_nand_controller(struct nand_controller *ctrl)
 {
 	return container_of(ctrl, struct jz4780_nand_controller, controller);
 }
@@ -368,7 +369,7 @@ static int jz4780_nand_probe(struct platform_device *pdev)
 	nfc->dev = dev;
 	nfc->num_banks = num_banks;
 
-	nand_hw_control_init(&nfc->controller);
+	nand_controller_init(&nfc->controller);
 	INIT_LIST_HEAD(&nfc->chips);
 
 	ret = jz4780_nand_init_chips(nfc, pdev);
diff --git a/drivers/mtd/nand/raw/marvell_nand.c b/drivers/mtd/nand/raw/marvell_nand.c
index 80a074cccb82..bd5f9a4b7b16 100644
--- a/drivers/mtd/nand/raw/marvell_nand.c
+++ b/drivers/mtd/nand/raw/marvell_nand.c
@@ -318,7 +318,7 @@ struct marvell_nfc_caps {
  * @dma_buf:		32-bit aligned buffer for DMA transfers (NFCv1 only)
  */
 struct marvell_nfc {
-	struct nand_hw_control controller;
+	struct nand_controller controller;
 	struct device *dev;
 	void __iomem *regs;
 	struct clk *core_clk;
@@ -335,7 +335,7 @@ struct marvell_nfc {
 	u8 *dma_buf;
 };
 
-static inline struct marvell_nfc *to_marvell_nfc(struct nand_hw_control *ctrl)
+static inline struct marvell_nfc *to_marvell_nfc(struct nand_controller *ctrl)
 {
 	return container_of(ctrl, struct marvell_nfc, controller);
 }
@@ -2745,7 +2745,7 @@ static int marvell_nfc_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	nfc->dev = dev;
-	nand_hw_control_init(&nfc->controller);
+	nand_controller_init(&nfc->controller);
 	INIT_LIST_HEAD(&nfc->chips);
 
 	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
diff --git a/drivers/mtd/nand/raw/mtk_nand.c b/drivers/mtd/nand/raw/mtk_nand.c
index e6b14b79c8a8..7bc6be3f6ec0 100644
--- a/drivers/mtd/nand/raw/mtk_nand.c
+++ b/drivers/mtd/nand/raw/mtk_nand.c
@@ -145,7 +145,7 @@ struct mtk_nfc_clk {
 };
 
 struct mtk_nfc {
-	struct nand_hw_control controller;
+	struct nand_controller controller;
 	struct mtk_ecc_config ecc_cfg;
 	struct mtk_nfc_clk clk;
 	struct mtk_ecc *ecc;
diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index e545e03a214e..dcdf0f373100 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -5000,8 +5000,8 @@ static void nand_set_defaults(struct nand_chip *chip)
 		chip->read_buf = busw ? nand_read_buf16 : nand_read_buf;
 
 	if (!chip->controller) {
-		chip->controller = &chip->hwcontrol;
-		nand_hw_control_init(chip->controller);
+		chip->controller = &chip->dummy_controller;
+		nand_controller_init(chip->controller);
 	}
 
 	if (!chip->buf_align)
diff --git a/drivers/mtd/nand/raw/ndfc.c b/drivers/mtd/nand/raw/ndfc.c
index d8a806894937..540fa1a0ea24 100644
--- a/drivers/mtd/nand/raw/ndfc.c
+++ b/drivers/mtd/nand/raw/ndfc.c
@@ -39,7 +39,7 @@ struct ndfc_controller {
 	void __iomem *ndfcbase;
 	struct nand_chip chip;
 	int chip_select;
-	struct nand_hw_control ndfc_control;
+	struct nand_controller ndfc_control;
 };
 
 static struct ndfc_controller ndfc_ctrl[NDFC_MAX_CS];
@@ -218,7 +218,7 @@ static int ndfc_probe(struct platform_device *ofdev)
 	ndfc = &ndfc_ctrl[cs];
 	ndfc->chip_select = cs;
 
-	nand_hw_control_init(&ndfc->ndfc_control);
+	nand_controller_init(&ndfc->ndfc_control);
 	ndfc->ofdev = ofdev;
 	dev_set_drvdata(&ofdev->dev, ndfc);
 
diff --git a/drivers/mtd/nand/raw/omap2.c b/drivers/mtd/nand/raw/omap2.c
index e50c64adc3c8..e943b2e5a5e2 100644
--- a/drivers/mtd/nand/raw/omap2.c
+++ b/drivers/mtd/nand/raw/omap2.c
@@ -145,7 +145,7 @@ static u_char bch8_vector[] = {0xf3, 0xdb, 0x14, 0x16, 0x8b, 0xd2, 0xbe, 0xcc,
 static u_char bch4_vector[] = {0x00, 0x6b, 0x31, 0xdd, 0x41, 0xbc, 0x10};
 
 /* Shared among all NAND instances to synchronize access to the ECC Engine */
-static struct nand_hw_control omap_gpmc_controller = {
+static struct nand_controller omap_gpmc_controller = {
 	.lock = __SPIN_LOCK_UNLOCKED(omap_gpmc_controller.lock),
 	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(omap_gpmc_controller.wq),
 };
diff --git a/drivers/mtd/nand/raw/oxnas_nand.c b/drivers/mtd/nand/raw/oxnas_nand.c
index d649d5944826..01b00bb69c1e 100644
--- a/drivers/mtd/nand/raw/oxnas_nand.c
+++ b/drivers/mtd/nand/raw/oxnas_nand.c
@@ -32,7 +32,7 @@
 #define OXNAS_NAND_MAX_CHIPS	1
 
 struct oxnas_nand_ctrl {
-	struct nand_hw_control base;
+	struct nand_controller base;
 	void __iomem *io_base;
 	struct clk *clk;
 	struct nand_chip *chips[OXNAS_NAND_MAX_CHIPS];
@@ -96,7 +96,7 @@ static int oxnas_nand_probe(struct platform_device *pdev)
 	if (!oxnas)
 		return -ENOMEM;
 
-	nand_hw_control_init(&oxnas->base);
+	nand_controller_init(&oxnas->base);
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	oxnas->io_base = devm_ioremap_resource(&pdev->dev, res);
diff --git a/drivers/mtd/nand/raw/qcom_nandc.c b/drivers/mtd/nand/raw/qcom_nandc.c
index 645630953f38..aa6c3e026ef1 100644
--- a/drivers/mtd/nand/raw/qcom_nandc.c
+++ b/drivers/mtd/nand/raw/qcom_nandc.c
@@ -365,7 +365,7 @@ struct nandc_regs {
  *				from all connected NAND devices pagesize
  */
 struct qcom_nand_controller {
-	struct nand_hw_control controller;
+	struct nand_controller controller;
 	struct list_head host_list;
 
 	struct device *dev;
@@ -2728,7 +2728,7 @@ static int qcom_nandc_alloc(struct qcom_nand_controller *nandc)
 	INIT_LIST_HEAD(&nandc->desc_list);
 	INIT_LIST_HEAD(&nandc->host_list);
 
-	nand_hw_control_init(&nandc->controller);
+	nand_controller_init(&nandc->controller);
 
 	return 0;
 }
diff --git a/drivers/mtd/nand/raw/s3c2410.c b/drivers/mtd/nand/raw/s3c2410.c
index 5a4a68790653..e8bf64832213 100644
--- a/drivers/mtd/nand/raw/s3c2410.c
+++ b/drivers/mtd/nand/raw/s3c2410.c
@@ -162,7 +162,7 @@ enum s3c_nand_clk_state {
  */
 struct s3c2410_nand_info {
 	/* mtd info */
-	struct nand_hw_control		controller;
+	struct nand_controller		controller;
 	struct s3c2410_nand_mtd		*mtds;
 	struct s3c2410_platform_nand	*platform;
 
@@ -1094,7 +1094,7 @@ static int s3c24xx_nand_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, info);
 
-	nand_hw_control_init(&info->controller);
+	nand_controller_init(&info->controller);
 
 	/* get the clock source and enable it */
 
diff --git a/drivers/mtd/nand/raw/sunxi_nand.c b/drivers/mtd/nand/raw/sunxi_nand.c
index 4b11cd4a79be..07f3ff9a28f2 100644
--- a/drivers/mtd/nand/raw/sunxi_nand.c
+++ b/drivers/mtd/nand/raw/sunxi_nand.c
@@ -234,7 +234,7 @@ static inline struct sunxi_nand_chip *to_sunxi_nand(struct nand_chip *nand)
  *			controller events
  */
 struct sunxi_nfc {
-	struct nand_hw_control controller;
+	struct nand_controller controller;
 	struct device *dev;
 	void __iomem *regs;
 	struct clk *ahb_clk;
@@ -247,7 +247,7 @@ struct sunxi_nfc {
 	struct dma_chan *dmac;
 };
 
-static inline struct sunxi_nfc *to_sunxi_nfc(struct nand_hw_control *ctrl)
+static inline struct sunxi_nfc *to_sunxi_nfc(struct nand_controller *ctrl)
 {
 	return container_of(ctrl, struct sunxi_nfc, controller);
 }
@@ -2012,7 +2012,7 @@ static int sunxi_nfc_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	nfc->dev = dev;
-	nand_hw_control_init(&nfc->controller);
+	nand_controller_init(&nfc->controller);
 	INIT_LIST_HEAD(&nfc->chips);
 
 	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
diff --git a/drivers/mtd/nand/raw/tango_nand.c b/drivers/mtd/nand/raw/tango_nand.c
index f2052fae21c7..dd7a26efdf4f 100644
--- a/drivers/mtd/nand/raw/tango_nand.c
+++ b/drivers/mtd/nand/raw/tango_nand.c
@@ -83,7 +83,7 @@
 #define MAX_CS		4
 
 struct tango_nfc {
-	struct nand_hw_control hw;
+	struct nand_controller hw;
 	void __iomem *reg_base;
 	void __iomem *mem_base;
 	void __iomem *pbus_base;
@@ -654,7 +654,7 @@ static int tango_nand_probe(struct platform_device *pdev)
 		return PTR_ERR(nfc->chan);
 
 	platform_set_drvdata(pdev, nfc);
-	nand_hw_control_init(&nfc->hw);
+	nand_controller_init(&nfc->hw);
 	nfc->freq_kHz = clk_get_rate(clk) / 1000;
 
 	for_each_child_of_node(pdev->dev.of_node, np) {
diff --git a/drivers/mtd/nand/raw/tegra_nand.c b/drivers/mtd/nand/raw/tegra_nand.c
index 56c0aa1bc81f..31c0d9ca9d23 100644
--- a/drivers/mtd/nand/raw/tegra_nand.c
+++ b/drivers/mtd/nand/raw/tegra_nand.c
@@ -164,7 +164,7 @@
 				HWSTATUS_RBSY_VALUE(NAND_STATUS_READY))
 
 struct tegra_nand_controller {
-	struct nand_hw_control controller;
+	struct nand_controller controller;
 	struct device *dev;
 	void __iomem *regs;
 	int irq;
@@ -187,7 +187,7 @@ struct tegra_nand_chip {
 };
 
 static inline struct tegra_nand_controller *
-			to_tegra_ctrl(struct nand_hw_control *hw_ctrl)
+			to_tegra_ctrl(struct nand_controller *hw_ctrl)
 {
 	return container_of(hw_ctrl, struct tegra_nand_controller, controller);
 }
@@ -1136,7 +1136,7 @@ static int tegra_nand_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	ctrl->dev = &pdev->dev;
-	nand_hw_control_init(&ctrl->controller);
+	nand_controller_init(&ctrl->controller);
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	ctrl->regs = devm_ioremap_resource(&pdev->dev, res);
diff --git a/drivers/mtd/nand/raw/txx9ndfmc.c b/drivers/mtd/nand/raw/txx9ndfmc.c
index 5fe9da8b4a0a..8f5bbbac4612 100644
--- a/drivers/mtd/nand/raw/txx9ndfmc.c
+++ b/drivers/mtd/nand/raw/txx9ndfmc.c
@@ -73,7 +73,7 @@ struct txx9ndfmc_drvdata {
 	void __iomem *base;
 	unsigned char hold;	/* in gbusclock */
 	unsigned char spw;	/* in gbusclock */
-	struct nand_hw_control hw_control;
+	struct nand_controller hw_control;
 };
 
 static struct platform_device *mtd_to_platdev(struct mtd_info *mtd)
@@ -303,7 +303,7 @@ static int __init txx9ndfmc_probe(struct platform_device *dev)
 	dev_info(&dev->dev, "CLK:%ldMHz HOLD:%d SPW:%d\n",
 		 (gbusclk + 500000) / 1000000, hold, spw);
 
-	nand_hw_control_init(&drvdata->hw_control);
+	nand_controller_init(&drvdata->hw_control);
 
 	platform_set_drvdata(dev, drvdata);
 	txx9ndfmc_initialize(dev);
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 598d356de83f..93a2678e0f0d 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -510,20 +510,21 @@ struct nand_id {
 };
 
 /**
- * struct nand_hw_control - Control structure for hardware controller (e.g ECC generator) shared among independent devices
+ * struct nand_controller - Structure used to describe a NAND controller
+ *
  * @lock:               protection lock
  * @active:		the mtd device which holds the controller currently
  * @wq:			wait queue to sleep on if a NAND operation is in
  *			progress used instead of the per chip wait queue
  *			when a hw controller is available.
  */
-struct nand_hw_control {
+struct nand_controller {
 	spinlock_t lock;
 	struct nand_chip *active;
 	wait_queue_head_t wq;
 };
 
-static inline void nand_hw_control_init(struct nand_hw_control *nfc)
+static inline void nand_controller_init(struct nand_controller *nfc)
 {
 	nfc->active = NULL;
 	spin_lock_init(&nfc->lock);
@@ -1197,7 +1198,8 @@ int nand_op_parser_exec_op(struct nand_chip *chip,
  *			setting the read-retry mode. Mostly needed for MLC NAND.
  * @ecc:		[BOARDSPECIFIC] ECC control structure
  * @buf_align:		minimum buffer alignment required by a platform
- * @hwcontrol:		platform-specific hardware control structure
+ * @dummy_controller:	dummy controller implementation for drivers that can
+ *			only control a single chip
  * @erase:		[REPLACEABLE] erase function
  * @chip_delay:		[BOARDSPECIFIC] chip dependent delay for transferring
  *			data from array to read regs (tR).
@@ -1333,11 +1335,11 @@ struct nand_chip {
 	flstate_t state;
 
 	uint8_t *oob_poi;
-	struct nand_hw_control *controller;
+	struct nand_controller *controller;
 
 	struct nand_ecc_ctrl ecc;
 	unsigned long buf_align;
-	struct nand_hw_control hwcontrol;
+	struct nand_controller dummy_controller;
 
 	uint8_t *bbt;
 	struct nand_bbt_descr *bbt_td;
-- 
cgit v1.2.3


From 05b54c7bac906c443fd0b23ab8954e0560b33e5c Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Thu, 19 Jul 2018 01:05:46 +0200
Subject: mtd: rawnand: add hooks that may be called during nand_scan()

In order to remove the limitation that forbids dynamic allocation in
nand_scan_ident(), we must create a path that will be the same for all
controller drivers. The idea is to use nand_scan() instead of the widely
used nand_scan_ident()/nand_scan_tail() couple. In order to achieve
this, controller drivers will need to adjust some parameters between
these two functions depending on the NAND chip wired on them.

This takes the form of two new hooks (->{attach,detach}_chip()) that are
placed in a new nand_controller_ops structure, which is then attached
to the nand_controller object at driver initialization time.
->attach_chip() is called between nand_scan_ident() and
nand_scan_tail(), and ->detach_chip() is called in the error path of
nand_scan() and in nand_cleanup().

Note that some NAND controller drivers don't have a dedicated
nand_controller object and instead rely on the default/dummy one
embedded in nand_chip. If you're in this case and still want to
initialize the controller ops, you'll have to manipulate
chip->dummy_controller directly.

Last but not least, it's worth mentioning that we plan to move some of
the controller related hooks placed in nand_chip into
nand_controller_ops to make the separation between NAND chip and NAND
controller methods clearer.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c | 32 ++++++++++++++++++++++++++++++--
 include/linux/mtd/rawnand.h      | 21 +++++++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index dcdf0f373100..dea41fa25be1 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -6718,6 +6718,20 @@ err_free_buf:
 }
 EXPORT_SYMBOL(nand_scan_tail);
 
+static int nand_attach(struct nand_chip *chip)
+{
+	if (chip->controller->ops && chip->controller->ops->attach_chip)
+		return chip->controller->ops->attach_chip(chip);
+
+	return 0;
+}
+
+static void nand_detach(struct nand_chip *chip)
+{
+	if (chip->controller->ops && chip->controller->ops->detach_chip)
+		chip->controller->ops->detach_chip(chip);
+}
+
 /**
  * nand_scan_with_ids - [NAND Interface] Scan for the NAND device
  * @mtd: MTD device structure
@@ -6731,11 +6745,21 @@ EXPORT_SYMBOL(nand_scan_tail);
 int nand_scan_with_ids(struct mtd_info *mtd, int maxchips,
 		       struct nand_flash_dev *ids)
 {
+	struct nand_chip *chip = mtd_to_nand(mtd);
 	int ret;
 
 	ret = nand_scan_ident(mtd, maxchips, ids);
-	if (!ret)
-		ret = nand_scan_tail(mtd);
+	if (ret)
+		return ret;
+
+	ret = nand_attach(chip);
+	if (ret)
+		return ret;
+
+	ret = nand_scan_tail(mtd);
+	if (ret)
+		nand_detach(chip);
+
 	return ret;
 }
 EXPORT_SYMBOL(nand_scan_with_ids);
@@ -6763,7 +6787,11 @@ void nand_cleanup(struct nand_chip *chip)
 
 	/* Free manufacturer priv data. */
 	nand_manufacturer_cleanup(chip);
+
+	/* Free controller specific allocations after chip identification */
+	nand_detach(chip);
 }
+
 EXPORT_SYMBOL_GPL(nand_cleanup);
 
 /**
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 93a2678e0f0d..fbd6b29cf22c 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -509,6 +509,25 @@ struct nand_id {
 	int len;
 };
 
+/**
+ * struct nand_controller_ops - Controller operations
+ *
+ * @attach_chip: this method is called after the NAND detection phase after
+ *		 flash ID and MTD fields such as erase size, page size and OOB
+ *		 size have been set up. ECC requirements are available if
+ *		 provided by the NAND chip or device tree. Typically used to
+ *		 choose the appropriate ECC configuration and allocate
+ *		 associated resources.
+ *		 This hook is optional.
+ * @detach_chip: free all resources allocated/claimed in
+ *		 nand_controller_ops->attach_chip().
+ *		 This hook is optional.
+ */
+struct nand_controller_ops {
+	int (*attach_chip)(struct nand_chip *chip);
+	void (*detach_chip)(struct nand_chip *chip);
+};
+
 /**
  * struct nand_controller - Structure used to describe a NAND controller
  *
@@ -517,11 +536,13 @@ struct nand_id {
  * @wq:			wait queue to sleep on if a NAND operation is in
  *			progress used instead of the per chip wait queue
  *			when a hw controller is available.
+ * @ops:		NAND controller operations.
  */
 struct nand_controller {
 	spinlock_t lock;
 	struct nand_chip *active;
 	wait_queue_head_t wq;
+	const struct nand_controller_ops *ops;
 };
 
 static inline void nand_controller_init(struct nand_controller *nfc)
-- 
cgit v1.2.3


From 98732da1a08ebb666983d469981a8b994e77d556 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 25 Jul 2018 15:31:50 +0200
Subject: mtd: rawnand: do not export nand_scan_[ident|tail]() anymore

Both nand_scan_ident() and nand_scan_tail() helpers used to be called
directly from controller drivers that needed to tweak some ECC-related
parameters before nand_scan_tail(). This separation prevented dynamic
allocations during the phase of NAND identification, which was
inconvenient.

All controller drivers have been moved to use nand_scan(), in
conjunction with the chip->ecc.[attach|detach]_chip() hooks that
actually do the required tweaking sequence between both ident/tail
calls, allowing programmers to use dynamic allocation as they need all
across the scanning sequence.

Declare nand_scan_[ident|tail]() statically now.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c | 16 +++++++++-------
 include/linux/mtd/rawnand.h      | 18 ++++++------------
 2 files changed, 15 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 42a7a934a17b..34ea44f90fd8 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -5924,7 +5924,7 @@ static int nand_dt_init(struct nand_chip *chip)
 }
 
 /**
- * nand_scan_ident - [NAND Interface] Scan for the NAND device
+ * nand_scan_ident - Scan for the NAND device
  * @mtd: MTD device structure
  * @maxchips: number of chips to scan for
  * @table: alternative NAND ID table
@@ -5932,9 +5932,13 @@ static int nand_dt_init(struct nand_chip *chip)
  * This is the first phase of the normal nand_scan() function. It reads the
  * flash ID and sets up MTD fields accordingly.
  *
+ * This helper used to be called directly from controller drivers that needed
+ * to tweak some ECC-related parameters before nand_scan_tail(). This separation
+ * prevented dynamic allocations during this phase which was unconvenient and
+ * as been banned for the benefit of the ->init_ecc()/cleanup_ecc() hooks.
  */
-int nand_scan_ident(struct mtd_info *mtd, int maxchips,
-		    struct nand_flash_dev *table)
+static int nand_scan_ident(struct mtd_info *mtd, int maxchips,
+			   struct nand_flash_dev *table)
 {
 	int i, nand_maf_id, nand_dev_id;
 	struct nand_chip *chip = mtd_to_nand(mtd);
@@ -6008,7 +6012,6 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips,
 
 	return 0;
 }
-EXPORT_SYMBOL(nand_scan_ident);
 
 static int nand_set_ecc_soft_ops(struct mtd_info *mtd)
 {
@@ -6385,14 +6388,14 @@ static bool nand_ecc_strength_good(struct mtd_info *mtd)
 }
 
 /**
- * nand_scan_tail - [NAND Interface] Scan for the NAND device
+ * nand_scan_tail - Scan for the NAND device
  * @mtd: MTD device structure
  *
  * This is the second phase of the normal nand_scan() function. It fills out
  * all the uninitialized function pointers with the defaults and scans for a
  * bad block table if appropriate.
  */
-int nand_scan_tail(struct mtd_info *mtd)
+static int nand_scan_tail(struct mtd_info *mtd)
 {
 	struct nand_chip *chip = mtd_to_nand(mtd);
 	struct nand_ecc_ctrl *ecc = &chip->ecc;
@@ -6716,7 +6719,6 @@ err_free_buf:
 
 	return ret;
 }
-EXPORT_SYMBOL(nand_scan_tail);
 
 static int nand_attach(struct nand_chip *chip)
 {
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index fbd6b29cf22c..71571ed23a20 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -35,17 +35,6 @@ static inline int nand_scan(struct mtd_info *mtd, int max_chips)
 	return nand_scan_with_ids(mtd, max_chips, NULL);
 }
 
-/*
- * Separate phases of nand_scan(), allowing board driver to intervene
- * and override command or ECC setup according to flash type.
- */
-int nand_scan_ident(struct mtd_info *mtd, int max_chips,
-			   struct nand_flash_dev *table);
-int nand_scan_tail(struct mtd_info *mtd);
-
-/* Unregister the MTD device and free resources held by the NAND device */
-void nand_release(struct mtd_info *mtd);
-
 /* Internal helper for board drivers which need to override command function */
 void nand_wait_ready(struct mtd_info *mtd);
 
@@ -1745,8 +1734,13 @@ int nand_read_data_op(struct nand_chip *chip, void *buf, unsigned int len,
 int nand_write_data_op(struct nand_chip *chip, const void *buf,
 		       unsigned int len, bool force_8bit);
 
-/* Free resources held by the NAND device */
+/*
+ * Free resources held by the NAND device, must be called on error after a
+ * sucessful nand_scan().
+ */
 void nand_cleanup(struct nand_chip *chip);
+/* Unregister the MTD device and calls nand_cleanup() */
+void nand_release(struct mtd_info *mtd);
 
 /* Default extended ID decoding function */
 void nand_decode_ext_id(struct nand_chip *chip);
-- 
cgit v1.2.3


From 2023f1fa216f30b1877d65be2057fbaf0bbd49b3 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 25 Jul 2018 15:31:51 +0200
Subject: mtd: rawnand: allocate model parameter dynamically

Thanks to the migration of all drivers to use nand_scan() and the
related nand_controller_ops, we can now allocate data during the
detection phase. Let's do it first for the NAND model parameter which
is allocated in nand_detect().

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c | 52 +++++++++++++++++++++++++++++++---------
 include/linux/mtd/rawnand.h      |  2 +-
 2 files changed, 42 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 34ea44f90fd8..00e80781124a 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -5225,8 +5225,11 @@ static int nand_flash_detect_onfi(struct nand_chip *chip)
 
 	sanitize_string(p->manufacturer, sizeof(p->manufacturer));
 	sanitize_string(p->model, sizeof(p->model));
-	strncpy(chip->parameters.model, p->model,
-		sizeof(chip->parameters.model) - 1);
+	chip->parameters.model = kstrdup(p->model, GFP_KERNEL);
+	if (!chip->parameters.model) {
+		ret = -ENOMEM;
+		goto free_onfi_param_page;
+	}
 
 	mtd->writesize = le32_to_cpu(p->byte_per_page);
 
@@ -5356,8 +5359,11 @@ static int nand_flash_detect_jedec(struct nand_chip *chip)
 
 	sanitize_string(p->manufacturer, sizeof(p->manufacturer));
 	sanitize_string(p->model, sizeof(p->model));
-	strncpy(chip->parameters.model, p->model,
-		sizeof(chip->parameters.model) - 1);
+	chip->parameters.model = kstrdup(p->model, GFP_KERNEL);
+	if (!chip->parameters.model) {
+		ret = -ENOMEM;
+		goto free_jedec_param_page;
+	}
 
 	mtd->writesize = le32_to_cpu(p->byte_per_page);
 
@@ -5546,8 +5552,9 @@ static bool find_full_id_nand(struct nand_chip *chip,
 		chip->onfi_timing_mode_default =
 					type->onfi_timing_mode_default;
 
-		strncpy(chip->parameters.model, type->name,
-			sizeof(chip->parameters.model) - 1);
+		chip->parameters.model = kstrdup(type->name, GFP_KERNEL);
+		if (!chip->parameters.model)
+			return false;
 
 		return true;
 	}
@@ -5706,8 +5713,9 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type)
 	if (!type->name)
 		return -ENODEV;
 
-	strncpy(chip->parameters.model, type->name,
-		sizeof(chip->parameters.model) - 1);
+	chip->parameters.model = kstrdup(type->name, GFP_KERNEL);
+	if (!chip->parameters.model)
+		return -ENOMEM;
 
 	chip->chipsize = (uint64_t)type->chipsize << 20;
 
@@ -5737,7 +5745,9 @@ ident_done:
 			mtd->name);
 		pr_warn("bus width %d instead of %d bits\n", busw ? 16 : 8,
 			(chip->options & NAND_BUSWIDTH_16) ? 16 : 8);
-		return -EINVAL;
+		ret = -EINVAL;
+
+		goto free_detect_allocation;
 	}
 
 	nand_decode_bbm_options(chip);
@@ -5774,6 +5784,11 @@ ident_done:
 		(int)(chip->chipsize >> 20), nand_is_slc(chip) ? "SLC" : "MLC",
 		mtd->erasesize >> 10, mtd->writesize, mtd->oobsize);
 	return 0;
+
+free_detect_allocation:
+	kfree(chip->parameters.model);
+
+	return ret;
 }
 
 static const char * const nand_ecc_modes[] = {
@@ -6013,6 +6028,11 @@ static int nand_scan_ident(struct mtd_info *mtd, int maxchips,
 	return 0;
 }
 
+static void nand_scan_ident_cleanup(struct nand_chip *chip)
+{
+	kfree(chip->parameters.model);
+}
+
 static int nand_set_ecc_soft_ops(struct mtd_info *mtd)
 {
 	struct nand_chip *chip = mtd_to_nand(mtd);
@@ -6760,11 +6780,18 @@ int nand_scan_with_ids(struct mtd_info *mtd, int maxchips,
 
 	ret = nand_attach(chip);
 	if (ret)
-		return ret;
+		goto cleanup_ident;
 
 	ret = nand_scan_tail(mtd);
 	if (ret)
-		nand_detach(chip);
+		goto detach_chip;
+
+	return 0;
+
+detach_chip:
+	nand_detach(chip);
+cleanup_ident:
+	nand_scan_ident_cleanup(chip);
 
 	return ret;
 }
@@ -6796,6 +6823,9 @@ void nand_cleanup(struct nand_chip *chip)
 
 	/* Free controller specific allocations after chip identification */
 	nand_detach(chip);
+
+	/* Free identification phase allocations */
+	nand_scan_ident_cleanup(chip);
 }
 
 EXPORT_SYMBOL_GPL(nand_cleanup);
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 71571ed23a20..099fa166569a 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -476,7 +476,7 @@ struct onfi_params {
  */
 struct nand_parameters {
 	/* Generic parameters */
-	char model[100];
+	const char *model;
 	bool supports_set_get_features;
 	DECLARE_BITMAP(set_feature_list, ONFI_FEATURE_NUMBER);
 	DECLARE_BITMAP(get_feature_list, ONFI_FEATURE_NUMBER);
-- 
cgit v1.2.3


From 11785ef53228d23ec386f5fe4a34601536f0c891 Mon Sep 17 00:00:00 2001
From: Jorge Sanjuan <jorge.sanjuan@codethink.co.uk>
Date: Tue, 31 Jul 2018 13:28:42 +0100
Subject: ALSA: usb-audio: Initial Power Domain support

Thee USB Audio Class 3 (UAC3) introduces Power Domains as a new
feature to let a host turn individual parts of an audio function
to different power states via USB requests. This lets the device
get to know a bit amore about what the host is up to in order to
optimize power consumption efficiently.

The Power Domains are optional for UAC3 configuration but all
UAC3 devices shall include at least one BADD configuration where
the support for Power Domains is compulsory.

This patch adds a set of features/helpers to parse these power
domains and change their status.

Signed-off-by: Jorge Sanjuan <jorge.sanjuan@codethink.co.uk>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/usb/audio-v3.h |   4 ++
 sound/usb/Makefile           |   1 +
 sound/usb/power.c            | 104 +++++++++++++++++++++++++++++++++++++++++++
 sound/usb/power.h            |  19 ++++++++
 4 files changed, 128 insertions(+)
 create mode 100644 sound/usb/power.c

(limited to 'include/linux')

diff --git a/include/linux/usb/audio-v3.h b/include/linux/usb/audio-v3.h
index 334bfa6dfb47..6b708434b7f9 100644
--- a/include/linux/usb/audio-v3.h
+++ b/include/linux/usb/audio-v3.h
@@ -447,4 +447,8 @@ struct uac3_interrupt_data_msg {
 /* BADD sample rate is always fixed to 48kHz */
 #define UAC3_BADD_SAMPLING_RATE				48000
 
+/* BADD power domains recovery times in 50us increments */
+#define UAC3_BADD_PD_RECOVER_D1D0			0x0258	/* 30ms */
+#define UAC3_BADD_PD_RECOVER_D2D0			0x1770	/* 300ms */
+
 #endif /* __LINUX_USB_AUDIO_V3_H */
diff --git a/sound/usb/Makefile b/sound/usb/Makefile
index 05440e2df8d9..d330f74c90e6 100644
--- a/sound/usb/Makefile
+++ b/sound/usb/Makefile
@@ -13,6 +13,7 @@ snd-usb-audio-objs := 	card.o \
 			mixer_scarlett.o \
 			mixer_us16x08.o \
 			pcm.o \
+			power.o \
 			proc.o \
 			quirks.o \
 			stream.o
diff --git a/sound/usb/power.c b/sound/usb/power.c
new file mode 100644
index 000000000000..bd303a1ba1b7
--- /dev/null
+++ b/sound/usb/power.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *   UAC3 Power Domain state management functions
+ */
+
+#include <linux/slab.h>
+#include <linux/usb.h>
+#include <linux/usb/audio.h>
+#include <linux/usb/audio-v2.h>
+#include <linux/usb/audio-v3.h>
+
+#include "usbaudio.h"
+#include "helper.h"
+#include "power.h"
+
+struct snd_usb_power_domain *
+snd_usb_find_power_domain(struct usb_host_interface *ctrl_iface,
+			  unsigned char id)
+{
+	struct snd_usb_power_domain *pd;
+	void *p;
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return NULL;
+
+	p = NULL;
+	while ((p = snd_usb_find_csint_desc(ctrl_iface->extra,
+					    ctrl_iface->extralen,
+					    p, UAC3_POWER_DOMAIN)) != NULL) {
+		struct uac3_power_domain_descriptor *pd_desc = p;
+		int i;
+
+		for (i = 0; i < pd_desc->bNrEntities; i++) {
+			if (pd_desc->baEntityID[i] == id) {
+				pd->pd_id = pd_desc->bPowerDomainID;
+				pd->pd_d1d0_rec =
+					le16_to_cpu(pd_desc->waRecoveryTime1);
+				pd->pd_d2d0_rec =
+					le16_to_cpu(pd_desc->waRecoveryTime2);
+				return pd;
+			}
+		}
+	}
+
+	kfree(pd);
+	return NULL;
+}
+
+int snd_usb_power_domain_set(struct snd_usb_audio *chip,
+			     struct snd_usb_power_domain *pd,
+			     unsigned char state)
+{
+	struct usb_device *dev = chip->dev;
+	unsigned char current_state;
+	int err, idx;
+
+	idx = snd_usb_ctrl_intf(chip) | (pd->pd_id << 8);
+
+	err = snd_usb_ctl_msg(chip->dev, usb_rcvctrlpipe(chip->dev, 0),
+			      UAC2_CS_CUR,
+			      USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_IN,
+			      UAC3_AC_POWER_DOMAIN_CONTROL << 8, idx,
+			      &current_state, sizeof(current_state));
+	if (err < 0) {
+		dev_err(&dev->dev, "Can't get UAC3 power state for id %d\n",
+			pd->pd_id);
+		return err;
+	}
+
+	if (current_state == state) {
+		dev_dbg(&dev->dev, "UAC3 power domain id %d already in state %d\n",
+			pd->pd_id, state);
+		return 0;
+	}
+
+	err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), UAC2_CS_CUR,
+			      USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_OUT,
+			      UAC3_AC_POWER_DOMAIN_CONTROL << 8, idx,
+			      &state, sizeof(state));
+	if (err < 0) {
+		dev_err(&dev->dev, "Can't set UAC3 power state to %d for id %d\n",
+			state, pd->pd_id);
+		return err;
+	}
+
+	if (state == UAC3_PD_STATE_D0) {
+		switch (current_state) {
+		case UAC3_PD_STATE_D2:
+			udelay(pd->pd_d2d0_rec * 50);
+			break;
+		case UAC3_PD_STATE_D1:
+			udelay(pd->pd_d1d0_rec * 50);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	dev_dbg(&dev->dev, "UAC3 power domain id %d change to state %d\n",
+		pd->pd_id, state);
+
+	return 0;
+}
diff --git a/sound/usb/power.h b/sound/usb/power.h
index b2e25f60c5a2..6004231a7c75 100644
--- a/sound/usb/power.h
+++ b/sound/usb/power.h
@@ -2,6 +2,25 @@
 #ifndef __USBAUDIO_POWER_H
 #define __USBAUDIO_POWER_H
 
+struct snd_usb_power_domain {
+	int pd_id;              /* UAC3 Power Domain ID */
+	int pd_d1d0_rec;        /* D1 to D0 recovery time */
+	int pd_d2d0_rec;        /* D2 to D0 recovery time */
+};
+
+enum {
+	UAC3_PD_STATE_D0,
+	UAC3_PD_STATE_D1,
+	UAC3_PD_STATE_D2,
+};
+
+int snd_usb_power_domain_set(struct snd_usb_audio *chip,
+			     struct snd_usb_power_domain *pd,
+			     unsigned char state);
+struct snd_usb_power_domain *
+snd_usb_find_power_domain(struct usb_host_interface *ctrl_iface,
+			  unsigned char id);
+
 #ifdef CONFIG_PM
 int snd_usb_autoresume(struct snd_usb_audio *chip);
 void snd_usb_autosuspend(struct snd_usb_audio *chip);
-- 
cgit v1.2.3


From 08fcf813281ebcf72c69487c1501ad91b7121cdb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 31 Jul 2018 09:10:26 -0600
Subject: t10-pi: provide empty t10_pi_complete() for !CONFIG_BLK_DEV_INTEGRITY

Fixes a link failure whtn BLK_DEV_INTEGRITY isn't defined.

Fixes: 10c41ddd6132 ("block: move dif_prepare/dif_complete functions to block layer")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/t10-pi.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index 5a427c289f58..b9626aa7e90c 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -51,8 +51,19 @@ extern const struct blk_integrity_profile t10_pi_type1_crc;
 extern const struct blk_integrity_profile t10_pi_type1_ip;
 extern const struct blk_integrity_profile t10_pi_type3_crc;
 extern const struct blk_integrity_profile t10_pi_type3_ip;
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
 extern void t10_pi_prepare(struct request *rq, u8 protection_type);
 extern void t10_pi_complete(struct request *rq, u8 protection_type,
 			    unsigned int intervals);
+#else
+static inline void t10_pi_complete(struct request *rq, u8 protection_type,
+				   unsigned int intervals)
+{
+}
+static inline void t10_pi_prepare(struct request *rq, u8 protection_type)
+{
+}
+#endif
 
 #endif
-- 
cgit v1.2.3


From c29c2ebd2ae0066c026045a21aa33ccbfcd8bb3c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 30 Jul 2018 20:43:51 -0700
Subject: net: update real_num_rx_queues even when !CONFIG_SYSFS

We used to depend on real_num_rx_queues as a upper bound for sanity
checks.  For AF_XDP socket validation it's useful if the check behaves
the same regardless of CONFIG_SYSFS setting.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9c917467a2c7..3bf7e93c9e96 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3431,8 +3431,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq);
 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq);
 #else
 static inline int netif_set_real_num_rx_queues(struct net_device *dev,
-						unsigned int rxq)
+						unsigned int rxqs)
 {
+	dev->real_num_rx_queues = rxqs;
 	return 0;
 }
 #endif
-- 
cgit v1.2.3


From 84c6b86875e01a08a0daa6fdd4a01b36bf0bf0b2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 30 Jul 2018 20:43:53 -0700
Subject: xsk: don't allow umem replace at stack level
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently drivers have to check if they already have a umem
installed for a given queue and return an error if so.  Make
better use of XDP_QUERY_XSK_UMEM and move this functionality
to the core.

We need to keep rtnl across the calls now.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Björn Töpel <bjorn.topel@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  7 ++++---
 net/xdp/xdp_umem.c        | 37 ++++++++++++++++++++++++++++---------
 2 files changed, 32 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3bf7e93c9e96..282e2e95ad5b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -872,10 +872,10 @@ struct netdev_bpf {
 		struct {
 			struct bpf_offloaded_map *offmap;
 		};
-		/* XDP_SETUP_XSK_UMEM */
+		/* XDP_QUERY_XSK_UMEM, XDP_SETUP_XSK_UMEM */
 		struct {
-			struct xdp_umem *umem;
-			u16 queue_id;
+			struct xdp_umem *umem; /* out for query*/
+			u16 queue_id; /* in for query */
 		} xsk;
 	};
 };
@@ -3568,6 +3568,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, u32 flags);
 u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op,
 		    enum bpf_netdev_command cmd);
+int xdp_umem_query(struct net_device *dev, u16 queue_id);
 
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index c199d66b5f3f..911ca6d3cb5a 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -11,6 +11,8 @@
 #include <linux/slab.h>
 #include <linux/bpf.h>
 #include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
 
 #include "xdp_umem.h"
 #include "xsk_queue.h"
@@ -40,6 +42,21 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
 	}
 }
 
+int xdp_umem_query(struct net_device *dev, u16 queue_id)
+{
+	struct netdev_bpf bpf;
+
+	ASSERT_RTNL();
+
+	memset(&bpf, 0, sizeof(bpf));
+	bpf.command = XDP_QUERY_XSK_UMEM;
+	bpf.xsk.queue_id = queue_id;
+
+	if (!dev->netdev_ops->ndo_bpf)
+		return 0;
+	return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem;
+}
+
 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 			u32 queue_id, u16 flags)
 {
@@ -62,28 +79,30 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 	bpf.command = XDP_QUERY_XSK_UMEM;
 
 	rtnl_lock();
-	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
-	rtnl_unlock();
-
-	if (err)
-		return force_zc ? -ENOTSUPP : 0;
+	err = xdp_umem_query(dev, queue_id);
+	if (err) {
+		err = err < 0 ? -ENOTSUPP : -EBUSY;
+		goto err_rtnl_unlock;
+	}
 
 	bpf.command = XDP_SETUP_XSK_UMEM;
 	bpf.xsk.umem = umem;
 	bpf.xsk.queue_id = queue_id;
 
-	rtnl_lock();
 	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
-	rtnl_unlock();
-
 	if (err)
-		return force_zc ? err : 0; /* fail or fallback */
+		goto err_rtnl_unlock;
+	rtnl_unlock();
 
 	dev_hold(dev);
 	umem->dev = dev;
 	umem->queue_id = queue_id;
 	umem->zc = true;
 	return 0;
+
+err_rtnl_unlock:
+	rtnl_unlock();
+	return force_zc ? err : 0; /* fail or fallback */
 }
 
 static void xdp_umem_clear_dev(struct xdp_umem *umem)
-- 
cgit v1.2.3


From e6476c21447c4b17c47e476aade6facf050f31e8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 30 Jul 2018 09:45:07 +0200
Subject: net: remove bogus RCU annotations on socket.wq

We never use RCU protection for it, just a lot of cargo-cult
rcu_deference_protects calls.

Note that we do keep the kfree_rcu call for it, as the references through
struct sock are RCU protected and thus might require a grace period before
freeing.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h |  2 +-
 include/net/sock.h  |  2 +-
 net/socket.c        | 10 ++++------
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index 6554d3ba4396..e0930678c8bf 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -114,7 +114,7 @@ struct socket {
 
 	unsigned long		flags;
 
-	struct socket_wq __rcu	*wq;
+	struct socket_wq	*wq;
 
 	struct file		*file;
 	struct sock		*sk;
diff --git a/include/net/sock.h b/include/net/sock.h
index 2afea5d1bdfe..433f45fc2d68 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1788,7 +1788,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 {
 	WARN_ON(parent->sk);
 	write_lock_bh(&sk->sk_callback_lock);
-	sk->sk_wq = parent->wq;
+	rcu_assign_pointer(sk->sk_wq, parent->wq);
 	parent->sk = sk;
 	sk_set_socket(sk, parent);
 	sk->sk_uid = SOCK_INODE(parent)->i_uid;
diff --git a/net/socket.c b/net/socket.c
index 5b7df6695f4f..475247e347ae 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -251,7 +251,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
 	init_waitqueue_head(&wq->wait);
 	wq->fasync_list = NULL;
 	wq->flags = 0;
-	RCU_INIT_POINTER(ei->socket.wq, wq);
+	ei->socket.wq = wq;
 
 	ei->socket.state = SS_UNCONNECTED;
 	ei->socket.flags = 0;
@@ -265,11 +265,9 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
 static void sock_destroy_inode(struct inode *inode)
 {
 	struct socket_alloc *ei;
-	struct socket_wq *wq;
 
 	ei = container_of(inode, struct socket_alloc, vfs_inode);
-	wq = rcu_dereference_protected(ei->socket.wq, 1);
-	kfree_rcu(wq, rcu);
+	kfree_rcu(ei->socket.wq, rcu);
 	kmem_cache_free(sock_inode_cachep, ei);
 }
 
@@ -603,7 +601,7 @@ static void __sock_release(struct socket *sock, struct inode *inode)
 		module_put(owner);
 	}
 
-	if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
+	if (sock->wq->fasync_list)
 		pr_err("%s: fasync list not empty!\n", __func__);
 
 	if (!sock->file) {
@@ -1181,7 +1179,7 @@ static int sock_fasync(int fd, struct file *filp, int on)
 		return -EINVAL;
 
 	lock_sock(sk);
-	wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk));
+	wq = sock->wq;
 	fasync_helper(fd, filp, on, &wq->fasync_list);
 
 	if (!wq->fasync_list)
-- 
cgit v1.2.3


From 546c596cf5491fda1516536e049c6a36836eb884 Mon Sep 17 00:00:00 2001
From: Shunyong Yang <shunyong.yang@hxt-semitech.com>
Date: Wed, 18 Jul 2018 09:40:35 +0800
Subject: PCI: Unify PCI and normal DMA direction definitions

Current DMA direction definitions in pci-dma-compat.h and dma-direction.h
are mirrored in value.  Unify them to enhance readability and avoid
possible inconsistency.

Signed-off-by: Shunyong Yang <shunyong.yang@hxt-semitech.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Joey Zheng <yu.zheng@hxt-semitech.com>
---
 include/linux/dma-direction.h  | 6 ++----
 include/linux/pci-dma-compat.h | 8 ++++----
 2 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-direction.h b/include/linux/dma-direction.h
index 3649a031893a..9c96e30e6a0b 100644
--- a/include/linux/dma-direction.h
+++ b/include/linux/dma-direction.h
@@ -1,14 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_DMA_DIRECTION_H
 #define _LINUX_DMA_DIRECTION_H
-/*
- * These definitions mirror those in pci.h, so they can be used
- * interchangeably with their PCI_ counterparts.
- */
+
 enum dma_data_direction {
 	DMA_BIDIRECTIONAL = 0,
 	DMA_TO_DEVICE = 1,
 	DMA_FROM_DEVICE = 2,
 	DMA_NONE = 3,
 };
+
 #endif
diff --git a/include/linux/pci-dma-compat.h b/include/linux/pci-dma-compat.h
index 0dd1a3f7b309..c3f1b44ade29 100644
--- a/include/linux/pci-dma-compat.h
+++ b/include/linux/pci-dma-compat.h
@@ -8,10 +8,10 @@
 #include <linux/dma-mapping.h>
 
 /* This defines the direction arg to the DMA mapping routines. */
-#define PCI_DMA_BIDIRECTIONAL	0
-#define PCI_DMA_TODEVICE	1
-#define PCI_DMA_FROMDEVICE	2
-#define PCI_DMA_NONE		3
+#define PCI_DMA_BIDIRECTIONAL	DMA_BIDIRECTIONAL
+#define PCI_DMA_TODEVICE	DMA_TO_DEVICE
+#define PCI_DMA_FROMDEVICE	DMA_FROM_DEVICE
+#define PCI_DMA_NONE		DMA_NONE
 
 static inline void *
 pci_alloc_consistent(struct pci_dev *hwdev, size_t size,
-- 
cgit v1.2.3


From bb276262e88dae52cc717bb636b7468f66bb234e Mon Sep 17 00:00:00 2001
From: Brian Norris <computersforpeace@gmail.com>
Date: Fri, 27 Jul 2018 11:33:13 -0700
Subject: mtd: spi-nor: only apply reset hacks to broken hardware

Commit 59b356ffd0b0 ("mtd: m25p80: restore the status of SPI flash when
exiting") is the latest from a long history of attempts to add reboot
handling to handle stateful addressing modes on SPI flash. Some prior
mostly-related discussions:

http://lists.infradead.org/pipermail/linux-mtd/2013-March/046343.html
[PATCH 1/3] mtd: m25p80: utilize dedicated 4-byte addressing commands

http://lists.infradead.org/pipermail/barebox/2014-September/020682.html
[RFC] MTD m25p80 3-byte addressing and boot problem

http://lists.infradead.org/pipermail/linux-mtd/2015-February/057683.html
[PATCH 2/2] m25p80: if supported put chip to deep power down if not used

Previously, attempts to add reboot-time software reset handling were
rejected, but the latest attempt was not.

Quick summary of the problem:
Some systems (e.g., boot ROM or bootloader) assume that they can read
initial boot code from their SPI flash using 3-byte addressing. If the
flash is left in 4-byte mode after reset, these systems won't boot. The
above patch provided a shutdown/remove hook to attempt to reset the
addressing mode before we reboot. Notably, this patch misses out on
huge classes of unexpected reboots (e.g., crashes, watchdog resets).

Unfortunately, it is essentially impossible to solve this problem 100%:
if your system doesn't know how to reset the SPI flash to power-on
defaults at initialization time, no amount of software can really rescue
you -- there will always be a chance of some unexpected reset that
leaves your flash in an addressing mode that your boot sequence didn't
expect.

While it is not directly harmful to perform hacks like the
aforementioned commit on all 4-byte addressing flash, a
properly-designed system should not need the hack -- and in fact,
providing this hack may mask the fact that a given system is indeed
broken. So this patch attempts to apply this unsound hack more narrowly,
providing a strong suggestion to developers and system designers that
this is truly a hack. With luck, system designers can catch their errors
early on in their development cycle, rather than applying this hack long
term. But apparently enough systems are out in the wild that we still
have to provide this hack.

Document a new device tree property to denote systems that do not have a
proper hardware (or software) reset mechanism, and apply the hack (with
a loud warning) only in this case.

Signed-off-by: Brian Norris <computersforpeace@gmail.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 .../devicetree/bindings/mtd/jedec,spi-nor.txt          |  9 +++++++++
 drivers/mtd/spi-nor/spi-nor.c                          | 18 ++++++++++++++++--
 include/linux/mtd/spi-nor.h                            |  1 +
 3 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
index 956bb046e599..f03be904d3c2 100644
--- a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
+++ b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
@@ -69,6 +69,15 @@ Optional properties:
                    all chips and support for it can not be detected at runtime.
                    Refer to your chips' datasheet to check if this is supported
                    by your chip.
+- broken-flash-reset : Some flash devices utilize stateful addressing modes
+		   (e.g., for 32-bit addressing) which need to be managed
+		   carefully by a system. Because these sorts of flash don't
+		   have a standardized software reset command, and because some
+		   systems don't toggle the flash RESET# pin upon system reset
+		   (if the pin even exists at all), there are systems which
+		   cannot reboot properly if the flash is left in the "wrong"
+		   state. This boolean flag can be used on such systems, to
+		   denote the absence of a reliable reset mechanism.
 
 Example:
 
diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c
index d9c368c44194..f028277fb1ce 100644
--- a/drivers/mtd/spi-nor/spi-nor.c
+++ b/drivers/mtd/spi-nor/spi-nor.c
@@ -2757,8 +2757,18 @@ static int spi_nor_init(struct spi_nor *nor)
 
 	if ((nor->addr_width == 4) &&
 	    (JEDEC_MFR(nor->info) != SNOR_MFR_SPANSION) &&
-	    !(nor->info->flags & SPI_NOR_4B_OPCODES))
+	    !(nor->info->flags & SPI_NOR_4B_OPCODES)) {
+		/*
+		 * If the RESET# pin isn't hooked up properly, or the system
+		 * otherwise doesn't perform a reset command in the boot
+		 * sequence, it's impossible to 100% protect against unexpected
+		 * reboots (e.g., crashes). Warn the user (or hopefully, system
+		 * designer) that this is bad.
+		 */
+		WARN_ONCE(nor->flags & SNOR_F_BROKEN_RESET,
+			  "enabling reset hack; may not recover from unexpected reboots\n");
 		set_4byte(nor, nor->info, 1);
+	}
 
 	return 0;
 }
@@ -2781,7 +2791,8 @@ void spi_nor_restore(struct spi_nor *nor)
 	/* restore the addressing mode */
 	if ((nor->addr_width == 4) &&
 	    (JEDEC_MFR(nor->info) != SNOR_MFR_SPANSION) &&
-	    !(nor->info->flags & SPI_NOR_4B_OPCODES))
+	    !(nor->info->flags & SPI_NOR_4B_OPCODES) &&
+	    (nor->flags & SNOR_F_BROKEN_RESET))
 		set_4byte(nor, nor->info, 0);
 }
 EXPORT_SYMBOL_GPL(spi_nor_restore);
@@ -2911,6 +2922,9 @@ int spi_nor_scan(struct spi_nor *nor, const char *name,
 		params.hwcaps.mask |= SNOR_HWCAPS_READ_FAST;
 	}
 
+	if (of_property_read_bool(np, "broken-flash-reset"))
+		nor->flags |= SNOR_F_BROKEN_RESET;
+
 	/* Some devices cannot do fast-read, no matter what DT tells us */
 	if (info->flags & SPI_NOR_NO_FR)
 		params.hwcaps.mask &= ~SNOR_HWCAPS_READ_FAST;
diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index e60da0d34cc1..c922e97f205a 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -235,6 +235,7 @@ enum spi_nor_option_flags {
 	SNOR_F_S3AN_ADDR_DEFAULT = BIT(3),
 	SNOR_F_READY_XSR_RDY	= BIT(4),
 	SNOR_F_USE_CLSR		= BIT(5),
+	SNOR_F_BROKEN_RESET	= BIT(6),
 };
 
 /**
-- 
cgit v1.2.3


From 3d3fe3c05d5a17ebdf55b936c51017c127c0ed44 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 25 Jul 2018 15:31:52 +0200
Subject: mtd: rawnand: allocate dynamically ONFI parameters during detection

Now that it is possible to do dynamic allocations during the
identification phase, convert the onfi_params structure (which is only
needed with ONFI compliant chips) into a pointer that will be allocated
only if needed.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 drivers/mtd/nand/raw/nand_base.c    | 54 +++++++++++++++++++++++--------------
 drivers/mtd/nand/raw/nand_micron.c  |  7 ++---
 drivers/mtd/nand/raw/nand_timings.c | 12 ++++-----
 include/linux/mtd/rawnand.h         |  6 ++---
 4 files changed, 47 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 00e80781124a..d527e448ce19 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -5151,6 +5151,8 @@ static int nand_flash_detect_onfi(struct nand_chip *chip)
 {
 	struct mtd_info *mtd = nand_to_mtd(chip);
 	struct nand_onfi_params *p;
+	struct onfi_params *onfi;
+	int onfi_version = 0;
 	char id[4];
 	int i, ret, val;
 
@@ -5206,21 +5208,19 @@ static int nand_flash_detect_onfi(struct nand_chip *chip)
 	/* Check version */
 	val = le16_to_cpu(p->revision);
 	if (val & ONFI_VERSION_2_3)
-		chip->parameters.onfi.version = 23;
+		onfi_version = 23;
 	else if (val & ONFI_VERSION_2_2)
-		chip->parameters.onfi.version = 22;
+		onfi_version = 22;
 	else if (val & ONFI_VERSION_2_1)
-		chip->parameters.onfi.version = 21;
+		onfi_version = 21;
 	else if (val & ONFI_VERSION_2_0)
-		chip->parameters.onfi.version = 20;
+		onfi_version = 20;
 	else if (val & ONFI_VERSION_1_0)
-		chip->parameters.onfi.version = 10;
+		onfi_version = 10;
 
-	if (!chip->parameters.onfi.version) {
+	if (!onfi_version) {
 		pr_info("unsupported ONFI version: %d\n", val);
 		goto free_onfi_param_page;
-	} else {
-		ret = 1;
 	}
 
 	sanitize_string(p->manufacturer, sizeof(p->manufacturer));
@@ -5257,7 +5257,7 @@ static int nand_flash_detect_onfi(struct nand_chip *chip)
 	if (p->ecc_bits != 0xff) {
 		chip->ecc_strength_ds = p->ecc_bits;
 		chip->ecc_step_ds = 512;
-	} else if (chip->parameters.onfi.version >= 21 &&
+	} else if (onfi_version >= 21 &&
 		(le16_to_cpu(p->features) & ONFI_FEATURE_EXT_PARAM_PAGE)) {
 
 		/*
@@ -5284,19 +5284,33 @@ static int nand_flash_detect_onfi(struct nand_chip *chip)
 		bitmap_set(chip->parameters.set_feature_list,
 			   ONFI_FEATURE_ADDR_TIMING_MODE, 1);
 	}
-	chip->parameters.onfi.tPROG = le16_to_cpu(p->t_prog);
-	chip->parameters.onfi.tBERS = le16_to_cpu(p->t_bers);
-	chip->parameters.onfi.tR = le16_to_cpu(p->t_r);
-	chip->parameters.onfi.tCCS = le16_to_cpu(p->t_ccs);
-	chip->parameters.onfi.async_timing_mode =
-		le16_to_cpu(p->async_timing_mode);
-	chip->parameters.onfi.vendor_revision =
-		le16_to_cpu(p->vendor_revision);
-	memcpy(chip->parameters.onfi.vendor, p->vendor,
-	       sizeof(p->vendor));
 
+	onfi = kzalloc(sizeof(*onfi), GFP_KERNEL);
+	if (!onfi) {
+		ret = -ENOMEM;
+		goto free_model;
+	}
+
+	onfi->version = onfi_version;
+	onfi->tPROG = le16_to_cpu(p->t_prog);
+	onfi->tBERS = le16_to_cpu(p->t_bers);
+	onfi->tR = le16_to_cpu(p->t_r);
+	onfi->tCCS = le16_to_cpu(p->t_ccs);
+	onfi->async_timing_mode = le16_to_cpu(p->async_timing_mode);
+	onfi->vendor_revision = le16_to_cpu(p->vendor_revision);
+	memcpy(onfi->vendor, p->vendor, sizeof(p->vendor));
+	chip->parameters.onfi = onfi;
+
+	/* Identification done, free the full ONFI parameter page and exit */
+	kfree(p);
+
+	return 1;
+
+free_model:
+	kfree(chip->parameters.model);
 free_onfi_param_page:
 	kfree(p);
+
 	return ret;
 }
 
@@ -5693,7 +5707,6 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type)
 		}
 	}
 
-	chip->parameters.onfi.version = 0;
 	if (!type->name || !type->pagesize) {
 		/* Check if the chip is ONFI compliant */
 		ret = nand_flash_detect_onfi(chip);
@@ -6031,6 +6044,7 @@ static int nand_scan_ident(struct mtd_info *mtd, int maxchips,
 static void nand_scan_ident_cleanup(struct nand_chip *chip)
 {
 	kfree(chip->parameters.model);
+	kfree(chip->parameters.onfi);
 }
 
 static int nand_set_ecc_soft_ops(struct mtd_info *mtd)
diff --git a/drivers/mtd/nand/raw/nand_micron.c b/drivers/mtd/nand/raw/nand_micron.c
index 656947d91841..f5dc0a7a2456 100644
--- a/drivers/mtd/nand/raw/nand_micron.c
+++ b/drivers/mtd/nand/raw/nand_micron.c
@@ -88,9 +88,10 @@ static int micron_nand_setup_read_retry(struct mtd_info *mtd, int retry_mode)
 static int micron_nand_onfi_init(struct nand_chip *chip)
 {
 	struct nand_parameters *p = &chip->parameters;
-	struct nand_onfi_vendor_micron *micron = (void *)p->onfi.vendor;
 
-	if (chip->parameters.onfi.version && p->onfi.vendor_revision) {
+	if (p->onfi) {
+		struct nand_onfi_vendor_micron *micron = (void *)p->onfi->vendor;
+
 		chip->read_retries = micron->read_retry_options;
 		chip->setup_read_retry = micron_nand_setup_read_retry;
 	}
@@ -382,7 +383,7 @@ static int micron_supports_on_die_ecc(struct nand_chip *chip)
 	u8 id[5];
 	int ret;
 
-	if (!chip->parameters.onfi.version)
+	if (!chip->parameters.onfi)
 		return MICRON_ON_DIE_UNSUPPORTED;
 
 	if (chip->bits_per_cell != 1)
diff --git a/drivers/mtd/nand/raw/nand_timings.c b/drivers/mtd/nand/raw/nand_timings.c
index 9bb599106a31..ebc7b5f76f77 100644
--- a/drivers/mtd/nand/raw/nand_timings.c
+++ b/drivers/mtd/nand/raw/nand_timings.c
@@ -294,6 +294,7 @@ int onfi_fill_data_interface(struct nand_chip *chip,
 			     int timing_mode)
 {
 	struct nand_data_interface *iface = &chip->data_interface;
+	struct onfi_params *onfi = chip->parameters.onfi;
 
 	if (type != NAND_SDR_IFACE)
 		return -EINVAL;
@@ -308,17 +309,16 @@ int onfi_fill_data_interface(struct nand_chip *chip,
 	 * tPROG, tBERS, tR and tCCS.
 	 * These information are part of the ONFI parameter page.
 	 */
-	if (chip->parameters.onfi.version) {
-		struct nand_parameters *params = &chip->parameters;
+	if (onfi) {
 		struct nand_sdr_timings *timings = &iface->timings.sdr;
 
 		/* microseconds -> picoseconds */
-		timings->tPROG_max = 1000000ULL * params->onfi.tPROG;
-		timings->tBERS_max = 1000000ULL * params->onfi.tBERS;
-		timings->tR_max = 1000000ULL * params->onfi.tR;
+		timings->tPROG_max = 1000000ULL * onfi->tPROG;
+		timings->tBERS_max = 1000000ULL * onfi->tBERS;
+		timings->tR_max = 1000000ULL * onfi->tR;
 
 		/* nanoseconds -> picoseconds */
-		timings->tCCS_min = 1000UL * params->onfi.tCCS;
+		timings->tCCS_min = 1000UL * onfi->tCCS;
 	} else {
 		struct nand_sdr_timings *timings = &iface->timings.sdr;
 		/*
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 099fa166569a..efb2345359bb 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -482,7 +482,7 @@ struct nand_parameters {
 	DECLARE_BITMAP(get_feature_list, ONFI_FEATURE_NUMBER);
 
 	/* ONFI parameters */
-	struct onfi_params onfi;
+	struct onfi_params *onfi;
 };
 
 /* The maximum expected count of bytes in the NAND ID sequence */
@@ -1618,10 +1618,10 @@ struct platform_nand_data {
 /* return the supported asynchronous timing mode. */
 static inline int onfi_get_async_timing_mode(struct nand_chip *chip)
 {
-	if (!chip->parameters.onfi.version)
+	if (!chip->parameters.onfi)
 		return ONFI_TIMING_MODE_UNKNOWN;
 
-	return chip->parameters.onfi.async_timing_mode;
+	return chip->parameters.onfi->async_timing_mode;
 }
 
 int onfi_fill_data_interface(struct nand_chip *chip,
-- 
cgit v1.2.3


From 304d34360b099020a12af2abb7e1ac506f4ba16d Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Sat, 28 Jul 2018 10:19:13 +0200
Subject: spi: add flags parameter to txrx_word function pointers

Add the capability to specify the flag parameter used in
bitbang_txrx_be_cpha{0,1} through the txrx_word function pointers of
spi_bitbang data structure. That feature will be used to add spi-3wire
support to the spi-gpio controller

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-ath79.c         |  2 +-
 drivers/spi/spi-bitbang.c       | 34 +++++++++++++++++++++-------------
 drivers/spi/spi-butterfly.c     |  4 ++--
 drivers/spi/spi-gpio.c          | 32 ++++++++++++++++----------------
 drivers/spi/spi-lm70llp.c       |  5 +++--
 drivers/spi/spi-sh-sci.c        | 20 ++++++++++++--------
 drivers/spi/spi-xtensa-xtfpga.c |  2 +-
 include/linux/spi/spi_bitbang.h |  2 +-
 8 files changed, 57 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-ath79.c b/drivers/spi/spi-ath79.c
index 0719bd484891..3f6b657394de 100644
--- a/drivers/spi/spi-ath79.c
+++ b/drivers/spi/spi-ath79.c
@@ -176,7 +176,7 @@ static void ath79_spi_cleanup(struct spi_device *spi)
 }
 
 static u32 ath79_spi_txrx_mode0(struct spi_device *spi, unsigned int nsecs,
-			       u32 word, u8 bits)
+			       u32 word, u8 bits, unsigned flags)
 {
 	struct ath79_spi *sp = ath79_spidev_to_sp(spi);
 	u32 ioc = sp->ioc_base;
diff --git a/drivers/spi/spi-bitbang.c b/drivers/spi/spi-bitbang.c
index 3aa9e6e3dac8..76f1b534bdd7 100644
--- a/drivers/spi/spi-bitbang.c
+++ b/drivers/spi/spi-bitbang.c
@@ -49,22 +49,26 @@
 struct spi_bitbang_cs {
 	unsigned	nsecs;	/* (clock cycle time)/2 */
 	u32		(*txrx_word)(struct spi_device *spi, unsigned nsecs,
-					u32 word, u8 bits);
+					u32 word, u8 bits, unsigned flags);
 	unsigned	(*txrx_bufs)(struct spi_device *,
 					u32 (*txrx_word)(
 						struct spi_device *spi,
 						unsigned nsecs,
-						u32 word, u8 bits),
-					unsigned, struct spi_transfer *);
+						u32 word, u8 bits,
+						unsigned flags),
+					unsigned, struct spi_transfer *,
+					unsigned);
 };
 
 static unsigned bitbang_txrx_8(
 	struct spi_device	*spi,
 	u32			(*txrx_word)(struct spi_device *spi,
 					unsigned nsecs,
-					u32 word, u8 bits),
+					u32 word, u8 bits,
+					unsigned flags),
 	unsigned		ns,
-	struct spi_transfer	*t
+	struct spi_transfer	*t,
+	unsigned flags
 ) {
 	unsigned		bits = t->bits_per_word;
 	unsigned		count = t->len;
@@ -76,7 +80,7 @@ static unsigned bitbang_txrx_8(
 
 		if (tx)
 			word = *tx++;
-		word = txrx_word(spi, ns, word, bits);
+		word = txrx_word(spi, ns, word, bits, flags);
 		if (rx)
 			*rx++ = word;
 		count -= 1;
@@ -88,9 +92,11 @@ static unsigned bitbang_txrx_16(
 	struct spi_device	*spi,
 	u32			(*txrx_word)(struct spi_device *spi,
 					unsigned nsecs,
-					u32 word, u8 bits),
+					u32 word, u8 bits,
+					unsigned flags),
 	unsigned		ns,
-	struct spi_transfer	*t
+	struct spi_transfer	*t,
+	unsigned flags
 ) {
 	unsigned		bits = t->bits_per_word;
 	unsigned		count = t->len;
@@ -102,7 +108,7 @@ static unsigned bitbang_txrx_16(
 
 		if (tx)
 			word = *tx++;
-		word = txrx_word(spi, ns, word, bits);
+		word = txrx_word(spi, ns, word, bits, flags);
 		if (rx)
 			*rx++ = word;
 		count -= 2;
@@ -114,9 +120,11 @@ static unsigned bitbang_txrx_32(
 	struct spi_device	*spi,
 	u32			(*txrx_word)(struct spi_device *spi,
 					unsigned nsecs,
-					u32 word, u8 bits),
+					u32 word, u8 bits,
+					unsigned flags),
 	unsigned		ns,
-	struct spi_transfer	*t
+	struct spi_transfer	*t,
+	unsigned flags
 ) {
 	unsigned		bits = t->bits_per_word;
 	unsigned		count = t->len;
@@ -128,7 +136,7 @@ static unsigned bitbang_txrx_32(
 
 		if (tx)
 			word = *tx++;
-		word = txrx_word(spi, ns, word, bits);
+		word = txrx_word(spi, ns, word, bits, flags);
 		if (rx)
 			*rx++ = word;
 		count -= 4;
@@ -236,7 +244,7 @@ static int spi_bitbang_bufs(struct spi_device *spi, struct spi_transfer *t)
 	struct spi_bitbang_cs	*cs = spi->controller_state;
 	unsigned		nsecs = cs->nsecs;
 
-	return cs->txrx_bufs(spi, cs->txrx_word, nsecs, t);
+	return cs->txrx_bufs(spi, cs->txrx_word, nsecs, t, 0);
 }
 
 /*----------------------------------------------------------------------*/
diff --git a/drivers/spi/spi-butterfly.c b/drivers/spi/spi-butterfly.c
index 22a31e4a1a11..1a3510215841 100644
--- a/drivers/spi/spi-butterfly.c
+++ b/drivers/spi/spi-butterfly.c
@@ -144,9 +144,9 @@ static void butterfly_chipselect(struct spi_device *spi, int value)
 
 static u32
 butterfly_txrx_word_mode0(struct spi_device *spi, unsigned nsecs, u32 word,
-			  u8 bits)
+			  u8 bits, unsigned flags)
 {
-	return bitbang_txrx_be_cpha0(spi, nsecs, 0, 0, word, bits);
+	return bitbang_txrx_be_cpha0(spi, nsecs, 0, flags, word, bits);
 }
 
 /*----------------------------------------------------------------------*/
diff --git a/drivers/spi/spi-gpio.c b/drivers/spi/spi-gpio.c
index 6ae92d4dca19..be68298cbd9c 100644
--- a/drivers/spi/spi-gpio.c
+++ b/drivers/spi/spi-gpio.c
@@ -149,27 +149,27 @@ static inline int getmiso(const struct spi_device *spi)
  */
 
 static u32 spi_gpio_txrx_word_mode0(struct spi_device *spi,
-		unsigned nsecs, u32 word, u8 bits)
+		unsigned nsecs, u32 word, u8 bits, unsigned flags)
 {
-	return bitbang_txrx_be_cpha0(spi, nsecs, 0, 0, word, bits);
+	return bitbang_txrx_be_cpha0(spi, nsecs, 0, flags, word, bits);
 }
 
 static u32 spi_gpio_txrx_word_mode1(struct spi_device *spi,
-		unsigned nsecs, u32 word, u8 bits)
+		unsigned nsecs, u32 word, u8 bits, unsigned flags)
 {
-	return bitbang_txrx_be_cpha1(spi, nsecs, 0, 0, word, bits);
+	return bitbang_txrx_be_cpha1(spi, nsecs, 0, flags, word, bits);
 }
 
 static u32 spi_gpio_txrx_word_mode2(struct spi_device *spi,
-		unsigned nsecs, u32 word, u8 bits)
+		unsigned nsecs, u32 word, u8 bits, unsigned flags)
 {
-	return bitbang_txrx_be_cpha0(spi, nsecs, 1, 0, word, bits);
+	return bitbang_txrx_be_cpha0(spi, nsecs, 1, flags, word, bits);
 }
 
 static u32 spi_gpio_txrx_word_mode3(struct spi_device *spi,
-		unsigned nsecs, u32 word, u8 bits)
+		unsigned nsecs, u32 word, u8 bits, unsigned flags)
 {
-	return bitbang_txrx_be_cpha1(spi, nsecs, 1, 0, word, bits);
+	return bitbang_txrx_be_cpha1(spi, nsecs, 1, flags, word, bits);
 }
 
 /*
@@ -183,30 +183,30 @@ static u32 spi_gpio_txrx_word_mode3(struct spi_device *spi,
  */
 
 static u32 spi_gpio_spec_txrx_word_mode0(struct spi_device *spi,
-		unsigned nsecs, u32 word, u8 bits)
+		unsigned nsecs, u32 word, u8 bits, unsigned flags)
 {
-	unsigned flags = spi->master->flags;
+	flags = spi->master->flags;
 	return bitbang_txrx_be_cpha0(spi, nsecs, 0, flags, word, bits);
 }
 
 static u32 spi_gpio_spec_txrx_word_mode1(struct spi_device *spi,
-		unsigned nsecs, u32 word, u8 bits)
+		unsigned nsecs, u32 word, u8 bits, unsigned flags)
 {
-	unsigned flags = spi->master->flags;
+	flags = spi->master->flags;
 	return bitbang_txrx_be_cpha1(spi, nsecs, 0, flags, word, bits);
 }
 
 static u32 spi_gpio_spec_txrx_word_mode2(struct spi_device *spi,
-		unsigned nsecs, u32 word, u8 bits)
+		unsigned nsecs, u32 word, u8 bits, unsigned flags)
 {
-	unsigned flags = spi->master->flags;
+	flags = spi->master->flags;
 	return bitbang_txrx_be_cpha0(spi, nsecs, 1, flags, word, bits);
 }
 
 static u32 spi_gpio_spec_txrx_word_mode3(struct spi_device *spi,
-		unsigned nsecs, u32 word, u8 bits)
+		unsigned nsecs, u32 word, u8 bits, unsigned flags)
 {
-	unsigned flags = spi->master->flags;
+	flags = spi->master->flags;
 	return bitbang_txrx_be_cpha1(spi, nsecs, 1, flags, word, bits);
 }
 
diff --git a/drivers/spi/spi-lm70llp.c b/drivers/spi/spi-lm70llp.c
index 61ee0f4269ae..4549efd792da 100644
--- a/drivers/spi/spi-lm70llp.c
+++ b/drivers/spi/spi-lm70llp.c
@@ -188,9 +188,10 @@ static void lm70_chipselect(struct spi_device *spi, int value)
 /*
  * Our actual bitbanger routine.
  */
-static u32 lm70_txrx(struct spi_device *spi, unsigned nsecs, u32 word, u8 bits)
+static u32 lm70_txrx(struct spi_device *spi, unsigned nsecs, u32 word, u8 bits,
+		     unsigned flags)
 {
-	return bitbang_txrx_be_cpha0(spi, nsecs, 0, 0, word, bits);
+	return bitbang_txrx_be_cpha0(spi, nsecs, 0, flags, word, bits);
 }
 
 static void spi_lm70llp_attach(struct parport *p)
diff --git a/drivers/spi/spi-sh-sci.c b/drivers/spi/spi-sh-sci.c
index a9beeeed812c..393701cfca3c 100644
--- a/drivers/spi/spi-sh-sci.c
+++ b/drivers/spi/spi-sh-sci.c
@@ -80,27 +80,31 @@ static inline u32 getmiso(struct spi_device *dev)
 #include "spi-bitbang-txrx.h"
 
 static u32 sh_sci_spi_txrx_mode0(struct spi_device *spi,
-				      unsigned nsecs, u32 word, u8 bits)
+				 unsigned nsecs, u32 word, u8 bits,
+				 unsigned flags)
 {
-	return bitbang_txrx_be_cpha0(spi, nsecs, 0, 0, word, bits);
+	return bitbang_txrx_be_cpha0(spi, nsecs, 0, flags, word, bits);
 }
 
 static u32 sh_sci_spi_txrx_mode1(struct spi_device *spi,
-				      unsigned nsecs, u32 word, u8 bits)
+				 unsigned nsecs, u32 word, u8 bits,
+				 unsigned flags)
 {
-	return bitbang_txrx_be_cpha1(spi, nsecs, 0, 0, word, bits);
+	return bitbang_txrx_be_cpha1(spi, nsecs, 0, flags, word, bits);
 }
 
 static u32 sh_sci_spi_txrx_mode2(struct spi_device *spi,
-				      unsigned nsecs, u32 word, u8 bits)
+				 unsigned nsecs, u32 word, u8 bits,
+				 unsigned flags)
 {
-	return bitbang_txrx_be_cpha0(spi, nsecs, 1, 0, word, bits);
+	return bitbang_txrx_be_cpha0(spi, nsecs, 1, flags, word, bits);
 }
 
 static u32 sh_sci_spi_txrx_mode3(struct spi_device *spi,
-				      unsigned nsecs, u32 word, u8 bits)
+				 unsigned nsecs, u32 word, u8 bits,
+				 unsigned flags)
 {
-	return bitbang_txrx_be_cpha1(spi, nsecs, 1, 0, word, bits);
+	return bitbang_txrx_be_cpha1(spi, nsecs, 1, flags, word, bits);
 }
 
 static void sh_sci_spi_chipselect(struct spi_device *dev, int value)
diff --git a/drivers/spi/spi-xtensa-xtfpga.c b/drivers/spi/spi-xtensa-xtfpga.c
index be6155cba9de..8ce04f829a80 100644
--- a/drivers/spi/spi-xtensa-xtfpga.c
+++ b/drivers/spi/spi-xtensa-xtfpga.c
@@ -54,7 +54,7 @@ static inline void xtfpga_spi_wait_busy(struct xtfpga_spi *xspi)
 }
 
 static u32 xtfpga_spi_txrx_word(struct spi_device *spi, unsigned nsecs,
-				u32 v, u8 bits)
+				u32 v, u8 bits, unsigned flags)
 {
 	struct xtfpga_spi *xspi = spi_master_get_devdata(spi->master);
 
diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h
index c1e61641a7f1..5847f00ef9cf 100644
--- a/include/linux/spi/spi_bitbang.h
+++ b/include/linux/spi/spi_bitbang.h
@@ -30,7 +30,7 @@ struct spi_bitbang {
 	/* txrx_word[SPI_MODE_*]() just looks like a shift register */
 	u32	(*txrx_word[4])(struct spi_device *spi,
 			unsigned nsecs,
-			u32 word, u8 bits);
+			u32 word, u8 bits, unsigned flags);
 };
 
 /* you can call these default bitbang->master methods from your custom
-- 
cgit v1.2.3


From 4b859db2c60692560afbfef1b030d0ddef57b7ee Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Sat, 28 Jul 2018 10:19:14 +0200
Subject: spi: spi-gpio: add SPI_3WIRE support

Add SPI_3WIRE support to spi-gpio controller introducing
set_line_direction function pointer in spi_bitbang data structure.
Spi-gpio controller has been tested using hts221 temp/rh iio sensor
running in 3wire mode and lsm6dsm running in 4wire mode

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-bitbang.c       | 16 ++++++++++++++++
 drivers/spi/spi-gpio.c          | 17 ++++++++++++++++-
 include/linux/spi/spi_bitbang.h |  1 +
 3 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-bitbang.c b/drivers/spi/spi-bitbang.c
index 76f1b534bdd7..f29176000b8d 100644
--- a/drivers/spi/spi-bitbang.c
+++ b/drivers/spi/spi-bitbang.c
@@ -243,7 +243,23 @@ static int spi_bitbang_bufs(struct spi_device *spi, struct spi_transfer *t)
 {
 	struct spi_bitbang_cs	*cs = spi->controller_state;
 	unsigned		nsecs = cs->nsecs;
+	struct spi_bitbang	*bitbang;
+
+	bitbang = spi_master_get_devdata(spi->master);
+	if (bitbang->set_line_direction) {
+		int err;
 
+		err = bitbang->set_line_direction(spi, !!(t->tx_buf));
+		if (err < 0)
+			return err;
+	}
+
+	if (spi->mode & SPI_3WIRE) {
+		unsigned flags;
+
+		flags = t->tx_buf ? SPI_MASTER_NO_RX : SPI_MASTER_NO_TX;
+		return cs->txrx_bufs(spi, cs->txrx_word, nsecs, t, flags);
+	}
 	return cs->txrx_bufs(spi, cs->txrx_word, nsecs, t, 0);
 }
 
diff --git a/drivers/spi/spi-gpio.c b/drivers/spi/spi-gpio.c
index be68298cbd9c..0626e6e3ea0c 100644
--- a/drivers/spi/spi-gpio.c
+++ b/drivers/spi/spi-gpio.c
@@ -121,7 +121,10 @@ static inline int getmiso(const struct spi_device *spi)
 {
 	struct spi_gpio *spi_gpio = spi_to_spi_gpio(spi);
 
-	return !!gpiod_get_value_cansleep(spi_gpio->miso);
+	if (spi->mode & SPI_3WIRE)
+		return !!gpiod_get_value_cansleep(spi_gpio->mosi);
+	else
+		return !!gpiod_get_value_cansleep(spi_gpio->miso);
 }
 
 /*
@@ -250,6 +253,16 @@ static int spi_gpio_setup(struct spi_device *spi)
 	return status;
 }
 
+static int spi_gpio_set_direction(struct spi_device *spi, bool output)
+{
+	struct spi_gpio *spi_gpio = spi_to_spi_gpio(spi);
+
+	if (output)
+		return gpiod_direction_output(spi_gpio->mosi, 1);
+	else
+		return gpiod_direction_input(spi_gpio->mosi);
+}
+
 static void spi_gpio_cleanup(struct spi_device *spi)
 {
 	spi_bitbang_cleanup(spi);
@@ -395,6 +408,7 @@ static int spi_gpio_probe(struct platform_device *pdev)
 		return status;
 
 	master->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 32);
+	master->mode_bits = SPI_3WIRE | SPI_CPHA | SPI_CPOL;
 	master->flags = master_flags;
 	master->bus_num = pdev->id;
 	/* The master needs to think there is a chipselect even if not connected */
@@ -407,6 +421,7 @@ static int spi_gpio_probe(struct platform_device *pdev)
 
 	spi_gpio->bitbang.master = master;
 	spi_gpio->bitbang.chipselect = spi_gpio_chipselect;
+	spi_gpio->bitbang.set_line_direction = spi_gpio_set_direction;
 
 	if ((master_flags & (SPI_MASTER_NO_TX | SPI_MASTER_NO_RX)) == 0) {
 		spi_gpio->bitbang.txrx_word[SPI_MODE_0] = spi_gpio_txrx_word_mode0;
diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h
index 5847f00ef9cf..b7e021b274dc 100644
--- a/include/linux/spi/spi_bitbang.h
+++ b/include/linux/spi/spi_bitbang.h
@@ -31,6 +31,7 @@ struct spi_bitbang {
 	u32	(*txrx_word[4])(struct spi_device *spi,
 			unsigned nsecs,
 			u32 word, u8 bits, unsigned flags);
+	int	(*set_line_direction)(struct spi_device *spi, bool output);
 };
 
 /* you can call these default bitbang->master methods from your custom
-- 
cgit v1.2.3


From ba113c3aa79a7f941ac162d05a3620bdc985c58d Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Tue, 31 Jul 2018 17:46:21 -0700
Subject: tcp: add data bytes sent stats

Introduce a new TCP stat to record the number of bytes sent
(RFC4898 tcpEStatsPerfHCDataOctetsOut) and expose it in both tcp_info
(TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS).

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 3 +++
 include/uapi/linux/tcp.h | 4 +++-
 net/ipv4/tcp.c           | 6 ++++++
 net/ipv4/tcp_output.c    | 1 +
 4 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 58a8d7d71354..d0798dcd2cab 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -181,6 +181,9 @@ struct tcp_sock {
 	u32	data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
 				 * total number of data segments sent.
 				 */
+	u64	bytes_sent;	/* RFC4898 tcpEStatsPerfHCDataOctetsOut
+				 * total number of data bytes sent.
+				 */
 	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index e3f6ed8a7064..1c70ed287c3b 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -235,6 +235,8 @@ struct tcp_info {
 
 	__u32	tcpi_delivered;
 	__u32	tcpi_delivered_ce;
+
+	__u64	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
@@ -257,7 +259,7 @@ enum {
 	TCP_NLA_SND_SSTHRESH,	/* Slow start size threshold */
 	TCP_NLA_DELIVERED,	/* Data pkts delivered incl. out-of-order */
 	TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */
-
+	TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 27bbe6a792b7..873cb9968ff5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2594,6 +2594,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	sk->sk_rx_dst = NULL;
 	tcp_saved_syn_free(tp);
 	tp->compressed_ack = 0;
+	tp->bytes_sent = 0;
 
 	/* Clean up fastopen related fields */
 	tcp_free_fastopen_req(tp);
@@ -3201,6 +3202,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		info->tcpi_delivery_rate = rate64;
 	info->tcpi_delivered = tp->delivered;
 	info->tcpi_delivered_ce = tp->delivered_ce;
+	info->tcpi_bytes_sent = tp->bytes_sent;
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3225,6 +3227,7 @@ static size_t tcp_opt_stats_get_size(void)
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
 		0;
 }
 
@@ -3272,6 +3275,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
 	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
 
+	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
+			  TCP_NLA_PAD);
+
 	return stats;
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 490df62f26d4..861531fe0e97 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1136,6 +1136,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
 		tp->data_segs_out += tcp_skb_pcount(skb);
+		tp->bytes_sent += skb->len - tcp_header_size;
 		tcp_internal_pacing(sk, skb);
 	}
 
-- 
cgit v1.2.3


From fb31c9b9f6c85b1bad569ecedbde78d9e37cd87b Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Tue, 31 Jul 2018 17:46:22 -0700
Subject: tcp: add data bytes retransmitted stats

Introduce a new TCP stat to record the number of bytes retransmitted
(RFC4898 tcpEStatsPerfOctetsRetrans) and expose it in both tcp_info
(TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS).

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 3 +++
 include/uapi/linux/tcp.h | 2 ++
 net/ipv4/tcp.c           | 5 +++++
 net/ipv4/tcp_output.c    | 1 +
 4 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d0798dcd2cab..fb67f9a51b95 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -333,6 +333,9 @@ struct tcp_sock {
 				 * the first SYN. */
 	u32	undo_marker;	/* snd_una upon a new recovery episode. */
 	int	undo_retrans;	/* number of undoable retransmissions. */
+	u64	bytes_retrans;	/* RFC4898 tcpEStatsPerfOctetsRetrans
+				 * Total data bytes retransmitted
+				 */
 	u32	total_retrans;	/* Total retransmits for entire connection */
 
 	u32	urg_seq;	/* Seq of received urgent pointer */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 1c70ed287c3b..c31f5100b744 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -237,6 +237,7 @@ struct tcp_info {
 	__u32	tcpi_delivered_ce;
 
 	__u64	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
+	__u64	tcpi_bytes_retrans;  /* RFC4898 tcpEStatsPerfOctetsRetrans */
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
@@ -260,6 +261,7 @@ enum {
 	TCP_NLA_DELIVERED,	/* Data pkts delivered incl. out-of-order */
 	TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */
 	TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */
+	TCP_NLA_BYTES_RETRANS,	/* Data bytes retransmitted */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 873cb9968ff5..5ed1be88e922 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2595,6 +2595,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tcp_saved_syn_free(tp);
 	tp->compressed_ack = 0;
 	tp->bytes_sent = 0;
+	tp->bytes_retrans = 0;
 
 	/* Clean up fastopen related fields */
 	tcp_free_fastopen_req(tp);
@@ -3203,6 +3204,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_delivered = tp->delivered;
 	info->tcpi_delivered_ce = tp->delivered_ce;
 	info->tcpi_bytes_sent = tp->bytes_sent;
+	info->tcpi_bytes_retrans = tp->bytes_retrans;
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3228,6 +3230,7 @@ static size_t tcp_opt_stats_get_size(void)
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
 		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
 		0;
 }
 
@@ -3277,6 +3280,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 
 	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
 			  TCP_NLA_PAD);
+	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
+			  TCP_NLA_PAD);
 
 	return stats;
 }
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 861531fe0e97..50cabf7656f3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2871,6 +2871,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
 	tp->total_retrans += segs;
+	tp->bytes_retrans += skb->len;
 
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
-- 
cgit v1.2.3


From 7e10b6554ff2ce7f86d5d3eec3af5db8db482caa Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Tue, 31 Jul 2018 17:46:23 -0700
Subject: tcp: add dsack blocks received stats

Introduce a new TCP stat to record the number of DSACK blocks received
(RFC4989 tcpEStatsStackDSACKDups) and expose it in both tcp_info
(TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS).

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 3 +++
 include/uapi/linux/tcp.h | 2 ++
 net/ipv4/tcp.c           | 4 ++++
 net/ipv4/tcp_input.c     | 1 +
 4 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index fb67f9a51b95..da6281c549a5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -188,6 +188,9 @@ struct tcp_sock {
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
 				 */
+	u32	dsack_dups;	/* RFC4898 tcpEStatsStackDSACKDups
+				 * total number of DSACK blocks received
+				 */
  	u32	snd_una;	/* First byte we want an ack for	*/
  	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index c31f5100b744..0e1c0aec0153 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -238,6 +238,7 @@ struct tcp_info {
 
 	__u64	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
 	__u64	tcpi_bytes_retrans;  /* RFC4898 tcpEStatsPerfOctetsRetrans */
+	__u32	tcpi_dsack_dups;     /* RFC4898 tcpEStatsStackDSACKDups */
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
@@ -262,6 +263,7 @@ enum {
 	TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */
 	TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */
 	TCP_NLA_BYTES_RETRANS,	/* Data bytes retransmitted */
+	TCP_NLA_DSACK_DUPS,	/* DSACK blocks received */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5ed1be88e922..d6232b598cae 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2596,6 +2596,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->compressed_ack = 0;
 	tp->bytes_sent = 0;
 	tp->bytes_retrans = 0;
+	tp->dsack_dups = 0;
 
 	/* Clean up fastopen related fields */
 	tcp_free_fastopen_req(tp);
@@ -3205,6 +3206,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_delivered_ce = tp->delivered_ce;
 	info->tcpi_bytes_sent = tp->bytes_sent;
 	info->tcpi_bytes_retrans = tp->bytes_retrans;
+	info->tcpi_dsack_dups = tp->dsack_dups;
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3231,6 +3233,7 @@ static size_t tcp_opt_stats_get_size(void)
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
 		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
 		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
 		0;
 }
 
@@ -3282,6 +3285,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 			  TCP_NLA_PAD);
 	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
 			  TCP_NLA_PAD);
+	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
 
 	return stats;
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d51fa358b2b1..fbc85ff7d71d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -874,6 +874,7 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
 {
 	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
 	tp->rack.dsack_seen = 1;
+	tp->dsack_dups++;
 }
 
 /* It's reordering when higher sequence was delivered (i.e. sacked) before
-- 
cgit v1.2.3


From 7ec65372ca534217b53fd208500cf7aac223a383 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Tue, 31 Jul 2018 17:46:24 -0700
Subject: tcp: add stat of data packet reordering events

Introduce a new TCP stats to record the number of reordering events seen
and expose it in both tcp_info (TCP_INFO) and opt_stats
(SOF_TIMESTAMPING_OPT_STATS).
Application can use this stats to track the frequency of the reordering
events in addition to the existing reordering stats which tracks the
magnitude of the latest reordering event.

Note: this new stats tracks reordering events triggered by ACKs, which
could often be fewer than the actual number of packets being delivered
out-of-order.

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 4 ++--
 include/uapi/linux/tcp.h | 2 ++
 net/ipv4/tcp.c           | 4 ++++
 net/ipv4/tcp_input.c     | 3 ++-
 net/ipv4/tcp_recovery.c  | 2 +-
 5 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index da6281c549a5..263e37271afd 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -220,8 +220,7 @@ struct tcp_sock {
 #define TCP_RACK_RECOVERY_THRESH 16
 		u8 reo_wnd_persist:5, /* No. of recovery since last adj */
 		   dsack_seen:1, /* Whether DSACK seen after last adj */
-		   advanced:1,	 /* mstamp advanced since last lost marking */
-		   reord:1;	 /* reordering detected */
+		   advanced:1;	 /* mstamp advanced since last lost marking */
 	} rack;
 	u16	advmss;		/* Advertised MSS			*/
 	u8	compressed_ack;
@@ -267,6 +266,7 @@ struct tcp_sock {
 	u8	ecn_flags;	/* ECN status bits.			*/
 	u8	keepalive_probes; /* num of allowed keep alive probes	*/
 	u32	reordering;	/* Packet reordering metric.		*/
+	u32	reord_seen;	/* number of data packet reordering events */
 	u32	snd_up;		/* Urgent pointer		*/
 
 /*
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 0e1c0aec0153..e02d31986ff9 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -239,6 +239,7 @@ struct tcp_info {
 	__u64	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
 	__u64	tcpi_bytes_retrans;  /* RFC4898 tcpEStatsPerfOctetsRetrans */
 	__u32	tcpi_dsack_dups;     /* RFC4898 tcpEStatsStackDSACKDups */
+	__u32	tcpi_reord_seen;     /* reordering events seen */
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
@@ -264,6 +265,7 @@ enum {
 	TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */
 	TCP_NLA_BYTES_RETRANS,	/* Data bytes retransmitted */
 	TCP_NLA_DSACK_DUPS,	/* DSACK blocks received */
+	TCP_NLA_REORD_SEEN,	/* reordering events seen */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d6232b598cae..31fa1c080f28 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2597,6 +2597,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->bytes_sent = 0;
 	tp->bytes_retrans = 0;
 	tp->dsack_dups = 0;
+	tp->reord_seen = 0;
 
 	/* Clean up fastopen related fields */
 	tcp_free_fastopen_req(tp);
@@ -3207,6 +3208,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_bytes_sent = tp->bytes_sent;
 	info->tcpi_bytes_retrans = tp->bytes_retrans;
 	info->tcpi_dsack_dups = tp->dsack_dups;
+	info->tcpi_reord_seen = tp->reord_seen;
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3234,6 +3236,7 @@ static size_t tcp_opt_stats_get_size(void)
 		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
 		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
 		0;
 }
 
@@ -3286,6 +3289,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
 			  TCP_NLA_PAD);
 	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
+	nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
 
 	return stats;
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fbc85ff7d71d..3d6156f07a8d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -906,8 +906,8 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
 				       sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
 	}
 
-	tp->rack.reord = 1;
 	/* This exciting event is worth to be remembered. 8) */
+	tp->reord_seen++;
 	NET_INC_STATS(sock_net(sk),
 		      ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
 }
@@ -1871,6 +1871,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 
 	tp->reordering = min_t(u32, tp->packets_out + addend,
 			       sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
+	tp->reord_seen++;
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
 }
 
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 71593e4400ab..c81aadff769b 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -25,7 +25,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (!tp->rack.reord) {
+	if (!tp->reord_seen) {
 		/* If reordering has not been observed, be aggressive during
 		 * the recovery or starting the recovery by DUPACK threshold.
 		 */
-- 
cgit v1.2.3


From c971e6a006175bd0f195c6346c4e8bc4089bec00 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 28 May 2018 18:27:19 -0400
Subject: kill d_instantiate_no_diralias()

The only user is fuse_create_new_entry(), and there it's used to
mitigate the same mkdir/open-by-handle race as in nfs_mkdir().
The same solution applies - unhash the mkdir argument, then
call d_splice_alias() and if that returns a reference to preexisting
alias, dput() and report success.  ->mkdir() argument left unhashed
negative with the preexisting alias moved in the right place is just
fine from the ->mkdir() callers point of view.

Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 27 ---------------------------
 fs/fuse/dir.c          | 15 +++++++++++----
 include/linux/dcache.h |  1 -
 3 files changed, 11 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 0e8e5de3c48a..a7d9e7a4c283 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1899,33 +1899,6 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
 }
 EXPORT_SYMBOL(d_instantiate_new);
 
-/**
- * d_instantiate_no_diralias - instantiate a non-aliased dentry
- * @entry: dentry to complete
- * @inode: inode to attach to this dentry
- *
- * Fill in inode information in the entry.  If a directory alias is found, then
- * return an error (and drop inode).  Together with d_materialise_unique() this
- * guarantees that a directory inode may never have more than one alias.
- */
-int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode)
-{
-	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
-
-	security_d_instantiate(entry, inode);
-	spin_lock(&inode->i_lock);
-	if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) {
-		spin_unlock(&inode->i_lock);
-		iput(inode);
-		return -EBUSY;
-	}
-	__d_instantiate(entry, inode);
-	spin_unlock(&inode->i_lock);
-
-	return 0;
-}
-EXPORT_SYMBOL(d_instantiate_no_diralias);
-
 struct dentry *d_make_root(struct inode *root_inode)
 {
 	struct dentry *res = NULL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 56231b31f806..4bbae6ac75c3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -539,6 +539,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
 {
 	struct fuse_entry_out outarg;
 	struct inode *inode;
+	struct dentry *d;
 	int err;
 	struct fuse_forget_link *forget;
 
@@ -570,11 +571,17 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
 	}
 	kfree(forget);
 
-	err = d_instantiate_no_diralias(entry, inode);
-	if (err)
-		return err;
+	d_drop(entry);
+	d = d_splice_alias(inode, entry);
+	if (IS_ERR(d))
+		return PTR_ERR(d);
 
-	fuse_change_entry_timeout(entry, &outarg);
+	if (d) {
+		fuse_change_entry_timeout(d, &outarg);
+		dput(d);
+	} else {
+		fuse_change_entry_timeout(entry, &outarg);
+	}
 	fuse_invalidate_attr(dir);
 	return 0;
 
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 66c6e17e61e5..0b83629a3d8f 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -227,7 +227,6 @@ extern void d_instantiate(struct dentry *, struct inode *);
 extern void d_instantiate_new(struct dentry *, struct inode *);
 extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
 extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *);
-extern int d_instantiate_no_diralias(struct dentry *, struct inode *);
 extern void __d_drop(struct dentry *dentry);
 extern void d_drop(struct dentry *dentry);
 extern void d_delete(struct dentry *);
-- 
cgit v1.2.3


From 06bcb5168d7d49aa3ed449ff13a6d13c30afc3f0 Mon Sep 17 00:00:00 2001
From: Frieder Schrempf <frieder.schrempf@exceet.de>
Date: Thu, 2 Aug 2018 14:53:52 +0200
Subject: spi: spi-mem: Fix a typo in the documentation of struct spi_mem

Fix a typo in the @drvpriv description.

Signed-off-by: Frieder Schrempf <frieder.schrempf@exceet.de>
Acked-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi-mem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
index bb4bd15ae1f6..951a2e949d5f 100644
--- a/include/linux/spi/spi-mem.h
+++ b/include/linux/spi/spi-mem.h
@@ -122,7 +122,7 @@ struct spi_mem_op {
 /**
  * struct spi_mem - describes a SPI memory device
  * @spi: the underlying SPI device
- * @drvpriv: spi_mem_drviver private data
+ * @drvpriv: spi_mem_driver private data
  *
  * Extra information that describe the SPI memory device and may be needed by
  * the controller to properly handle this device should be placed here.
-- 
cgit v1.2.3


From 5d27a9c8ea9e967d00b92a76d4bb242bf7692f2b Mon Sep 17 00:00:00 2001
From: Frieder Schrempf <frieder.schrempf@exceet.de>
Date: Thu, 2 Aug 2018 14:53:53 +0200
Subject: spi: spi-mem: Extend the SPI mem interface to set a custom memory
 name

When porting (Q)SPI controller drivers from the MTD layer to the SPI
layer, the naming scheme for the memory devices changes. To be able
to keep compatibility with the old drivers naming scheme, a name
field is added to struct spi_mem and a hook is added to let controller
drivers set a custom name for the memory device.

Example for the FSL QSPI driver:

Name with the old driver: 21e0000.qspi,
or with multiple devices: 21e0000.qspi-0, 21e0000.qspi-1, ...

Name with the new driver without spi_mem_get_name: spi4.0

Suggested-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Frieder Schrempf <frieder.schrempf@exceet.de>
Reviewed-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-mem.c       | 28 ++++++++++++++++++++++++++++
 include/linux/spi/spi-mem.h | 12 ++++++++++++
 2 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c
index 990770dfa5cf..e43842c7a31a 100644
--- a/drivers/spi/spi-mem.c
+++ b/drivers/spi/spi-mem.c
@@ -310,6 +310,24 @@ int spi_mem_exec_op(struct spi_mem *mem, const struct spi_mem_op *op)
 }
 EXPORT_SYMBOL_GPL(spi_mem_exec_op);
 
+/**
+ * spi_mem_get_name() - Return the SPI mem device name to be used by the
+ *			upper layer if necessary
+ * @mem: the SPI memory
+ *
+ * This function allows SPI mem users to retrieve the SPI mem device name.
+ * It is useful if the upper layer needs to expose a custom name for
+ * compatibility reasons.
+ *
+ * Return: a string containing the name of the memory device to be used
+ *	   by the SPI mem user
+ */
+const char *spi_mem_get_name(struct spi_mem *mem)
+{
+	return mem->name;
+}
+EXPORT_SYMBOL_GPL(spi_mem_get_name);
+
 /**
  * spi_mem_adjust_op_size() - Adjust the data size of a SPI mem operation to
  *			      match controller limitations
@@ -344,6 +362,7 @@ static inline struct spi_mem_driver *to_spi_mem_drv(struct device_driver *drv)
 static int spi_mem_probe(struct spi_device *spi)
 {
 	struct spi_mem_driver *memdrv = to_spi_mem_drv(spi->dev.driver);
+	struct spi_controller *ctlr = spi->controller;
 	struct spi_mem *mem;
 
 	mem = devm_kzalloc(&spi->dev, sizeof(*mem), GFP_KERNEL);
@@ -351,6 +370,15 @@ static int spi_mem_probe(struct spi_device *spi)
 		return -ENOMEM;
 
 	mem->spi = spi;
+
+	if (ctlr->mem_ops && ctlr->mem_ops->get_name)
+		mem->name = ctlr->mem_ops->get_name(mem);
+	else
+		mem->name = dev_name(&spi->dev);
+
+	if (IS_ERR_OR_NULL(mem->name))
+		return PTR_ERR(mem->name);
+
 	spi_set_drvdata(spi, mem);
 
 	return memdrv->probe(mem);
diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
index 951a2e949d5f..0d64ccc4e584 100644
--- a/include/linux/spi/spi-mem.h
+++ b/include/linux/spi/spi-mem.h
@@ -123,6 +123,7 @@ struct spi_mem_op {
  * struct spi_mem - describes a SPI memory device
  * @spi: the underlying SPI device
  * @drvpriv: spi_mem_driver private data
+ * @name: name of the SPI memory device
  *
  * Extra information that describe the SPI memory device and may be needed by
  * the controller to properly handle this device should be placed here.
@@ -133,6 +134,7 @@ struct spi_mem_op {
 struct spi_mem {
 	struct spi_device *spi;
 	void *drvpriv;
+	char *name;
 };
 
 /**
@@ -165,6 +167,13 @@ static inline void *spi_mem_get_drvdata(struct spi_mem *mem)
  *		    limitations)
  * @supports_op: check if an operation is supported by the controller
  * @exec_op: execute a SPI memory operation
+ * @get_name: get a custom name for the SPI mem device from the controller.
+ *	      This might be needed if the controller driver has been ported
+ *	      to use the SPI mem layer and a custom name is used to keep
+ *	      mtdparts compatible.
+ *	      Note that if the implementation of this function allocates memory
+ *	      dynamically, then it should do so with devm_xxx(), as we don't
+ *	      have a ->free_name() function.
  *
  * This interface should be implemented by SPI controllers providing an
  * high-level interface to execute SPI memory operation, which is usually the
@@ -176,6 +185,7 @@ struct spi_controller_mem_ops {
 			    const struct spi_mem_op *op);
 	int (*exec_op)(struct spi_mem *mem,
 		       const struct spi_mem_op *op);
+	const char *(*get_name)(struct spi_mem *mem);
 };
 
 /**
@@ -234,6 +244,8 @@ bool spi_mem_supports_op(struct spi_mem *mem,
 int spi_mem_exec_op(struct spi_mem *mem,
 		    const struct spi_mem_op *op);
 
+const char *spi_mem_get_name(struct spi_mem *mem);
+
 int spi_mem_driver_register_with_owner(struct spi_mem_driver *drv,
 				       struct module *owner);
 
-- 
cgit v1.2.3


From a83c0ea79d8b5705d09a39c73224332f9170a903 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Sat, 30 Jun 2018 17:54:59 +0300
Subject: docs/mm: bootmem: add kernel-doc description of 'struct bootmem_data'

Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/bootmem.h | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 7942a96b1a9d..42515195d7d8 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -27,9 +27,20 @@ extern unsigned long max_pfn;
 extern unsigned long long max_possible_pfn;
 
 #ifndef CONFIG_NO_BOOTMEM
-/*
- * node_bootmem_map is a map pointer - the bits represent all physical 
- * memory pages (including holes) on the node.
+/**
+ * struct bootmem_data - per-node information used by the bootmem allocator
+ * @node_min_pfn: the starting physical address of the node's memory
+ * @node_low_pfn: the end physical address of the directly addressable memory
+ * @node_bootmem_map: is a bitmap pointer - the bits represent all physical
+ *		      memory pages (including holes) on the node.
+ * @last_end_off: the offset within the page of the end of the last allocation;
+ *                if 0, the page used is full
+ * @hint_idx: the PFN of the page used with the last allocation;
+ *            together with using this with the @last_end_offset field,
+ *            a test can be made to see if allocations can be merged
+ *            with the page used for the last allocation rather than
+ *            using up a full new page.
+ * @list: list entry in the linked list ordered by the memory addresses
  */
 typedef struct bootmem_data {
 	unsigned long node_min_pfn;
-- 
cgit v1.2.3


From e1720fee27246dfb84f7c433e35367170a2d4436 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Sat, 30 Jun 2018 17:55:01 +0300
Subject: mm/memblock: add a name for memblock flags enumeration

Since kernel-doc does not like anonymous enums the name is required for
adding documentation. While on it, I've also updated all the function
declarations to use 'enum memblock_flags' instead of unsigned long.

Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/memblock.h | 22 +++++++++++-----------
 mm/memblock.c            | 37 +++++++++++++++++++++----------------
 mm/nobootmem.c           |  2 +-
 3 files changed, 33 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ca59883c8364..8b8fbceffd4d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -21,7 +21,7 @@
 #define INIT_PHYSMEM_REGIONS	4
 
 /* Definition of memblock flags. */
-enum {
+enum memblock_flags {
 	MEMBLOCK_NONE		= 0x0,	/* No special request */
 	MEMBLOCK_HOTPLUG	= 0x1,	/* hotpluggable region */
 	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */
@@ -31,7 +31,7 @@ enum {
 struct memblock_region {
 	phys_addr_t base;
 	phys_addr_t size;
-	unsigned long flags;
+	enum memblock_flags flags;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 	int nid;
 #endif
@@ -72,7 +72,7 @@ void memblock_discard(void);
 
 phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
 					phys_addr_t start, phys_addr_t end,
-					int nid, ulong flags);
+					int nid, enum memblock_flags flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
 				   phys_addr_t size, phys_addr_t align);
 void memblock_allow_resize(void);
@@ -89,19 +89,19 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
 int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
-ulong choose_memblock_flags(void);
+enum memblock_flags choose_memblock_flags(void);
 
 /* Low level functions */
 int memblock_add_range(struct memblock_type *type,
 		       phys_addr_t base, phys_addr_t size,
-		       int nid, unsigned long flags);
+		       int nid, enum memblock_flags flags);
 
-void __next_mem_range(u64 *idx, int nid, ulong flags,
+void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
 		      struct memblock_type *type_a,
 		      struct memblock_type *type_b, phys_addr_t *out_start,
 		      phys_addr_t *out_end, int *out_nid);
 
-void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
+void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags,
 			  struct memblock_type *type_a,
 			  struct memblock_type *type_b, phys_addr_t *out_start,
 			  phys_addr_t *out_end, int *out_nid);
@@ -253,13 +253,13 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 			   NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL)
 
 static inline void memblock_set_region_flags(struct memblock_region *r,
-					     unsigned long flags)
+					     enum memblock_flags flags)
 {
 	r->flags |= flags;
 }
 
 static inline void memblock_clear_region_flags(struct memblock_region *r,
-					       unsigned long flags)
+					       enum memblock_flags flags)
 {
 	r->flags &= ~flags;
 }
@@ -317,10 +317,10 @@ static inline bool memblock_bottom_up(void)
 
 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
 					phys_addr_t start, phys_addr_t end,
-					ulong flags);
+					enum memblock_flags flags);
 phys_addr_t memblock_alloc_base_nid(phys_addr_t size,
 					phys_addr_t align, phys_addr_t max_addr,
-					int nid, ulong flags);
+					int nid, enum memblock_flags flags);
 phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
 				phys_addr_t max_addr);
 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
diff --git a/mm/memblock.c b/mm/memblock.c
index 03d48d8835ba..20f48b49821a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -60,7 +60,7 @@ static int memblock_can_resize __initdata_memblock;
 static int memblock_memory_in_slab __initdata_memblock = 0;
 static int memblock_reserved_in_slab __initdata_memblock = 0;
 
-ulong __init_memblock choose_memblock_flags(void)
+enum memblock_flags __init_memblock choose_memblock_flags(void)
 {
 	return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
 }
@@ -109,7 +109,7 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
 static phys_addr_t __init_memblock
 __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
 				phys_addr_t size, phys_addr_t align, int nid,
-				ulong flags)
+				enum memblock_flags flags)
 {
 	phys_addr_t this_start, this_end, cand;
 	u64 i;
@@ -143,7 +143,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
 static phys_addr_t __init_memblock
 __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
 			       phys_addr_t size, phys_addr_t align, int nid,
-			       ulong flags)
+			       enum memblock_flags flags)
 {
 	phys_addr_t this_start, this_end, cand;
 	u64 i;
@@ -188,7 +188,8 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
  */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 					phys_addr_t align, phys_addr_t start,
-					phys_addr_t end, int nid, ulong flags)
+					phys_addr_t end, int nid,
+					enum memblock_flags flags)
 {
 	phys_addr_t kernel_end, ret;
 
@@ -251,7 +252,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
 					phys_addr_t align)
 {
 	phys_addr_t ret;
-	ulong flags = choose_memblock_flags();
+	enum memblock_flags flags = choose_memblock_flags();
 
 again:
 	ret = memblock_find_in_range_node(size, align, start, end,
@@ -472,7 +473,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
 static void __init_memblock memblock_insert_region(struct memblock_type *type,
 						   int idx, phys_addr_t base,
 						   phys_addr_t size,
-						   int nid, unsigned long flags)
+						   int nid,
+						   enum memblock_flags flags)
 {
 	struct memblock_region *rgn = &type->regions[idx];
 
@@ -504,7 +506,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
  */
 int __init_memblock memblock_add_range(struct memblock_type *type,
 				phys_addr_t base, phys_addr_t size,
-				int nid, unsigned long flags)
+				int nid, enum memblock_flags flags)
 {
 	bool insert = false;
 	phys_addr_t obase = base;
@@ -873,7 +875,8 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
  * As both region arrays are sorted, the function advances the two indices
  * in lockstep and returns each intersection.
  */
-void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
+void __init_memblock __next_mem_range(u64 *idx, int nid,
+				      enum memblock_flags flags,
 				      struct memblock_type *type_a,
 				      struct memblock_type *type_b,
 				      phys_addr_t *out_start,
@@ -982,7 +985,8 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
  *
  * Reverse of __next_mem_range().
  */
-void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+					  enum memblock_flags flags,
 					  struct memblock_type *type_a,
 					  struct memblock_type *type_b,
 					  phys_addr_t *out_start,
@@ -1140,7 +1144,8 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
 
 static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
 					phys_addr_t align, phys_addr_t start,
-					phys_addr_t end, int nid, ulong flags)
+					phys_addr_t end, int nid,
+					enum memblock_flags flags)
 {
 	phys_addr_t found;
 
@@ -1162,7 +1167,7 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
 
 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
 					phys_addr_t start, phys_addr_t end,
-					ulong flags)
+					enum memblock_flags flags)
 {
 	return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
 					flags);
@@ -1170,14 +1175,14 @@ phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
 
 phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
 					phys_addr_t align, phys_addr_t max_addr,
-					int nid, ulong flags)
+					int nid, enum memblock_flags flags)
 {
 	return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
 }
 
 phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
-	ulong flags = choose_memblock_flags();
+	enum memblock_flags flags = choose_memblock_flags();
 	phys_addr_t ret;
 
 again:
@@ -1258,7 +1263,7 @@ static void * __init memblock_virt_alloc_internal(
 {
 	phys_addr_t alloc;
 	void *ptr;
-	ulong flags = choose_memblock_flags();
+	enum memblock_flags flags = choose_memblock_flags();
 
 	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
 		nid = NUMA_NO_NODE;
@@ -1733,7 +1738,7 @@ phys_addr_t __init_memblock memblock_get_current_limit(void)
 static void __init_memblock memblock_dump(struct memblock_type *type)
 {
 	phys_addr_t base, end, size;
-	unsigned long flags;
+	enum memblock_flags flags;
 	int idx;
 	struct memblock_region *rgn;
 
@@ -1751,7 +1756,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
 			snprintf(nid_buf, sizeof(nid_buf), " on node %d",
 				 memblock_get_region_node(rgn));
 #endif
-		pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n",
+		pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#x\n",
 			type->name, idx, &base, &end, &size, nid_buf, flags);
 	}
 }
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index c2cfa04f0231..439af3b765a7 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -42,7 +42,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 {
 	void *ptr;
 	u64 addr;
-	ulong flags = choose_memblock_flags();
+	enum memblock_flags flags = choose_memblock_flags();
 
 	if (limit > memblock.current_limit)
 		limit = memblock.current_limit;
-- 
cgit v1.2.3


From 47cec4432ab06c4ba90c241b142a016f878ac6da Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Sat, 30 Jun 2018 17:55:02 +0300
Subject: docs/mm: memblock: update kernel-doc comments

* make memblock_discard description kernel-doc compatible
* add brief description for memblock_setclr_flag and describe its
  parameters
* fixup return value descriptions

Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/memblock.h | 17 +++++++---
 mm/memblock.c            | 84 +++++++++++++++++++++++++++---------------------
 2 files changed, 59 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 8b8fbceffd4d..63704c64583a 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -239,7 +239,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 /**
  * for_each_resv_unavail_range - iterate through reserved and unavailable memory
  * @i: u64 used as loop variable
- * @flags: pick from blocks based on memory attributes
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  *
@@ -367,8 +366,10 @@ phys_addr_t memblock_get_current_limit(void);
  */
 
 /**
- * memblock_region_memory_base_pfn - Return the lowest pfn intersecting with the memory region
+ * memblock_region_memory_base_pfn - get the lowest pfn of the memory region
  * @reg: memblock_region structure
+ *
+ * Return: the lowest pfn intersecting with the memory region
  */
 static inline unsigned long memblock_region_memory_base_pfn(const struct memblock_region *reg)
 {
@@ -376,8 +377,10 @@ static inline unsigned long memblock_region_memory_base_pfn(const struct membloc
 }
 
 /**
- * memblock_region_memory_end_pfn - Return the end_pfn this region
+ * memblock_region_memory_end_pfn - get the end pfn of the memory region
  * @reg: memblock_region structure
+ *
+ * Return: the end_pfn of the reserved region
  */
 static inline unsigned long memblock_region_memory_end_pfn(const struct memblock_region *reg)
 {
@@ -385,8 +388,10 @@ static inline unsigned long memblock_region_memory_end_pfn(const struct memblock
 }
 
 /**
- * memblock_region_reserved_base_pfn - Return the lowest pfn intersecting with the reserved region
+ * memblock_region_reserved_base_pfn - get the lowest pfn of the reserved region
  * @reg: memblock_region structure
+ *
+ * Return: the lowest pfn intersecting with the reserved region
  */
 static inline unsigned long memblock_region_reserved_base_pfn(const struct memblock_region *reg)
 {
@@ -394,8 +399,10 @@ static inline unsigned long memblock_region_reserved_base_pfn(const struct membl
 }
 
 /**
- * memblock_region_reserved_end_pfn - Return the end_pfn this region
+ * memblock_region_reserved_end_pfn - get the end pfn of the reserved region
  * @reg: memblock_region structure
+ *
+ * Return: the end_pfn of the reserved region
  */
 static inline unsigned long memblock_region_reserved_end_pfn(const struct memblock_region *reg)
 {
diff --git a/mm/memblock.c b/mm/memblock.c
index 20f48b49821a..4b9dd76f6ae6 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -92,10 +92,11 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
 	return i < type->cnt;
 }
 
-/*
+/**
  * __memblock_find_range_bottom_up - find free area utility in bottom-up
  * @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ *       %MEMBLOCK_ALLOC_ACCESSIBLE
  * @size: size of free area to find
  * @align: alignment of free area to find
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
@@ -103,7 +104,7 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
  *
  * Utility called from memblock_find_in_range_node(), find free area bottom-up.
  *
- * RETURNS:
+ * Return:
  * Found address on success, 0 on failure.
  */
 static phys_addr_t __init_memblock
@@ -129,7 +130,8 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
 /**
  * __memblock_find_range_top_down - find free area utility, in top-down
  * @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ *       %MEMBLOCK_ALLOC_ACCESSIBLE
  * @size: size of free area to find
  * @align: alignment of free area to find
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
@@ -137,7 +139,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
  *
  * Utility called from memblock_find_in_range_node(), find free area top-down.
  *
- * RETURNS:
+ * Return:
  * Found address on success, 0 on failure.
  */
 static phys_addr_t __init_memblock
@@ -169,7 +171,8 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
  * @size: size of free area to find
  * @align: alignment of free area to find
  * @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ *       %MEMBLOCK_ALLOC_ACCESSIBLE
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
  * @flags: pick from blocks based on memory attributes
  *
@@ -183,7 +186,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
  *
  * If bottom-up allocation failed, will try to allocate memory top-down.
  *
- * RETURNS:
+ * Return:
  * Found address on success, 0 on failure.
  */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
@@ -238,13 +241,14 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 /**
  * memblock_find_in_range - find free area in given range
  * @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ *       %MEMBLOCK_ALLOC_ACCESSIBLE
  * @size: size of free area to find
  * @align: alignment of free area to find
  *
  * Find @size free area aligned to @align in the specified range.
  *
- * RETURNS:
+ * Return:
  * Found address on success, 0 on failure.
  */
 phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
@@ -288,7 +292,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
 
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
 /**
- * Discard memory and reserved arrays if they were allocated
+ * memblock_discard - discard memory and reserved arrays if they were allocated
  */
 void __init memblock_discard(void)
 {
@@ -318,11 +322,11 @@ void __init memblock_discard(void)
  *
  * Double the size of the @type regions array. If memblock is being used to
  * allocate memory for a new reserved regions array and there is a previously
- * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
+ * allocated memory range [@new_area_start, @new_area_start + @new_area_size]
  * waiting to be reserved, ensure the memory used by the new array does
  * not overlap.
  *
- * RETURNS:
+ * Return:
  * 0 on success, -1 on failure.
  */
 static int __init_memblock memblock_double_array(struct memblock_type *type,
@@ -467,7 +471,7 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
  * @nid:	node id of the new region
  * @flags:	flags of the new region
  *
- * Insert new memblock region [@base,@base+@size) into @type at @idx.
+ * Insert new memblock region [@base, @base + @size) into @type at @idx.
  * @type must already have extra room to accommodate the new region.
  */
 static void __init_memblock memblock_insert_region(struct memblock_type *type,
@@ -496,12 +500,12 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
  * @nid: nid of the new region
  * @flags: flags of the new region
  *
- * Add new memblock region [@base,@base+@size) into @type.  The new region
+ * Add new memblock region [@base, @base + @size) into @type.  The new region
  * is allowed to overlap with existing ones - overlaps don't affect already
  * existing regions.  @type is guaranteed to be minimal (all neighbouring
  * compatible regions are merged) after the addition.
  *
- * RETURNS:
+ * Return:
  * 0 on success, -errno on failure.
  */
 int __init_memblock memblock_add_range(struct memblock_type *type,
@@ -615,11 +619,11 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
  * @end_rgn: out parameter for the end of isolated region
  *
  * Walk @type and ensure that regions don't cross the boundaries defined by
- * [@base,@base+@size).  Crossing regions are split at the boundaries,
+ * [@base, @base + @size).  Crossing regions are split at the boundaries,
  * which may create at most two more regions.  The index of the first
  * region inside the range is returned in *@start_rgn and end in *@end_rgn.
  *
- * RETURNS:
+ * Return:
  * 0 on success, -errno on failure.
  */
 static int __init_memblock memblock_isolate_range(struct memblock_type *type,
@@ -730,10 +734,15 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
 }
 
 /**
+ * memblock_setclr_flag - set or clear flag for a memory region
+ * @base: base address of the region
+ * @size: size of the region
+ * @set: set or clear the flag
+ * @flag: the flag to udpate
  *
  * This function isolates region [@base, @base + @size), and sets/clears flag
  *
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
  */
 static int __init_memblock memblock_setclr_flag(phys_addr_t base,
 				phys_addr_t size, int set, int flag)
@@ -760,7 +769,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
  */
 int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
 {
@@ -772,7 +781,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
  */
 int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
 {
@@ -784,7 +793,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
  */
 int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
 {
@@ -798,7 +807,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
  */
 int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
 {
@@ -810,7 +819,7 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
  */
 int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
 {
@@ -971,9 +980,6 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
 /**
  * __next_mem_range_rev - generic next function for for_each_*_range_rev()
  *
- * Finds the next range from type_a which is not marked as unsuitable
- * in type_b.
- *
  * @idx: pointer to u64 loop variable
  * @nid: node selector, %NUMA_NO_NODE for all nodes
  * @flags: pick from blocks based on memory attributes
@@ -983,6 +989,9 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
  * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @out_nid: ptr to int for nid of the range, can be %NULL
  *
+ * Finds the next range from type_a which is not marked as unsuitable
+ * in type_b.
+ *
  * Reverse of __next_mem_range().
  */
 void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
@@ -1118,10 +1127,10 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
  * @type: memblock type to set node ID for
  * @nid: node ID to set
  *
- * Set the nid of memblock @type regions in [@base,@base+@size) to @nid.
+ * Set the nid of memblock @type regions in [@base, @base + @size) to @nid.
  * Regions which cross the area boundaries are split as necessary.
  *
- * RETURNS:
+ * Return:
  * 0 on success, -errno on failure.
  */
 int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
@@ -1245,7 +1254,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
  * The allocation is performed from memory region limited by
  * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE.
  *
- * The memory block is aligned on SMP_CACHE_BYTES if @align == 0.
+ * The memory block is aligned on %SMP_CACHE_BYTES if @align == 0.
  *
  * The phys address of allocated boot memory block is converted to virtual and
  * allocated memory is reset to 0.
@@ -1253,7 +1262,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
  * In addition, function sets the min_count to 0 using kmemleak_alloc for
  * allocated boot memory block, so that it is never reported as leaks.
  *
- * RETURNS:
+ * Return:
  * Virtual address of allocated memory block on success, NULL on failure.
  */
 static void * __init memblock_virt_alloc_internal(
@@ -1338,7 +1347,7 @@ done:
  * info), if enabled. Does not zero allocated memory, does not panic if request
  * cannot be satisfied.
  *
- * RETURNS:
+ * Return:
  * Virtual address of allocated memory block on success, NULL on failure.
  */
 void * __init memblock_virt_alloc_try_nid_raw(
@@ -1375,7 +1384,7 @@ void * __init memblock_virt_alloc_try_nid_raw(
  * Public function, provides additional debug information (including caller
  * info), if enabled. This function zeroes the allocated memory.
  *
- * RETURNS:
+ * Return:
  * Virtual address of allocated memory block on success, NULL on failure.
  */
 void * __init memblock_virt_alloc_try_nid_nopanic(
@@ -1411,7 +1420,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
  * which provides debug information (including caller info), if enabled,
  * and panics if the request can not be satisfied.
  *
- * RETURNS:
+ * Return:
  * Virtual address of allocated memory block on success, NULL on failure.
  */
 void * __init memblock_virt_alloc_try_nid(
@@ -1668,9 +1677,9 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
  * @base: base of region to check
  * @size: size of region to check
  *
- * Check if the region [@base, @base+@size) is a subset of a memory block.
+ * Check if the region [@base, @base + @size) is a subset of a memory block.
  *
- * RETURNS:
+ * Return:
  * 0 if false, non-zero if true
  */
 bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
@@ -1689,9 +1698,10 @@ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t siz
  * @base: base of region to check
  * @size: size of region to check
  *
- * Check if the region [@base, @base+@size) intersects a reserved memory block.
+ * Check if the region [@base, @base + @size) intersects a reserved
+ * memory block.
  *
- * RETURNS:
+ * Return:
  * True if they intersect, false if not.
  */
 bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
-- 
cgit v1.2.3


From 9a0de1bfe191220cb0437865261ab2f95cbb13ef Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Sat, 30 Jun 2018 17:55:04 +0300
Subject: docs/mm: memblock: add kernel-doc description for memblock types

Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/memblock.h | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 63704c64583a..516920549378 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -20,7 +20,13 @@
 #define INIT_MEMBLOCK_REGIONS	128
 #define INIT_PHYSMEM_REGIONS	4
 
-/* Definition of memblock flags. */
+/**
+ * enum memblock_flags - definition of memory region attributes
+ * @MEMBLOCK_NONE: no special request
+ * @MEMBLOCK_HOTPLUG: hotpluggable region
+ * @MEMBLOCK_MIRROR: mirrored region
+ * @MEMBLOCK_NOMAP: don't add to kernel direct mapping
+ */
 enum memblock_flags {
 	MEMBLOCK_NONE		= 0x0,	/* No special request */
 	MEMBLOCK_HOTPLUG	= 0x1,	/* hotpluggable region */
@@ -28,6 +34,13 @@ enum memblock_flags {
 	MEMBLOCK_NOMAP		= 0x4,	/* don't add to kernel direct mapping */
 };
 
+/**
+ * struct memblock_region - represents a memory region
+ * @base: physical address of the region
+ * @size: size of the region
+ * @flags: memory region attributes
+ * @nid: NUMA node id
+ */
 struct memblock_region {
 	phys_addr_t base;
 	phys_addr_t size;
@@ -37,14 +50,30 @@ struct memblock_region {
 #endif
 };
 
+/**
+ * struct memblock_type - collection of memory regions of certain type
+ * @cnt: number of regions
+ * @max: size of the allocated array
+ * @total_size: size of all regions
+ * @regions: array of regions
+ * @name: the memory type symbolic name
+ */
 struct memblock_type {
-	unsigned long cnt;	/* number of regions */
-	unsigned long max;	/* size of the allocated array */
-	phys_addr_t total_size;	/* size of all regions */
+	unsigned long cnt;
+	unsigned long max;
+	phys_addr_t total_size;
 	struct memblock_region *regions;
 	char *name;
 };
 
+/**
+ * struct memblock - memblock allocator metadata
+ * @bottom_up: is bottom up direction?
+ * @current_limit: physical address of the current allocation limit
+ * @memory: usabe memory regions
+ * @reserved: reserved memory regions
+ * @physmem: all physical memory
+ */
 struct memblock {
 	bool bottom_up;  /* is bottom up direction? */
 	phys_addr_t current_limit;
-- 
cgit v1.2.3


From 2afc9166f79b8f6da5f347f48515215ceee4ae37 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Thu, 2 Aug 2018 10:51:40 -0700
Subject: scsi: sysfs: Introduce sysfs_{un,}break_active_protection()

Introduce these two functions and export them such that the next patch
can add calls to these functions from the SCSI core.

Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 fs/sysfs/file.c       | 44 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sysfs.h | 14 ++++++++++++++
 2 files changed, 58 insertions(+)

(limited to 'include/linux')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 5c13f29bfcdb..118fa197a35f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -405,6 +405,50 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
 }
 EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 
+/**
+ * sysfs_break_active_protection - break "active" protection
+ * @kobj: The kernel object @attr is associated with.
+ * @attr: The attribute to break the "active" protection for.
+ *
+ * With sysfs, just like kernfs, deletion of an attribute is postponed until
+ * all active .show() and .store() callbacks have finished unless this function
+ * is called. Hence this function is useful in methods that implement self
+ * deletion.
+ */
+struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj,
+						  const struct attribute *attr)
+{
+	struct kernfs_node *kn;
+
+	kobject_get(kobj);
+	kn = kernfs_find_and_get(kobj->sd, attr->name);
+	if (kn)
+		kernfs_break_active_protection(kn);
+	return kn;
+}
+EXPORT_SYMBOL_GPL(sysfs_break_active_protection);
+
+/**
+ * sysfs_unbreak_active_protection - restore "active" protection
+ * @kn: Pointer returned by sysfs_break_active_protection().
+ *
+ * Undo the effects of sysfs_break_active_protection(). Since this function
+ * calls kernfs_put() on the kernfs node that corresponds to the 'attr'
+ * argument passed to sysfs_break_active_protection() that attribute may have
+ * been removed between the sysfs_break_active_protection() and
+ * sysfs_unbreak_active_protection() calls, it is not safe to access @kn after
+ * this function has returned.
+ */
+void sysfs_unbreak_active_protection(struct kernfs_node *kn)
+{
+	struct kobject *kobj = kn->parent->priv;
+
+	kernfs_unbreak_active_protection(kn);
+	kernfs_put(kn);
+	kobject_put(kobj);
+}
+EXPORT_SYMBOL_GPL(sysfs_unbreak_active_protection);
+
 /**
  * sysfs_remove_file_ns - remove an object attribute with a custom ns tag
  * @kobj: object we're acting for
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index b8bfdc173ec0..3c12198c0103 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -237,6 +237,9 @@ int __must_check sysfs_create_files(struct kobject *kobj,
 				   const struct attribute **attr);
 int __must_check sysfs_chmod_file(struct kobject *kobj,
 				  const struct attribute *attr, umode_t mode);
+struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj,
+						  const struct attribute *attr);
+void sysfs_unbreak_active_protection(struct kernfs_node *kn);
 void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
 			  const void *ns);
 bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr);
@@ -350,6 +353,17 @@ static inline int sysfs_chmod_file(struct kobject *kobj,
 	return 0;
 }
 
+static inline struct kernfs_node *
+sysfs_break_active_protection(struct kobject *kobj,
+			      const struct attribute *attr)
+{
+	return NULL;
+}
+
+static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn)
+{
+}
+
 static inline void sysfs_remove_file_ns(struct kobject *kobj,
 					const struct attribute *attr,
 					const void *ns)
-- 
cgit v1.2.3


From e7d0748dd71695b94f3a35c8bdc05226a7f3d919 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 2 Aug 2018 15:22:13 -0600
Subject: block: Switch struct packet_command to use struct scsi_sense_hdr

There is a lot of needless struct request_sense usage in the CDROM
code. These can all be struct scsi_sense_hdr instead, to avoid any
confusion over their respective structure sizes. This patch is a lot
of noise changing "sense" to "sshdr", but the final code is more
readable to distinguish between "sense" meaning "struct request_sense"
and "sshdr" meaning "struct scsi_sense_hdr".

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/pktcdvd.c    | 36 ++++++++++++++++++------------------
 drivers/cdrom/cdrom.c      | 22 +++++++++++-----------
 drivers/ide/ide-cd.c       | 11 ++++++-----
 drivers/ide/ide-cd.h       |  4 ++--
 drivers/ide/ide-cd_ioctl.c | 30 +++++++++++++++---------------
 drivers/scsi/sr_ioctl.c    | 22 +++++++++-------------
 include/linux/cdrom.h      |  3 ++-
 7 files changed, 63 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 9bb7721c26fc..e285413d4a75 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -748,13 +748,13 @@ static const char *sense_key_string(__u8 index)
 static void pkt_dump_sense(struct pktcdvd_device *pd,
 			   struct packet_command *cgc)
 {
-	struct request_sense *sense = cgc->sense;
+	struct scsi_sense_hdr *sshdr = cgc->sshdr;
 
-	if (sense)
+	if (sshdr)
 		pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n",
 			CDROM_PACKET_SIZE, cgc->cmd,
-			sense->sense_key, sense->asc, sense->ascq,
-			sense_key_string(sense->sense_key));
+			sshdr->sense_key, sshdr->asc, sshdr->ascq,
+			sense_key_string(sshdr->sense_key));
 	else
 		pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd);
 }
@@ -787,11 +787,11 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
 				unsigned write_speed, unsigned read_speed)
 {
 	struct packet_command cgc;
-	struct request_sense sense;
+	struct scsi_sense_hdr sshdr;
 	int ret;
 
 	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
-	cgc.sense = &sense;
+	cgc.sshdr = &sshdr;
 	cgc.cmd[0] = GPCMD_SET_SPEED;
 	cgc.cmd[2] = (read_speed >> 8) & 0xff;
 	cgc.cmd[3] = read_speed & 0xff;
@@ -1651,7 +1651,7 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
 static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
 {
 	struct packet_command cgc;
-	struct request_sense sense;
+	struct scsi_sense_hdr sshdr;
 	write_param_page *wp;
 	char buffer[128];
 	int ret, size;
@@ -1662,7 +1662,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
 
 	memset(buffer, 0, sizeof(buffer));
 	init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ);
-	cgc.sense = &sense;
+	cgc.sshdr = &sshdr;
 	ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
 	if (ret) {
 		pkt_dump_sense(pd, &cgc);
@@ -1678,7 +1678,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
 	 * now get it all
 	 */
 	init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ);
-	cgc.sense = &sense;
+	cgc.sshdr = &sshdr;
 	ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
 	if (ret) {
 		pkt_dump_sense(pd, &cgc);
@@ -1916,12 +1916,12 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
 						int set)
 {
 	struct packet_command cgc;
-	struct request_sense sense;
+	struct scsi_sense_hdr sshdr;
 	unsigned char buf[64];
 	int ret;
 
 	init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
-	cgc.sense = &sense;
+	cgc.sshdr = &sshdr;
 	cgc.buflen = pd->mode_offset + 12;
 
 	/*
@@ -1962,14 +1962,14 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd,
 						unsigned *write_speed)
 {
 	struct packet_command cgc;
-	struct request_sense sense;
+	struct scsi_sense_hdr sshdr;
 	unsigned char buf[256+18];
 	unsigned char *cap_buf;
 	int ret, offset;
 
 	cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset];
 	init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN);
-	cgc.sense = &sense;
+	cgc.sshdr = &sshdr;
 
 	ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
 	if (ret) {
@@ -2023,13 +2023,13 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
 						unsigned *speed)
 {
 	struct packet_command cgc;
-	struct request_sense sense;
+	struct scsi_sense_hdr sshdr;
 	unsigned char buf[64];
 	unsigned int size, st, sp;
 	int ret;
 
 	init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ);
-	cgc.sense = &sense;
+	cgc.sshdr = &sshdr;
 	cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
 	cgc.cmd[1] = 2;
 	cgc.cmd[2] = 4; /* READ ATIP */
@@ -2044,7 +2044,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
 		size = sizeof(buf);
 
 	init_cdrom_command(&cgc, buf, size, CGC_DATA_READ);
-	cgc.sense = &sense;
+	cgc.sshdr = &sshdr;
 	cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
 	cgc.cmd[1] = 2;
 	cgc.cmd[2] = 4;
@@ -2095,13 +2095,13 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
 static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
 {
 	struct packet_command cgc;
-	struct request_sense sense;
+	struct scsi_sense_hdr sshdr;
 	int ret;
 
 	pkt_dbg(2, pd, "Performing OPC\n");
 
 	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
-	cgc.sense = &sense;
+	cgc.sshdr = &sshdr;
 	cgc.timeout = 60*HZ;
 	cgc.cmd[0] = GPCMD_SEND_OPC;
 	cgc.cmd[1] = 1;
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index a78b8e7085e9..86619472d916 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -345,10 +345,10 @@ static LIST_HEAD(cdrom_list);
 int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
 			       struct packet_command *cgc)
 {
-	if (cgc->sense) {
-		cgc->sense->sense_key = 0x05;
-		cgc->sense->asc = 0x20;
-		cgc->sense->ascq = 0x00;
+	if (cgc->sshdr) {
+		cgc->sshdr->sense_key = 0x05;
+		cgc->sshdr->asc = 0x20;
+		cgc->sshdr->ascq = 0x00;
 	}
 
 	cgc->stat = -EIO;
@@ -2943,7 +2943,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
 					      struct packet_command *cgc,
 					      int cmd)
 {
-	struct request_sense sense;
+	struct scsi_sense_hdr sshdr;
 	struct cdrom_msf msf;
 	int blocksize = 0, format = 0, lba;
 	int ret;
@@ -2971,13 +2971,13 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
 	if (cgc->buffer == NULL)
 		return -ENOMEM;
 
-	memset(&sense, 0, sizeof(sense));
-	cgc->sense = &sense;
+	memset(&sshdr, 0, sizeof(sshdr));
+	cgc->sshdr = &sshdr;
 	cgc->data_direction = CGC_DATA_READ;
 	ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize);
-	if (ret && sense.sense_key == 0x05 &&
-	    sense.asc == 0x20 &&
-	    sense.ascq == 0x00) {
+	if (ret && sshdr.sense_key == 0x05 &&
+	    sshdr.asc == 0x20 &&
+	    sshdr.ascq == 0x00) {
 		/*
 		 * SCSI-II devices are not required to support
 		 * READ_CD, so let's try switching block size
@@ -2986,7 +2986,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
 		ret = cdrom_switch_blocksize(cdi, blocksize);
 		if (ret)
 			goto out;
-		cgc->sense = NULL;
+		cgc->sshdr = NULL;
 		ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1);
 		ret |= cdrom_switch_blocksize(cdi, blocksize);
 	}
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index a37dd381d307..a24cdff01865 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -419,7 +419,7 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
 
 int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		    int write, void *buffer, unsigned *bufflen,
-		    struct request_sense *sense, int timeout,
+		    struct scsi_sense_hdr *sshdr, int timeout,
 		    req_flags_t rq_flags)
 {
 	struct cdrom_info *info = drive->driver_data;
@@ -456,8 +456,9 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 
 		if (buffer)
 			*bufflen = scsi_req(rq)->resid_len;
-		if (sense)
-			memcpy(sense, scsi_req(rq)->sense, sizeof(*sense));
+		if (sshdr)
+			scsi_normalize_sense(scsi_req(rq)->sense,
+					     scsi_req(rq)->sense_len, sshdr);
 
 		/*
 		 * FIXME: we should probably abort/retry or something in case of
@@ -864,7 +865,7 @@ static void msf_from_bcd(struct atapi_msf *msf)
 	msf->frame  = bcd2bin(msf->frame);
 }
 
-int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense)
+int cdrom_check_status(ide_drive_t *drive, struct scsi_sense_hdr *sshdr)
 {
 	struct cdrom_info *info = drive->driver_data;
 	struct cdrom_device_info *cdi;
@@ -886,7 +887,7 @@ int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense)
 	 */
 	cmd[7] = cdi->sanyo_slot % 3;
 
-	return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, RQF_QUIET);
+	return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sshdr, 0, RQF_QUIET);
 }
 
 static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index fc162fbb6629..a69dc7f61c4d 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -98,11 +98,11 @@ void ide_cd_log_error(const char *, struct request *, struct request_sense *);
 
 /* ide-cd.c functions used by ide-cd_ioctl.c */
 int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *,
-		    unsigned *, struct request_sense *, int, req_flags_t);
+		    unsigned *, struct scsi_sense_hdr *, int, req_flags_t);
 int ide_cd_read_toc(ide_drive_t *);
 int ide_cdrom_get_capabilities(ide_drive_t *, u8 *);
 void ide_cdrom_update_speed(ide_drive_t *, u8 *);
-int cdrom_check_status(ide_drive_t *, struct request_sense *);
+int cdrom_check_status(ide_drive_t *, struct scsi_sense_hdr *);
 
 /* ide-cd_ioctl.c */
 int ide_cdrom_open_real(struct cdrom_device_info *, int);
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 14540544413c..4a6e1a413ead 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -43,14 +43,14 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
 {
 	ide_drive_t *drive = cdi->handle;
 	struct media_event_desc med;
-	struct request_sense sense;
+	struct scsi_sense_hdr sshdr;
 	int stat;
 
 	if (slot_nr != CDSL_CURRENT)
 		return -EINVAL;
 
-	stat = cdrom_check_status(drive, &sense);
-	if (!stat || sense.sense_key == UNIT_ATTENTION)
+	stat = cdrom_check_status(drive, &sshdr);
+	if (!stat || sshdr.sense_key == UNIT_ATTENTION)
 		return CDS_DISC_OK;
 
 	if (!cdrom_get_media_event(cdi, &med)) {
@@ -62,8 +62,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
 			return CDS_NO_DISC;
 	}
 
-	if (sense.sense_key == NOT_READY && sense.asc == 0x04
-			&& sense.ascq == 0x04)
+	if (sshdr.sense_key == NOT_READY && sshdr.asc == 0x04
+			&& sshdr.ascq == 0x04)
 		return CDS_DISC_OK;
 
 	/*
@@ -71,8 +71,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
 	 * just return TRAY_OPEN since ATAPI doesn't provide
 	 * any other way to detect this...
 	 */
-	if (sense.sense_key == NOT_READY) {
-		if (sense.asc == 0x3a && sense.ascq == 1)
+	if (sshdr.sense_key == NOT_READY) {
+		if (sshdr.asc == 0x3a && sshdr.ascq == 1)
 			return CDS_NO_DISC;
 		else
 			return CDS_TRAY_OPEN;
@@ -135,7 +135,7 @@ int cdrom_eject(ide_drive_t *drive, int ejectflag)
 static
 int ide_cd_lockdoor(ide_drive_t *drive, int lockflag)
 {
-	struct request_sense my_sense, *sense = &my_sense;
+	struct scsi_sense_hdr sshdr;
 	int stat;
 
 	/* If the drive cannot lock the door, just pretend. */
@@ -150,14 +150,14 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag)
 		cmd[4] = lockflag ? 1 : 0;
 
 		stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL,
-				       sense, 0, 0);
+				       &sshdr, 0, 0);
 	}
 
 	/* If we got an illegal field error, the drive
 	   probably cannot lock the door. */
 	if (stat != 0 &&
-	    sense->sense_key == ILLEGAL_REQUEST &&
-	    (sense->asc == 0x24 || sense->asc == 0x20)) {
+	    sshdr.sense_key == ILLEGAL_REQUEST &&
+	    (sshdr.asc == 0x24 || sshdr.asc == 0x20)) {
 		printk(KERN_ERR "%s: door locking not supported\n",
 			drive->name);
 		drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
@@ -165,7 +165,7 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag)
 	}
 
 	/* no medium, that's alright. */
-	if (stat != 0 && sense->sense_key == NOT_READY && sense->asc == 0x3a)
+	if (stat != 0 && sshdr.sense_key == NOT_READY && sshdr.asc == 0x3a)
 		stat = 0;
 
 	if (stat == 0) {
@@ -451,8 +451,8 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
 	   layer. the packet must be complete, as we do not
 	   touch it at all. */
 
-	if (cgc->sense)
-		memset(cgc->sense, 0, sizeof(struct request_sense));
+	if (cgc->sshdr)
+		memset(cgc->sshdr, 0, sizeof(*cgc->sshdr));
 
 	if (cgc->quiet)
 		flags |= RQF_QUIET;
@@ -460,7 +460,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
 	cgc->stat = ide_cd_queue_pc(drive, cgc->cmd,
 				    cgc->data_direction == CGC_DATA_WRITE,
 				    cgc->buffer, &len,
-				    cgc->sense, cgc->timeout, flags);
+				    cgc->sshdr, cgc->timeout, flags);
 	if (!cgc->stat)
 		cgc->buflen -= len;
 	return cgc->stat;
diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c
index 35fab1e18adc..ffcf902da390 100644
--- a/drivers/scsi/sr_ioctl.c
+++ b/drivers/scsi/sr_ioctl.c
@@ -186,14 +186,13 @@ static int sr_play_trkind(struct cdrom_device_info *cdi,
 int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
 {
 	struct scsi_device *SDev;
-	struct scsi_sense_hdr sshdr;
+	struct scsi_sense_hdr local_sshdr, *sshdr = &local_sshdr;
 	int result, err = 0, retries = 0;
-	unsigned char sense_buffer[SCSI_SENSE_BUFFERSIZE], *senseptr = NULL;
 
 	SDev = cd->device;
 
-	if (cgc->sense)
-		senseptr = sense_buffer;
+	if (cgc->sshdr)
+		sshdr = cgc->sshdr;
 
       retry:
 	if (!scsi_block_when_processing_errors(SDev)) {
@@ -202,15 +201,12 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
 	}
 
 	result = scsi_execute(SDev, cgc->cmd, cgc->data_direction,
-			      cgc->buffer, cgc->buflen, senseptr, &sshdr,
+			      cgc->buffer, cgc->buflen, NULL, sshdr,
 			      cgc->timeout, IOCTL_RETRIES, 0, 0, NULL);
 
-	if (cgc->sense)
-		memcpy(cgc->sense, sense_buffer, sizeof(*cgc->sense));
-
 	/* Minimal error checking.  Ignore cases we know about, and report the rest. */
 	if (driver_byte(result) != 0) {
-		switch (sshdr.sense_key) {
+		switch (sshdr->sense_key) {
 		case UNIT_ATTENTION:
 			SDev->changed = 1;
 			if (!cgc->quiet)
@@ -221,8 +217,8 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
 			err = -ENOMEDIUM;
 			break;
 		case NOT_READY:	/* This happens if there is no disc in drive */
-			if (sshdr.asc == 0x04 &&
-			    sshdr.ascq == 0x01) {
+			if (sshdr->asc == 0x04 &&
+			    sshdr->ascq == 0x01) {
 				/* sense: Logical unit is in process of becoming ready */
 				if (!cgc->quiet)
 					sr_printk(KERN_INFO, cd,
@@ -245,8 +241,8 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
 			break;
 		case ILLEGAL_REQUEST:
 			err = -EIO;
-			if (sshdr.asc == 0x20 &&
-			    sshdr.ascq == 0x00)
+			if (sshdr->asc == 0x20 &&
+			    sshdr->ascq == 0x00)
 				/* sense: Invalid command operation code */
 				err = -EDRIVE_CANT_DO_THIS;
 			break;
diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h
index e75dfd1f1dec..528271c60018 100644
--- a/include/linux/cdrom.h
+++ b/include/linux/cdrom.h
@@ -13,6 +13,7 @@
 
 #include <linux/fs.h>		/* not really needed, later.. */
 #include <linux/list.h>
+#include <scsi/scsi_common.h>
 #include <uapi/linux/cdrom.h>
 
 struct packet_command
@@ -21,7 +22,7 @@ struct packet_command
 	unsigned char 		*buffer;
 	unsigned int 		buflen;
 	int			stat;
-	struct request_sense	*sense;
+	struct scsi_sense_hdr	*sshdr;
 	unsigned char		data_direction;
 	int			quiet;
 	int			timeout;
-- 
cgit v1.2.3


From 9a47249d444d344051c7c0e909fad0e88515a5c2 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 31 Jul 2018 21:11:00 +0200
Subject: random: Make crng state queryable

It is very useful to be able to know whether or not get_random_bytes_wait
/ wait_for_random_bytes is going to block or not, or whether plain
get_random_bytes is going to return good randomness or bad randomness.

The particular use case is for mitigating certain attacks in WireGuard.
A handshake packet arrives and is queued up. Elsewhere a worker thread
takes items from the queue and processes them. In replying to these
items, it needs to use some random data, and it has to be good random
data. If we simply block until we can have good randomness, then it's
possible for an attacker to fill the queue up with packets waiting to be
processed. Upon realizing the queue is full, WireGuard will detect that
it's under a denial of service attack, and behave accordingly. A better
approach is just to drop incoming handshake packets if the crng is not
yet initialized.

This patch, therefore, makes that information directly accessible.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 drivers/char/random.c  | 15 +++++++++++++++
 include/linux/random.h |  1 +
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 561082d46a82..bf5f99fc36f1 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1663,6 +1663,21 @@ int wait_for_random_bytes(void)
 }
 EXPORT_SYMBOL(wait_for_random_bytes);
 
+/*
+ * Returns whether or not the urandom pool has been seeded and thus guaranteed
+ * to supply cryptographically secure random numbers. This applies to: the
+ * /dev/urandom device, the get_random_bytes function, and the get_random_{u32,
+ * ,u64,int,long} family of functions.
+ *
+ * Returns: true if the urandom pool has been seeded.
+ *          false if the urandom pool has not been seeded.
+ */
+bool rng_is_initialized(void)
+{
+	return crng_ready();
+}
+EXPORT_SYMBOL(rng_is_initialized);
+
 /*
  * Add a callback function that will be invoked when the nonblocking
  * pool is initialised.
diff --git a/include/linux/random.h b/include/linux/random.h
index f1c9bc5cd231..445a0ea4ff49 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -36,6 +36,7 @@ extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
 
 extern void get_random_bytes(void *buf, int nbytes);
 extern int wait_for_random_bytes(void);
+extern bool rng_is_initialized(void);
 extern int add_random_ready_callback(struct random_ready_callback *rdy);
 extern void del_random_ready_callback(struct random_ready_callback *rdy);
 extern int __must_check get_random_bytes_arch(void *buf, int nbytes);
-- 
cgit v1.2.3


From 0a4c58f5702858822621fa1177c7d3475f181ccb Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 2 Aug 2018 14:27:17 -0700
Subject: bpf: add ability to charge bpf maps memory dynamically

This commits extends existing bpf maps memory charging API
to support dynamic charging/uncharging.

This is required to account memory used by maps,
if all entries are created dynamically after
the map initialization.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h  |  2 ++
 kernel/bpf/syscall.c | 58 ++++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 45 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b5ad95cf339..5a4a256473c3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -435,6 +435,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
+int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
+void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
 void *bpf_map_area_alloc(size_t size, int numa_node);
 void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a31a1ba0f8ea..7958252a4d29 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -181,32 +181,60 @@ int bpf_map_precharge_memlock(u32 pages)
 	return 0;
 }
 
-static int bpf_map_charge_memlock(struct bpf_map *map)
+static int bpf_charge_memlock(struct user_struct *user, u32 pages)
 {
-	struct user_struct *user = get_current_user();
-	unsigned long memlock_limit;
+	unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
-	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) {
+		atomic_long_sub(pages, &user->locked_vm);
+		return -EPERM;
+	}
+	return 0;
+}
 
-	atomic_long_add(map->pages, &user->locked_vm);
+static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
+{
+	atomic_long_sub(pages, &user->locked_vm);
+}
+
+static int bpf_map_init_memlock(struct bpf_map *map)
+{
+	struct user_struct *user = get_current_user();
+	int ret;
 
-	if (atomic_long_read(&user->locked_vm) > memlock_limit) {
-		atomic_long_sub(map->pages, &user->locked_vm);
+	ret = bpf_charge_memlock(user, map->pages);
+	if (ret) {
 		free_uid(user);
-		return -EPERM;
+		return ret;
 	}
 	map->user = user;
-	return 0;
+	return ret;
 }
 
-static void bpf_map_uncharge_memlock(struct bpf_map *map)
+static void bpf_map_release_memlock(struct bpf_map *map)
 {
 	struct user_struct *user = map->user;
-
-	atomic_long_sub(map->pages, &user->locked_vm);
+	bpf_uncharge_memlock(user, map->pages);
 	free_uid(user);
 }
 
+int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
+{
+	int ret;
+
+	ret = bpf_charge_memlock(map->user, pages);
+	if (ret)
+		return ret;
+	map->pages += pages;
+	return ret;
+}
+
+void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
+{
+	bpf_uncharge_memlock(map->user, pages);
+	map->pages -= pages;
+}
+
 static int bpf_map_alloc_id(struct bpf_map *map)
 {
 	int id;
@@ -256,7 +284,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
 
-	bpf_map_uncharge_memlock(map);
+	bpf_map_release_memlock(map);
 	security_bpf_map_free(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
@@ -492,7 +520,7 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map_nouncharge;
 
-	err = bpf_map_charge_memlock(map);
+	err = bpf_map_init_memlock(map);
 	if (err)
 		goto free_map_sec;
 
@@ -515,7 +543,7 @@ static int map_create(union bpf_attr *attr)
 	return err;
 
 free_map:
-	bpf_map_uncharge_memlock(map);
+	bpf_map_release_memlock(map);
 free_map_sec:
 	security_bpf_map_free(map);
 free_map_nouncharge:
-- 
cgit v1.2.3


From de9cbbaadba5adf88a19e46df61f7054000838f6 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 2 Aug 2018 14:27:18 -0700
Subject: bpf: introduce cgroup storage maps

This commit introduces BPF_MAP_TYPE_CGROUP_STORAGE maps:
a special type of maps which are implementing the cgroup storage.

>From the userspace point of view it's almost a generic
hash map with the (cgroup inode id, attachment type) pair
used as a key.

The only difference is that some operations are restricted:
  1) a user can't create new entries,
  2) a user can't remove existing entries.

The lookup from userspace is o(log(n)).

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h |  38 +++++
 include/linux/bpf.h        |   1 +
 include/linux/bpf_types.h  |   3 +
 include/uapi/linux/bpf.h   |   6 +
 kernel/bpf/Makefile        |   1 +
 kernel/bpf/local_storage.c | 376 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c       |   3 +
 kernel/bpf/verifier.c      |  12 ++
 8 files changed, 440 insertions(+)
 create mode 100644 kernel/bpf/local_storage.c

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index d50c2f0a655a..7d00d58869ed 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -4,19 +4,39 @@
 
 #include <linux/errno.h>
 #include <linux/jump_label.h>
+#include <linux/rbtree.h>
 #include <uapi/linux/bpf.h>
 
 struct sock;
 struct sockaddr;
 struct cgroup;
 struct sk_buff;
+struct bpf_map;
+struct bpf_prog;
 struct bpf_sock_ops_kern;
+struct bpf_cgroup_storage;
 
 #ifdef CONFIG_CGROUP_BPF
 
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
+struct bpf_cgroup_storage_map;
+
+struct bpf_storage_buffer {
+	struct rcu_head rcu;
+	char data[0];
+};
+
+struct bpf_cgroup_storage {
+	struct bpf_storage_buffer *buf;
+	struct bpf_cgroup_storage_map *map;
+	struct bpf_cgroup_storage_key key;
+	struct list_head list;
+	struct rb_node node;
+	struct rcu_head rcu;
+};
+
 struct bpf_prog_list {
 	struct list_head node;
 	struct bpf_prog *prog;
@@ -77,6 +97,15 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 				      short access, enum bpf_attach_type type);
 
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog);
+void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
+void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
+			     struct cgroup *cgroup,
+			     enum bpf_attach_type type);
+void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
+int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
+void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -221,6 +250,15 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
 	return -EINVAL;
 }
 
+static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
+					    struct bpf_map *map) { return 0; }
+static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
+					      struct bpf_map *map) {}
+static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
+	struct bpf_prog *prog) { return 0; }
+static inline void bpf_cgroup_storage_free(
+	struct bpf_cgroup_storage *storage) {}
+
 #define cgroup_bpf_enabled (0)
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5a4a256473c3..9d1e4727495e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -282,6 +282,7 @@ struct bpf_prog_aux {
 	struct bpf_prog *prog;
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
+	struct bpf_map *cgroup_storage;
 	char name[BPF_OBJ_NAME_LEN];
 #ifdef CONFIG_SECURITY
 	void *security;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index c5700c2d5549..add08be53b6f 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -37,6 +37,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops)
 #ifdef CONFIG_CGROUPS
 BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
 #endif
+#ifdef CONFIG_CGROUP_BPF
+BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
+#endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0ebaaf7f3568..b10118ee5afe 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -75,6 +75,11 @@ struct bpf_lpm_trie_key {
 	__u8	data[0];	/* Arbitrary size */
 };
 
+struct bpf_cgroup_storage_key {
+	__u64	cgroup_inode_id;	/* cgroup inode id */
+	__u32	attach_type;		/* program attach type */
+};
+
 /* BPF syscall commands, see bpf(2) man-page for details. */
 enum bpf_cmd {
 	BPF_MAP_CREATE,
@@ -120,6 +125,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_CPUMAP,
 	BPF_MAP_TYPE_XSKMAP,
 	BPF_MAP_TYPE_SOCKHASH,
+	BPF_MAP_TYPE_CGROUP_STORAGE,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f27f5496d6fe..e8906cbad81f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,6 +3,7 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
new file mode 100644
index 000000000000..f23d3fdeba23
--- /dev/null
+++ b/kernel/bpf/local_storage.c
@@ -0,0 +1,376 @@
+//SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf-cgroup.h>
+#include <linux/bpf.h>
+#include <linux/bug.h>
+#include <linux/filter.h>
+#include <linux/mm.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+
+#ifdef CONFIG_CGROUP_BPF
+
+#define LOCAL_STORAGE_CREATE_FLAG_MASK					\
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+struct bpf_cgroup_storage_map {
+	struct bpf_map map;
+
+	spinlock_t lock;
+	struct bpf_prog *prog;
+	struct rb_root root;
+	struct list_head list;
+};
+
+static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map)
+{
+	return container_of(map, struct bpf_cgroup_storage_map, map);
+}
+
+static int bpf_cgroup_storage_key_cmp(
+	const struct bpf_cgroup_storage_key *key1,
+	const struct bpf_cgroup_storage_key *key2)
+{
+	if (key1->cgroup_inode_id < key2->cgroup_inode_id)
+		return -1;
+	else if (key1->cgroup_inode_id > key2->cgroup_inode_id)
+		return 1;
+	else if (key1->attach_type < key2->attach_type)
+		return -1;
+	else if (key1->attach_type > key2->attach_type)
+		return 1;
+	return 0;
+}
+
+static struct bpf_cgroup_storage *cgroup_storage_lookup(
+	struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key,
+	bool locked)
+{
+	struct rb_root *root = &map->root;
+	struct rb_node *node;
+
+	if (!locked)
+		spin_lock_bh(&map->lock);
+
+	node = root->rb_node;
+	while (node) {
+		struct bpf_cgroup_storage *storage;
+
+		storage = container_of(node, struct bpf_cgroup_storage, node);
+
+		switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) {
+		case -1:
+			node = node->rb_left;
+			break;
+		case 1:
+			node = node->rb_right;
+			break;
+		default:
+			if (!locked)
+				spin_unlock_bh(&map->lock);
+			return storage;
+		}
+	}
+
+	if (!locked)
+		spin_unlock_bh(&map->lock);
+
+	return NULL;
+}
+
+static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
+				 struct bpf_cgroup_storage *storage)
+{
+	struct rb_root *root = &map->root;
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	while (*new) {
+		struct bpf_cgroup_storage *this;
+
+		this = container_of(*new, struct bpf_cgroup_storage, node);
+
+		parent = *new;
+		switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) {
+		case -1:
+			new = &((*new)->rb_left);
+			break;
+		case 1:
+			new = &((*new)->rb_right);
+			break;
+		default:
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(&storage->node, parent, new);
+	rb_insert_color(&storage->node, root);
+
+	return 0;
+}
+
+static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+
+	storage = cgroup_storage_lookup(map, key, false);
+	if (!storage)
+		return NULL;
+
+	return &READ_ONCE(storage->buf)->data[0];
+}
+
+static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
+				      void *value, u64 flags)
+{
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	struct bpf_storage_buffer *new;
+
+	if (flags & BPF_NOEXIST)
+		return -EINVAL;
+
+	storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
+					key, false);
+	if (!storage)
+		return -ENOENT;
+
+	new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
+			   map->value_size, __GFP_ZERO | GFP_USER,
+			   map->numa_node);
+	if (!new)
+		return -ENOMEM;
+
+	memcpy(&new->data[0], value, map->value_size);
+
+	new = xchg(&storage->buf, new);
+	kfree_rcu(new, rcu);
+
+	return 0;
+}
+
+static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
+				       void *_next_key)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage_key *next = _next_key;
+	struct bpf_cgroup_storage *storage;
+
+	spin_lock_bh(&map->lock);
+
+	if (list_empty(&map->list))
+		goto enoent;
+
+	if (key) {
+		storage = cgroup_storage_lookup(map, key, true);
+		if (!storage)
+			goto enoent;
+
+		storage = list_next_entry(storage, list);
+		if (!storage)
+			goto enoent;
+	} else {
+		storage = list_first_entry(&map->list,
+					 struct bpf_cgroup_storage, list);
+	}
+
+	spin_unlock_bh(&map->lock);
+	next->attach_type = storage->key.attach_type;
+	next->cgroup_inode_id = storage->key.cgroup_inode_id;
+	return 0;
+
+enoent:
+	spin_unlock_bh(&map->lock);
+	return -ENOENT;
+}
+
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+	int numa_node = bpf_map_attr_numa_node(attr);
+	struct bpf_cgroup_storage_map *map;
+
+	if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
+		return ERR_PTR(-EINVAL);
+
+	if (attr->value_size > PAGE_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK)
+		/* reserved bits should not be used */
+		return ERR_PTR(-EINVAL);
+
+	if (attr->max_entries)
+		/* max_entries is not used and enforced to be 0 */
+		return ERR_PTR(-EINVAL);
+
+	map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
+			   __GFP_ZERO | GFP_USER, numa_node);
+	if (!map)
+		return ERR_PTR(-ENOMEM);
+
+	map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map),
+				  PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* copy mandatory map attributes */
+	bpf_map_init_from_attr(&map->map, attr);
+
+	spin_lock_init(&map->lock);
+	map->root = RB_ROOT;
+	INIT_LIST_HEAD(&map->list);
+
+	return &map->map;
+}
+
+static void cgroup_storage_map_free(struct bpf_map *_map)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+
+	WARN_ON(!RB_EMPTY_ROOT(&map->root));
+	WARN_ON(!list_empty(&map->list));
+
+	kfree(map);
+}
+
+static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+{
+	return -EINVAL;
+}
+
+const struct bpf_map_ops cgroup_storage_map_ops = {
+	.map_alloc = cgroup_storage_map_alloc,
+	.map_free = cgroup_storage_map_free,
+	.map_get_next_key = cgroup_storage_get_next_key,
+	.map_lookup_elem = cgroup_storage_lookup_elem,
+	.map_update_elem = cgroup_storage_update_elem,
+	.map_delete_elem = cgroup_storage_delete_elem,
+};
+
+int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	int ret = -EBUSY;
+
+	spin_lock_bh(&map->lock);
+
+	if (map->prog && map->prog != prog)
+		goto unlock;
+	if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map)
+		goto unlock;
+
+	map->prog = prog;
+	prog->aux->cgroup_storage = _map;
+	ret = 0;
+unlock:
+	spin_unlock_bh(&map->lock);
+
+	return ret;
+}
+
+void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+
+	spin_lock_bh(&map->lock);
+	if (map->prog == prog) {
+		WARN_ON(prog->aux->cgroup_storage != _map);
+		map->prog = NULL;
+		prog->aux->cgroup_storage = NULL;
+	}
+	spin_unlock_bh(&map->lock);
+}
+
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog)
+{
+	struct bpf_cgroup_storage *storage;
+	struct bpf_map *map;
+	u32 pages;
+
+	map = prog->aux->cgroup_storage;
+	if (!map)
+		return NULL;
+
+	pages = round_up(sizeof(struct bpf_cgroup_storage) +
+			 sizeof(struct bpf_storage_buffer) +
+			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+	if (bpf_map_charge_memlock(map, pages))
+		return ERR_PTR(-EPERM);
+
+	storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
+			       __GFP_ZERO | GFP_USER, map->numa_node);
+	if (!storage) {
+		bpf_map_uncharge_memlock(map, pages);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
+				    map->value_size, __GFP_ZERO | GFP_USER,
+				    map->numa_node);
+	if (!storage->buf) {
+		bpf_map_uncharge_memlock(map, pages);
+		kfree(storage);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	storage->map = (struct bpf_cgroup_storage_map *)map;
+
+	return storage;
+}
+
+void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
+{
+	u32 pages;
+	struct bpf_map *map;
+
+	if (!storage)
+		return;
+
+	map = &storage->map->map;
+	pages = round_up(sizeof(struct bpf_cgroup_storage) +
+			 sizeof(struct bpf_storage_buffer) +
+			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+	bpf_map_uncharge_memlock(map, pages);
+
+	kfree_rcu(storage->buf, rcu);
+	kfree_rcu(storage, rcu);
+}
+
+void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
+			     struct cgroup *cgroup,
+			     enum bpf_attach_type type)
+{
+	struct bpf_cgroup_storage_map *map;
+
+	if (!storage)
+		return;
+
+	storage->key.attach_type = type;
+	storage->key.cgroup_inode_id = cgroup->kn->id.id;
+
+	map = storage->map;
+
+	spin_lock_bh(&map->lock);
+	WARN_ON(cgroup_storage_insert(map, storage));
+	list_add(&storage->list, &map->list);
+	spin_unlock_bh(&map->lock);
+}
+
+void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage)
+{
+	struct bpf_cgroup_storage_map *map;
+	struct rb_root *root;
+
+	if (!storage)
+		return;
+
+	map = storage->map;
+
+	spin_lock_bh(&map->lock);
+	root = &map->root;
+	rb_erase(&storage->node, root);
+
+	list_del(&storage->list);
+	spin_unlock_bh(&map->lock);
+}
+
+#endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7958252a4d29..5af4e9e2722d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -957,6 +957,9 @@ static void free_used_maps(struct bpf_prog_aux *aux)
 {
 	int i;
 
+	if (aux->cgroup_storage)
+		bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage);
+
 	for (i = 0; i < aux->used_map_cnt; i++)
 		bpf_map_put(aux->used_maps[i]);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e948303a0ea8..7e75434a9e54 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5154,6 +5154,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 			}
 			env->used_maps[env->used_map_cnt++] = map;
 
+			if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
+			    bpf_cgroup_storage_assign(env->prog, map)) {
+				verbose(env,
+					"only one cgroup storage is allowed\n");
+				fdput(f);
+				return -EBUSY;
+			}
+
 			fdput(f);
 next_insn:
 			insn++;
@@ -5180,6 +5188,10 @@ static void release_maps(struct bpf_verifier_env *env)
 {
 	int i;
 
+	if (env->prog->aux->cgroup_storage)
+		bpf_cgroup_storage_release(env->prog,
+					   env->prog->aux->cgroup_storage);
+
 	for (i = 0; i < env->used_map_cnt; i++)
 		bpf_map_put(env->used_maps[i]);
 }
-- 
cgit v1.2.3


From aa0ad5b0391e268bdecf6cda31268388844f8afd Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 2 Aug 2018 14:27:19 -0700
Subject: bpf: pass a pointer to a cgroup storage using pcpu variable

This commit introduces the bpf_cgroup_storage_set() helper,
which will be used to pass a pointer to a cgroup storage
to the bpf helper.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 15 +++++++++++++++
 kernel/bpf/local_storage.c |  2 ++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 7d00d58869ed..9a144ddbbc8f 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -4,6 +4,7 @@
 
 #include <linux/errno.h>
 #include <linux/jump_label.h>
+#include <linux/percpu.h>
 #include <linux/rbtree.h>
 #include <uapi/linux/bpf.h>
 
@@ -21,6 +22,8 @@ struct bpf_cgroup_storage;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
+DECLARE_PER_CPU(void*, bpf_cgroup_storage);
+
 struct bpf_cgroup_storage_map;
 
 struct bpf_storage_buffer {
@@ -97,6 +100,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 				      short access, enum bpf_attach_type type);
 
+static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage)
+{
+	struct bpf_storage_buffer *buf;
+
+	if (!storage)
+		return;
+
+	buf = READ_ONCE(storage->buf);
+	this_cpu_write(bpf_cgroup_storage, &buf->data[0]);
+}
+
 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog);
 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
@@ -250,6 +264,7 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
 	return -EINVAL;
 }
 
+static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {}
 static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
 					    struct bpf_map *map) { return 0; }
 static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index f23d3fdeba23..fc4e37f68f2a 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -7,6 +7,8 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 
+DEFINE_PER_CPU(void*, bpf_cgroup_storage);
+
 #ifdef CONFIG_CGROUP_BPF
 
 #define LOCAL_STORAGE_CREATE_FLAG_MASK					\
-- 
cgit v1.2.3


From d7bf2c10af053191e931b58704cd862fccb7f9de Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 2 Aug 2018 14:27:20 -0700
Subject: bpf: allocate cgroup storage entries on attaching bpf programs

If a bpf program is using cgroup local storage, allocate
a bpf_cgroup_storage structure automatically on attaching the program
to a cgroup and save the pointer into the corresponding bpf_prog_list
entry.
Analogically, release the cgroup local storage on detaching
of the bpf program.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h |  1 +
 kernel/bpf/cgroup.c        | 35 +++++++++++++++++++++++++++++++----
 2 files changed, 32 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 9a144ddbbc8f..f91b0f8ff3a9 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -43,6 +43,7 @@ struct bpf_cgroup_storage {
 struct bpf_prog_list {
 	struct list_head node;
 	struct bpf_prog *prog;
+	struct bpf_cgroup_storage *storage;
 };
 
 struct bpf_prog_array;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index badabb0b435c..935274c86bfe 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -34,6 +34,8 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 		list_for_each_entry_safe(pl, tmp, progs, node) {
 			list_del(&pl->node);
 			bpf_prog_put(pl->prog);
+			bpf_cgroup_storage_unlink(pl->storage);
+			bpf_cgroup_storage_free(pl->storage);
 			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
@@ -188,6 +190,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 {
 	struct list_head *progs = &cgrp->bpf.progs[type];
 	struct bpf_prog *old_prog = NULL;
+	struct bpf_cgroup_storage *storage, *old_storage = NULL;
 	struct cgroup_subsys_state *css;
 	struct bpf_prog_list *pl;
 	bool pl_was_allocated;
@@ -210,31 +213,47 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
 		return -E2BIG;
 
+	storage = bpf_cgroup_storage_alloc(prog);
+	if (IS_ERR(storage))
+		return -ENOMEM;
+
 	if (flags & BPF_F_ALLOW_MULTI) {
-		list_for_each_entry(pl, progs, node)
-			if (pl->prog == prog)
+		list_for_each_entry(pl, progs, node) {
+			if (pl->prog == prog) {
 				/* disallow attaching the same prog twice */
+				bpf_cgroup_storage_free(storage);
 				return -EINVAL;
+			}
+		}
 
 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
-		if (!pl)
+		if (!pl) {
+			bpf_cgroup_storage_free(storage);
 			return -ENOMEM;
+		}
+
 		pl_was_allocated = true;
 		pl->prog = prog;
+		pl->storage = storage;
 		list_add_tail(&pl->node, progs);
 	} else {
 		if (list_empty(progs)) {
 			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
-			if (!pl)
+			if (!pl) {
+				bpf_cgroup_storage_free(storage);
 				return -ENOMEM;
+			}
 			pl_was_allocated = true;
 			list_add_tail(&pl->node, progs);
 		} else {
 			pl = list_first_entry(progs, typeof(*pl), node);
 			old_prog = pl->prog;
+			old_storage = pl->storage;
+			bpf_cgroup_storage_unlink(old_storage);
 			pl_was_allocated = false;
 		}
 		pl->prog = prog;
+		pl->storage = storage;
 	}
 
 	cgrp->bpf.flags[type] = flags;
@@ -257,10 +276,13 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 	}
 
 	static_branch_inc(&cgroup_bpf_enabled_key);
+	if (old_storage)
+		bpf_cgroup_storage_free(old_storage);
 	if (old_prog) {
 		bpf_prog_put(old_prog);
 		static_branch_dec(&cgroup_bpf_enabled_key);
 	}
+	bpf_cgroup_storage_link(storage, cgrp, type);
 	return 0;
 
 cleanup:
@@ -276,6 +298,9 @@ cleanup:
 
 	/* and cleanup the prog list */
 	pl->prog = old_prog;
+	bpf_cgroup_storage_free(pl->storage);
+	pl->storage = old_storage;
+	bpf_cgroup_storage_link(old_storage, cgrp, type);
 	if (pl_was_allocated) {
 		list_del(&pl->node);
 		kfree(pl);
@@ -356,6 +381,8 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 
 	/* now can actually delete it from this cgroup list */
 	list_del(&pl->node);
+	bpf_cgroup_storage_unlink(pl->storage);
+	bpf_cgroup_storage_free(pl->storage);
 	kfree(pl);
 	if (list_empty(progs))
 		/* last program was detached, reset flags to zero */
-- 
cgit v1.2.3


From 394e40a29788820c9c0526b1c3497c9e0ec2a126 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 2 Aug 2018 14:27:21 -0700
Subject: bpf: extend bpf_prog_array to store pointers to the cgroup storage

This patch converts bpf_prog_array from an array of prog pointers
to the array of struct bpf_prog_array_item elements.

This allows to save a cgroup storage pointer for each bpf program
efficiently attached to a cgroup.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/media/rc/bpf-lirc.c | 10 +++---
 include/linux/bpf.h         | 19 ++++++++----
 kernel/bpf/cgroup.c         | 21 +++++++------
 kernel/bpf/core.c           | 76 +++++++++++++++++++++++----------------------
 4 files changed, 70 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index fcfab6635f9c..8c26df9b96c1 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -195,14 +195,16 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample)
  */
 void lirc_bpf_free(struct rc_dev *rcdev)
 {
-	struct bpf_prog **progs;
+	struct bpf_prog_array_item *item;
 
 	if (!rcdev->raw->progs)
 		return;
 
-	progs = rcu_dereference(rcdev->raw->progs)->progs;
-	while (*progs)
-		bpf_prog_put(*progs++);
+	item = rcu_dereference(rcdev->raw->progs)->items;
+	while (item->prog) {
+		bpf_prog_put(item->prog);
+		item++;
+	}
 
 	bpf_prog_array_free(rcdev->raw->progs);
 }
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9d1e4727495e..16be67888c30 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -349,9 +349,14 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
  * The 'struct bpf_prog_array *' should only be replaced with xchg()
  * since other cpus are walking the array of pointers in parallel.
  */
+struct bpf_prog_array_item {
+	struct bpf_prog *prog;
+	struct bpf_cgroup_storage *cgroup_storage;
+};
+
 struct bpf_prog_array {
 	struct rcu_head rcu;
-	struct bpf_prog *progs[0];
+	struct bpf_prog_array_item items[0];
 };
 
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
@@ -372,7 +377,8 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 
 #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null)	\
 	({						\
-		struct bpf_prog **_prog, *__prog;	\
+		struct bpf_prog_array_item *_item;	\
+		struct bpf_prog *_prog;			\
 		struct bpf_prog_array *_array;		\
 		u32 _ret = 1;				\
 		preempt_disable();			\
@@ -380,10 +386,11 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 		_array = rcu_dereference(array);	\
 		if (unlikely(check_non_null && !_array))\
 			goto _out;			\
-		_prog = _array->progs;			\
-		while ((__prog = READ_ONCE(*_prog))) {	\
-			_ret &= func(__prog, ctx);	\
-			_prog++;			\
+		_item = &_array->items[0];		\
+		while ((_prog = READ_ONCE(_item->prog))) {		\
+			bpf_cgroup_storage_set(_item->cgroup_storage);	\
+			_ret &= func(_prog, ctx);	\
+			_item++;			\
 		}					\
 _out:							\
 		rcu_read_unlock();			\
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 935274c86bfe..ddfa6cc13e57 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -117,15 +117,18 @@ static int compute_effective_progs(struct cgroup *cgrp,
 	cnt = 0;
 	p = cgrp;
 	do {
-		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
-			list_for_each_entry(pl,
-					    &p->bpf.progs[type], node) {
-				if (!pl->prog)
-					continue;
-				progs->progs[cnt++] = pl->prog;
-			}
-		p = cgroup_parent(p);
-	} while (p);
+		if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			continue;
+
+		list_for_each_entry(pl, &p->bpf.progs[type], node) {
+			if (!pl->prog)
+				continue;
+
+			progs->items[cnt].prog = pl->prog;
+			progs->items[cnt].cgroup_storage = pl->storage;
+			cnt++;
+		}
+	} while ((p = cgroup_parent(p)));
 
 	rcu_assign_pointer(*array, progs);
 	return 0;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 253aa8e79c7b..9abcf25ebf9f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1542,7 +1542,8 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
 {
 	if (prog_cnt)
 		return kzalloc(sizeof(struct bpf_prog_array) +
-			       sizeof(struct bpf_prog *) * (prog_cnt + 1),
+			       sizeof(struct bpf_prog_array_item) *
+			       (prog_cnt + 1),
 			       flags);
 
 	return &empty_prog_array.hdr;
@@ -1556,43 +1557,45 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
 	kfree_rcu(progs, rcu);
 }
 
-int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+int bpf_prog_array_length(struct bpf_prog_array __rcu *array)
 {
-	struct bpf_prog **prog;
+	struct bpf_prog_array_item *item;
 	u32 cnt = 0;
 
 	rcu_read_lock();
-	prog = rcu_dereference(progs)->progs;
-	for (; *prog; prog++)
-		if (*prog != &dummy_bpf_prog.prog)
+	item = rcu_dereference(array)->items;
+	for (; item->prog; item++)
+		if (item->prog != &dummy_bpf_prog.prog)
 			cnt++;
 	rcu_read_unlock();
 	return cnt;
 }
 
-static bool bpf_prog_array_copy_core(struct bpf_prog **prog,
+
+static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
 				     u32 *prog_ids,
 				     u32 request_cnt)
 {
+	struct bpf_prog_array_item *item;
 	int i = 0;
 
-	for (; *prog; prog++) {
-		if (*prog == &dummy_bpf_prog.prog)
+	item = rcu_dereference(array)->items;
+	for (; item->prog; item++) {
+		if (item->prog == &dummy_bpf_prog.prog)
 			continue;
-		prog_ids[i] = (*prog)->aux->id;
+		prog_ids[i] = item->prog->aux->id;
 		if (++i == request_cnt) {
-			prog++;
+			item++;
 			break;
 		}
 	}
 
-	return !!(*prog);
+	return !!(item->prog);
 }
 
-int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
 				__u32 __user *prog_ids, u32 cnt)
 {
-	struct bpf_prog **prog;
 	unsigned long err = 0;
 	bool nospc;
 	u32 *ids;
@@ -1611,8 +1614,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 	if (!ids)
 		return -ENOMEM;
 	rcu_read_lock();
-	prog = rcu_dereference(progs)->progs;
-	nospc = bpf_prog_array_copy_core(prog, ids, cnt);
+	nospc = bpf_prog_array_copy_core(array, ids, cnt);
 	rcu_read_unlock();
 	err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
 	kfree(ids);
@@ -1623,14 +1625,14 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 	return 0;
 }
 
-void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array,
 				struct bpf_prog *old_prog)
 {
-	struct bpf_prog **prog = progs->progs;
+	struct bpf_prog_array_item *item = array->items;
 
-	for (; *prog; prog++)
-		if (*prog == old_prog) {
-			WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+	for (; item->prog; item++)
+		if (item->prog == old_prog) {
+			WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
 			break;
 		}
 }
@@ -1641,7 +1643,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 			struct bpf_prog_array **new_array)
 {
 	int new_prog_cnt, carry_prog_cnt = 0;
-	struct bpf_prog **existing_prog;
+	struct bpf_prog_array_item *existing;
 	struct bpf_prog_array *array;
 	bool found_exclude = false;
 	int new_prog_idx = 0;
@@ -1650,15 +1652,15 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 	 * the new array.
 	 */
 	if (old_array) {
-		existing_prog = old_array->progs;
-		for (; *existing_prog; existing_prog++) {
-			if (*existing_prog == exclude_prog) {
+		existing = old_array->items;
+		for (; existing->prog; existing++) {
+			if (existing->prog == exclude_prog) {
 				found_exclude = true;
 				continue;
 			}
-			if (*existing_prog != &dummy_bpf_prog.prog)
+			if (existing->prog != &dummy_bpf_prog.prog)
 				carry_prog_cnt++;
-			if (*existing_prog == include_prog)
+			if (existing->prog == include_prog)
 				return -EEXIST;
 		}
 	}
@@ -1684,15 +1686,17 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 
 	/* Fill in the new prog array */
 	if (carry_prog_cnt) {
-		existing_prog = old_array->progs;
-		for (; *existing_prog; existing_prog++)
-			if (*existing_prog != exclude_prog &&
-			    *existing_prog != &dummy_bpf_prog.prog)
-				array->progs[new_prog_idx++] = *existing_prog;
+		existing = old_array->items;
+		for (; existing->prog; existing++)
+			if (existing->prog != exclude_prog &&
+			    existing->prog != &dummy_bpf_prog.prog) {
+				array->items[new_prog_idx++].prog =
+					existing->prog;
+			}
 	}
 	if (include_prog)
-		array->progs[new_prog_idx++] = include_prog;
-	array->progs[new_prog_idx] = NULL;
+		array->items[new_prog_idx++].prog = include_prog;
+	array->items[new_prog_idx].prog = NULL;
 	*new_array = array;
 	return 0;
 }
@@ -1701,7 +1705,6 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
 			     u32 *prog_ids, u32 request_cnt,
 			     u32 *prog_cnt)
 {
-	struct bpf_prog **prog;
 	u32 cnt = 0;
 
 	if (array)
@@ -1714,8 +1717,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
 		return 0;
 
 	/* this function is called under trace/bpf_trace.c: bpf_event_mutex */
-	prog = rcu_dereference_check(array, 1)->progs;
-	return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC
+	return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC
 								     : 0;
 }
 
-- 
cgit v1.2.3


From 3e6a4b3e0289dc9540a2c1d8a20657f4707fbabb Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 2 Aug 2018 14:27:22 -0700
Subject: bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE

BPF_MAP_TYPE_CGROUP_STORAGE maps are special in a way
that the access from the bpf program side is lookup-free.
That means the result is guaranteed to be a valid
pointer to the cgroup storage; no NULL-check is required.

This patch introduces BPF_PTR_TO_MAP_VALUE return type,
which is required to cause the verifier accept programs,
which are not checking the map value pointer for being NULL.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h   | 1 +
 kernel/bpf/verifier.c | 8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 16be67888c30..ca4ac2a39def 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -155,6 +155,7 @@ enum bpf_arg_type {
 enum bpf_return_type {
 	RET_INTEGER,			/* function returns integer */
 	RET_VOID,			/* function doesn't return anything */
+	RET_PTR_TO_MAP_VALUE,		/* returns a pointer to map elem value */
 	RET_PTR_TO_MAP_VALUE_OR_NULL,	/* returns a pointer to map elem value or NULL */
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7e75434a9e54..1ede16c8bb40 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2545,8 +2545,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		mark_reg_unknown(env, regs, BPF_REG_0);
 	} else if (fn->ret_type == RET_VOID) {
 		regs[BPF_REG_0].type = NOT_INIT;
-	} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
-		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+	} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
+		   fn->ret_type == RET_PTR_TO_MAP_VALUE) {
+		if (fn->ret_type == RET_PTR_TO_MAP_VALUE)
+			regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
+		else
+			regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
 		/* There is no offset yet applied, variable or fixed */
 		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].off = 0;
-- 
cgit v1.2.3


From cd3394317653837e2eb5c5d0904a8996102af9fc Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 2 Aug 2018 14:27:24 -0700
Subject: bpf: introduce the bpf_get_local_storage() helper function

The bpf_get_local_storage() helper function is used
to get a pointer to the bpf local storage from a bpf program.

It takes a pointer to a storage map and flags as arguments.
Right now it accepts only cgroup storage maps, and flags
argument has to be 0. Further it can be extended to support
other types of local storage: e.g. thread local storage etc.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h      |  2 ++
 include/uapi/linux/bpf.h | 21 ++++++++++++++++++++-
 kernel/bpf/cgroup.c      |  2 ++
 kernel/bpf/core.c        |  1 +
 kernel/bpf/helpers.c     | 20 ++++++++++++++++++++
 kernel/bpf/verifier.c    | 18 ++++++++++++++++++
 net/core/filter.c        | 23 ++++++++++++++++++++++-
 7 files changed, 85 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ca4ac2a39def..cd8790d2c6ed 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -788,6 +788,8 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
 
+extern const struct bpf_func_proto bpf_get_local_storage_proto;
+
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b10118ee5afe..dd5758dc35d3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2095,6 +2095,24 @@ union bpf_attr {
  * 	Return
  * 		A 64-bit integer containing the current cgroup id based
  * 		on the cgroup within which the current task is running.
+ *
+ * void* get_local_storage(void *map, u64 flags)
+ *	Description
+ *		Get the pointer to the local storage area.
+ *		The type and the size of the local storage is defined
+ *		by the *map* argument.
+ *		The *flags* meaning is specific for each map type,
+ *		and has to be 0 for cgroup local storage.
+ *
+ *		Depending on the bpf program type, a local storage area
+ *		can be shared between multiple instances of the bpf program,
+ *		running simultaneously.
+ *
+ *		A user should care about the synchronization by himself.
+ *		For example, by using the BPF_STX_XADD instruction to alter
+ *		the shared data.
+ *	Return
+ *		Pointer to the local storage area.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2177,7 +2195,8 @@ union bpf_attr {
 	FN(rc_repeat),			\
 	FN(rc_keydown),			\
 	FN(skb_cgroup_id),		\
-	FN(get_current_cgroup_id),
+	FN(get_current_cgroup_id),	\
+	FN(get_local_storage),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ddfa6cc13e57..0a4fe5a7dc91 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -684,6 +684,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_map_delete_elem_proto;
 	case BPF_FUNC_get_current_uid_gid:
 		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
 	case BPF_FUNC_trace_printk:
 		if (capable(CAP_SYS_ADMIN))
 			return bpf_get_trace_printk_proto();
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9abcf25ebf9f..4d09e610777f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1795,6 +1795,7 @@ const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
 const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
 const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
+const struct bpf_func_proto bpf_get_local_storage_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 73065e2d23c2..1991466b8327 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -193,4 +193,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 };
+
+DECLARE_PER_CPU(void*, bpf_cgroup_storage);
+
+BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
+{
+	/* map and flags arguments are not used now,
+	 * but provide an ability to extend the API
+	 * for other types of local storages.
+	 * verifier checks that their values are correct.
+	 */
+	return (unsigned long) this_cpu_read(bpf_cgroup_storage);
+}
+
+const struct bpf_func_proto bpf_get_local_storage_proto = {
+	.func		= bpf_get_local_storage,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_MAP_VALUE,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_ANYTHING,
+};
 #endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1ede16c8bb40..587468a9c37d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2127,6 +2127,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_current_task_under_cgroup)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_CGROUP_STORAGE:
+		if (func_id != BPF_FUNC_get_local_storage)
+			goto error;
+		break;
 	/* devmap returns a pointer to a live net_device ifindex that we cannot
 	 * allow to be modified from bpf side. So do not allow lookup elements
 	 * for now.
@@ -2209,6 +2213,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
 			goto error;
 		break;
+	case BPF_FUNC_get_local_storage:
+		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2533,6 +2541,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	}
 
 	regs = cur_regs(env);
+
+	/* check that flags argument in get_local_storage(map, flags) is 0,
+	 * this is required because get_local_storage() can't return an error.
+	 */
+	if (func_id == BPF_FUNC_get_local_storage &&
+	    !register_is_null(&regs[BPF_REG_2])) {
+		verbose(env, "get_local_storage() doesn't support non-zero flags\n");
+		return -EINVAL;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
diff --git a/net/core/filter.c b/net/core/filter.c
index 9bb9a4488e25..9f73aae2f089 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4820,6 +4820,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	 */
 	case BPF_FUNC_get_current_uid_gid:
 		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4844,6 +4846,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		}
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_cookie_sock_addr_proto;
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4866,6 +4870,17 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
+	default:
+		return sk_filter_func_proto(func_id, prog);
+	}
+}
+
 static const struct bpf_func_proto *
 tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -4988,6 +5003,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sock_hash_update_proto;
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_cookie_sock_ops_proto;
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -5007,6 +5024,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_msg_cork_bytes_proto;
 	case BPF_FUNC_msg_pull_data:
 		return &bpf_msg_pull_data_proto;
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -5034,6 +5053,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_redirect_map_proto;
 	case BPF_FUNC_sk_redirect_hash:
 		return &bpf_sk_redirect_hash_proto;
+	case BPF_FUNC_get_local_storage:
+		return &bpf_get_local_storage_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6838,7 +6859,7 @@ const struct bpf_prog_ops xdp_prog_ops = {
 };
 
 const struct bpf_verifier_ops cg_skb_verifier_ops = {
-	.get_func_proto		= sk_filter_func_proto,
+	.get_func_proto		= cg_skb_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 };
-- 
cgit v1.2.3


From 401c0d7712eb3189023efc9c0708a2ac984ed62e Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@bootlin.com>
Date: Fri, 3 Aug 2018 11:50:39 +0200
Subject: spi: spi-mem: Constify spi_mem->name

There is no reason to make spi_mem->name modifiable. Moreover,
spi_mem_ops->get_name() returns a const char *, which generates a gcc
warning when assigning the value returned by spi_mem_ops->get_name()
to spi_mem->name.

Fixes: 5d27a9c8ea9e ("spi: spi-mem: Extend the SPI mem interface to set a custom memory name")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi-mem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
index 0d64ccc4e584..62722fb7472d 100644
--- a/include/linux/spi/spi-mem.h
+++ b/include/linux/spi/spi-mem.h
@@ -134,7 +134,7 @@ struct spi_mem_op {
 struct spi_mem {
 	struct spi_device *spi;
 	void *drvpriv;
-	char *name;
+	const char *name;
 };
 
 /**
-- 
cgit v1.2.3


From 2ad5157650b446f14a719280ba3670303bc4f439 Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Wed, 11 Jul 2018 18:42:11 -0500
Subject: mailbox/omap: switch to SPDX license identifier

Use the appropriate SPDX license identifier in the OMAP Mailbox
driver source files and drop the previous boilerplate license text.

Signed-off-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/omap-mailbox.c | 10 +---------
 include/linux/omap-mailbox.h   |  5 +----
 2 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c
index e1e2c085e68e..97c7d9b7f46f 100644
--- a/drivers/mailbox/omap-mailbox.c
+++ b/drivers/mailbox/omap-mailbox.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * OMAP mailbox driver
  *
@@ -6,15 +7,6 @@
  *
  * Contact: Hiroshi DOYU <Hiroshi.DOYU@nokia.com>
  *          Suman Anna <s-anna@ti.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
  */
 
 #include <linux/interrupt.h>
diff --git a/include/linux/omap-mailbox.h b/include/linux/omap-mailbox.h
index c726bd833761..6dbcd2da0332 100644
--- a/include/linux/omap-mailbox.h
+++ b/include/linux/omap-mailbox.h
@@ -1,9 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * omap-mailbox: interprocessor communication module for OMAP
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #ifndef OMAP_MAILBOX_H
-- 
cgit v1.2.3


From 623a6143a845bd485b00ba684f0ccef11835edab Mon Sep 17 00:00:00 2001
From: Houlong Wei <houlong.wei@mediatek.com>
Date: Wed, 25 Jul 2018 09:26:40 +0800
Subject: mailbox: mediatek: Add Mediatek CMDQ driver

This patch is first version of Mediatek Command Queue(CMDQ) driver. The
CMDQ is used to help write registers with critical time limitation,
such as updating display configuration during the vblank. It controls
Global Command Engine (GCE) hardware to achieve this requirement.
Currently, CMDQ only supports display related hardwares, but we expect
it can be extended to other hardwares for future requirements.

Signed-off-by: Houlong Wei <houlong.wei@mediatek.com>
Signed-off-by: HS Liao <hs.liao@mediatek.com>
Signed-off-by: CK Hu <ck.hu@mediatek.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/Kconfig                  |  10 +
 drivers/mailbox/Makefile                 |   2 +
 drivers/mailbox/mtk-cmdq-mailbox.c       | 569 +++++++++++++++++++++++++++++++
 include/linux/mailbox/mtk-cmdq-mailbox.h |  77 +++++
 4 files changed, 658 insertions(+)
 create mode 100644 drivers/mailbox/mtk-cmdq-mailbox.c
 create mode 100644 include/linux/mailbox/mtk-cmdq-mailbox.h

(limited to 'include/linux')

diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index e63d29a95e76..2bbabc907add 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -189,4 +189,14 @@ config STM32_IPCC
 	  Mailbox implementation for STMicroelectonics STM32 family chips
 	  with hardware for Inter-Processor Communication Controller (IPCC)
 	  between processors. Say Y here if you want to have this support.
+
+config MTK_CMDQ_MBOX
+	tristate "MediaTek CMDQ Mailbox Support"
+	depends on ARCH_MEDIATEK || COMPILE_TEST
+	select MTK_INFRACFG
+	help
+	  Say yes here to add support for the MediaTek Command Queue (CMDQ)
+	  mailbox driver. The CMDQ is used to help read/write registers with
+	  critical time limitation, such as updating display configuration
+	  during the vblank.
 endif
diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile
index 4d501bea7863..4b0080480661 100644
--- a/drivers/mailbox/Makefile
+++ b/drivers/mailbox/Makefile
@@ -40,3 +40,5 @@ obj-$(CONFIG_QCOM_APCS_IPC)	+= qcom-apcs-ipc-mailbox.o
 obj-$(CONFIG_TEGRA_HSP_MBOX)	+= tegra-hsp.o
 
 obj-$(CONFIG_STM32_IPCC) 	+= stm32-ipcc.o
+
+obj-$(CONFIG_MTK_CMDQ_MBOX)	+= mtk-cmdq-mailbox.o
diff --git a/drivers/mailbox/mtk-cmdq-mailbox.c b/drivers/mailbox/mtk-cmdq-mailbox.c
new file mode 100644
index 000000000000..6f92c5ee12f7
--- /dev/null
+++ b/drivers/mailbox/mtk-cmdq-mailbox.c
@@ -0,0 +1,569 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Copyright (c) 2018 MediaTek Inc.
+
+#include <linux/bitops.h>
+#include <linux/clk.h>
+#include <linux/clk-provider.h>
+#include <linux/dma-mapping.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/iopoll.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/mailbox_controller.h>
+#include <linux/mailbox/mtk-cmdq-mailbox.h>
+#include <linux/of_device.h>
+
+#define CMDQ_OP_CODE_MASK		(0xff << CMDQ_OP_CODE_SHIFT)
+#define CMDQ_IRQ_MASK			0xffff
+#define CMDQ_NUM_CMD(t)			(t->cmd_buf_size / CMDQ_INST_SIZE)
+
+#define CMDQ_CURR_IRQ_STATUS		0x10
+#define CMDQ_THR_SLOT_CYCLES		0x30
+#define CMDQ_THR_BASE			0x100
+#define CMDQ_THR_SIZE			0x80
+#define CMDQ_THR_WARM_RESET		0x00
+#define CMDQ_THR_ENABLE_TASK		0x04
+#define CMDQ_THR_SUSPEND_TASK		0x08
+#define CMDQ_THR_CURR_STATUS		0x0c
+#define CMDQ_THR_IRQ_STATUS		0x10
+#define CMDQ_THR_IRQ_ENABLE		0x14
+#define CMDQ_THR_CURR_ADDR		0x20
+#define CMDQ_THR_END_ADDR		0x24
+#define CMDQ_THR_WAIT_TOKEN		0x30
+#define CMDQ_THR_PRIORITY		0x40
+
+#define CMDQ_THR_ACTIVE_SLOT_CYCLES	0x3200
+#define CMDQ_THR_ENABLED		0x1
+#define CMDQ_THR_DISABLED		0x0
+#define CMDQ_THR_SUSPEND		0x1
+#define CMDQ_THR_RESUME			0x0
+#define CMDQ_THR_STATUS_SUSPENDED	BIT(1)
+#define CMDQ_THR_DO_WARM_RESET		BIT(0)
+#define CMDQ_THR_IRQ_DONE		0x1
+#define CMDQ_THR_IRQ_ERROR		0x12
+#define CMDQ_THR_IRQ_EN			(CMDQ_THR_IRQ_ERROR | CMDQ_THR_IRQ_DONE)
+#define CMDQ_THR_IS_WAITING		BIT(31)
+
+#define CMDQ_JUMP_BY_OFFSET		0x10000000
+#define CMDQ_JUMP_BY_PA			0x10000001
+
+struct cmdq_thread {
+	struct mbox_chan	*chan;
+	void __iomem		*base;
+	struct list_head	task_busy_list;
+	u32			priority;
+	bool			atomic_exec;
+};
+
+struct cmdq_task {
+	struct cmdq		*cmdq;
+	struct list_head	list_entry;
+	dma_addr_t		pa_base;
+	struct cmdq_thread	*thread;
+	struct cmdq_pkt		*pkt; /* the packet sent from mailbox client */
+};
+
+struct cmdq {
+	struct mbox_controller	mbox;
+	void __iomem		*base;
+	u32			irq;
+	u32			thread_nr;
+	struct cmdq_thread	*thread;
+	struct clk		*clock;
+	bool			suspended;
+};
+
+static int cmdq_thread_suspend(struct cmdq *cmdq, struct cmdq_thread *thread)
+{
+	u32 status;
+
+	writel(CMDQ_THR_SUSPEND, thread->base + CMDQ_THR_SUSPEND_TASK);
+
+	/* If already disabled, treat as suspended successful. */
+	if (!(readl(thread->base + CMDQ_THR_ENABLE_TASK) & CMDQ_THR_ENABLED))
+		return 0;
+
+	if (readl_poll_timeout_atomic(thread->base + CMDQ_THR_CURR_STATUS,
+			status, status & CMDQ_THR_STATUS_SUSPENDED, 0, 10)) {
+		dev_err(cmdq->mbox.dev, "suspend GCE thread 0x%x failed\n",
+			(u32)(thread->base - cmdq->base));
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static void cmdq_thread_resume(struct cmdq_thread *thread)
+{
+	writel(CMDQ_THR_RESUME, thread->base + CMDQ_THR_SUSPEND_TASK);
+}
+
+static void cmdq_init(struct cmdq *cmdq)
+{
+	WARN_ON(clk_enable(cmdq->clock) < 0);
+	writel(CMDQ_THR_ACTIVE_SLOT_CYCLES, cmdq->base + CMDQ_THR_SLOT_CYCLES);
+	clk_disable(cmdq->clock);
+}
+
+static int cmdq_thread_reset(struct cmdq *cmdq, struct cmdq_thread *thread)
+{
+	u32 warm_reset;
+
+	writel(CMDQ_THR_DO_WARM_RESET, thread->base + CMDQ_THR_WARM_RESET);
+	if (readl_poll_timeout_atomic(thread->base + CMDQ_THR_WARM_RESET,
+			warm_reset, !(warm_reset & CMDQ_THR_DO_WARM_RESET),
+			0, 10)) {
+		dev_err(cmdq->mbox.dev, "reset GCE thread 0x%x failed\n",
+			(u32)(thread->base - cmdq->base));
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static void cmdq_thread_disable(struct cmdq *cmdq, struct cmdq_thread *thread)
+{
+	cmdq_thread_reset(cmdq, thread);
+	writel(CMDQ_THR_DISABLED, thread->base + CMDQ_THR_ENABLE_TASK);
+}
+
+/* notify GCE to re-fetch commands by setting GCE thread PC */
+static void cmdq_thread_invalidate_fetched_data(struct cmdq_thread *thread)
+{
+	writel(readl(thread->base + CMDQ_THR_CURR_ADDR),
+	       thread->base + CMDQ_THR_CURR_ADDR);
+}
+
+static void cmdq_task_insert_into_thread(struct cmdq_task *task)
+{
+	struct device *dev = task->cmdq->mbox.dev;
+	struct cmdq_thread *thread = task->thread;
+	struct cmdq_task *prev_task = list_last_entry(
+			&thread->task_busy_list, typeof(*task), list_entry);
+	u64 *prev_task_base = prev_task->pkt->va_base;
+
+	/* let previous task jump to this task */
+	dma_sync_single_for_cpu(dev, prev_task->pa_base,
+				prev_task->pkt->cmd_buf_size, DMA_TO_DEVICE);
+	prev_task_base[CMDQ_NUM_CMD(prev_task->pkt) - 1] =
+		(u64)CMDQ_JUMP_BY_PA << 32 | task->pa_base;
+	dma_sync_single_for_device(dev, prev_task->pa_base,
+				   prev_task->pkt->cmd_buf_size, DMA_TO_DEVICE);
+
+	cmdq_thread_invalidate_fetched_data(thread);
+}
+
+static bool cmdq_command_is_wfe(u64 cmd)
+{
+	u64 wfe_option = CMDQ_WFE_UPDATE | CMDQ_WFE_WAIT | CMDQ_WFE_WAIT_VALUE;
+	u64 wfe_op = (u64)(CMDQ_CODE_WFE << CMDQ_OP_CODE_SHIFT) << 32;
+	u64 wfe_mask = (u64)CMDQ_OP_CODE_MASK << 32 | 0xffffffff;
+
+	return ((cmd & wfe_mask) == (wfe_op | wfe_option));
+}
+
+/* we assume tasks in the same display GCE thread are waiting the same event. */
+static void cmdq_task_remove_wfe(struct cmdq_task *task)
+{
+	struct device *dev = task->cmdq->mbox.dev;
+	u64 *base = task->pkt->va_base;
+	int i;
+
+	dma_sync_single_for_cpu(dev, task->pa_base, task->pkt->cmd_buf_size,
+				DMA_TO_DEVICE);
+	for (i = 0; i < CMDQ_NUM_CMD(task->pkt); i++)
+		if (cmdq_command_is_wfe(base[i]))
+			base[i] = (u64)CMDQ_JUMP_BY_OFFSET << 32 |
+				  CMDQ_JUMP_PASS;
+	dma_sync_single_for_device(dev, task->pa_base, task->pkt->cmd_buf_size,
+				   DMA_TO_DEVICE);
+}
+
+static bool cmdq_thread_is_in_wfe(struct cmdq_thread *thread)
+{
+	return readl(thread->base + CMDQ_THR_WAIT_TOKEN) & CMDQ_THR_IS_WAITING;
+}
+
+static void cmdq_thread_wait_end(struct cmdq_thread *thread,
+				 unsigned long end_pa)
+{
+	struct device *dev = thread->chan->mbox->dev;
+	unsigned long curr_pa;
+
+	if (readl_poll_timeout_atomic(thread->base + CMDQ_THR_CURR_ADDR,
+			curr_pa, curr_pa == end_pa, 1, 20))
+		dev_err(dev, "GCE thread cannot run to end.\n");
+}
+
+static void cmdq_task_exec_done(struct cmdq_task *task, enum cmdq_cb_status sta)
+{
+	struct cmdq_task_cb *cb = &task->pkt->async_cb;
+	struct cmdq_cb_data data;
+
+	WARN_ON(cb->cb == (cmdq_async_flush_cb)NULL);
+	data.sta = sta;
+	data.data = cb->data;
+	cb->cb(data);
+
+	list_del(&task->list_entry);
+}
+
+static void cmdq_task_handle_error(struct cmdq_task *task)
+{
+	struct cmdq_thread *thread = task->thread;
+	struct cmdq_task *next_task;
+
+	dev_err(task->cmdq->mbox.dev, "task 0x%p error\n", task);
+	WARN_ON(cmdq_thread_suspend(task->cmdq, thread) < 0);
+	next_task = list_first_entry_or_null(&thread->task_busy_list,
+			struct cmdq_task, list_entry);
+	if (next_task)
+		writel(next_task->pa_base, thread->base + CMDQ_THR_CURR_ADDR);
+	cmdq_thread_resume(thread);
+}
+
+static void cmdq_thread_irq_handler(struct cmdq *cmdq,
+				    struct cmdq_thread *thread)
+{
+	struct cmdq_task *task, *tmp, *curr_task = NULL;
+	u32 curr_pa, irq_flag, task_end_pa;
+	bool err;
+
+	irq_flag = readl(thread->base + CMDQ_THR_IRQ_STATUS);
+	writel(~irq_flag, thread->base + CMDQ_THR_IRQ_STATUS);
+
+	/*
+	 * When ISR call this function, another CPU core could run
+	 * "release task" right before we acquire the spin lock, and thus
+	 * reset / disable this GCE thread, so we need to check the enable
+	 * bit of this GCE thread.
+	 */
+	if (!(readl(thread->base + CMDQ_THR_ENABLE_TASK) & CMDQ_THR_ENABLED))
+		return;
+
+	if (irq_flag & CMDQ_THR_IRQ_ERROR)
+		err = true;
+	else if (irq_flag & CMDQ_THR_IRQ_DONE)
+		err = false;
+	else
+		return;
+
+	curr_pa = readl(thread->base + CMDQ_THR_CURR_ADDR);
+
+	list_for_each_entry_safe(task, tmp, &thread->task_busy_list,
+				 list_entry) {
+		task_end_pa = task->pa_base + task->pkt->cmd_buf_size;
+		if (curr_pa >= task->pa_base && curr_pa < task_end_pa)
+			curr_task = task;
+
+		if (!curr_task || curr_pa == task_end_pa - CMDQ_INST_SIZE) {
+			cmdq_task_exec_done(task, CMDQ_CB_NORMAL);
+			kfree(task);
+		} else if (err) {
+			cmdq_task_exec_done(task, CMDQ_CB_ERROR);
+			cmdq_task_handle_error(curr_task);
+			kfree(task);
+		}
+
+		if (curr_task)
+			break;
+	}
+
+	if (list_empty(&thread->task_busy_list)) {
+		cmdq_thread_disable(cmdq, thread);
+		clk_disable(cmdq->clock);
+	}
+}
+
+static irqreturn_t cmdq_irq_handler(int irq, void *dev)
+{
+	struct cmdq *cmdq = dev;
+	unsigned long irq_status, flags = 0L;
+	int bit;
+
+	irq_status = readl(cmdq->base + CMDQ_CURR_IRQ_STATUS) & CMDQ_IRQ_MASK;
+	if (!(irq_status ^ CMDQ_IRQ_MASK))
+		return IRQ_NONE;
+
+	for_each_clear_bit(bit, &irq_status, fls(CMDQ_IRQ_MASK)) {
+		struct cmdq_thread *thread = &cmdq->thread[bit];
+
+		spin_lock_irqsave(&thread->chan->lock, flags);
+		cmdq_thread_irq_handler(cmdq, thread);
+		spin_unlock_irqrestore(&thread->chan->lock, flags);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int cmdq_suspend(struct device *dev)
+{
+	struct cmdq *cmdq = dev_get_drvdata(dev);
+	struct cmdq_thread *thread;
+	int i;
+	bool task_running = false;
+
+	cmdq->suspended = true;
+
+	for (i = 0; i < cmdq->thread_nr; i++) {
+		thread = &cmdq->thread[i];
+		if (!list_empty(&thread->task_busy_list)) {
+			task_running = true;
+			break;
+		}
+	}
+
+	if (task_running)
+		dev_warn(dev, "exist running task(s) in suspend\n");
+
+	clk_unprepare(cmdq->clock);
+
+	return 0;
+}
+
+static int cmdq_resume(struct device *dev)
+{
+	struct cmdq *cmdq = dev_get_drvdata(dev);
+
+	WARN_ON(clk_prepare(cmdq->clock) < 0);
+	cmdq->suspended = false;
+	return 0;
+}
+
+static int cmdq_remove(struct platform_device *pdev)
+{
+	struct cmdq *cmdq = platform_get_drvdata(pdev);
+
+	mbox_controller_unregister(&cmdq->mbox);
+	clk_unprepare(cmdq->clock);
+
+	if (cmdq->mbox.chans)
+		devm_kfree(&pdev->dev, cmdq->mbox.chans);
+
+	if (cmdq->thread)
+		devm_kfree(&pdev->dev, cmdq->thread);
+
+	devm_kfree(&pdev->dev, cmdq);
+
+	return 0;
+}
+
+static int cmdq_mbox_send_data(struct mbox_chan *chan, void *data)
+{
+	struct cmdq_pkt *pkt = (struct cmdq_pkt *)data;
+	struct cmdq_thread *thread = (struct cmdq_thread *)chan->con_priv;
+	struct cmdq *cmdq = dev_get_drvdata(chan->mbox->dev);
+	struct cmdq_task *task;
+	unsigned long curr_pa, end_pa;
+
+	/* Client should not flush new tasks if suspended. */
+	WARN_ON(cmdq->suspended);
+
+	task = kzalloc(sizeof(*task), GFP_ATOMIC);
+	task->cmdq = cmdq;
+	INIT_LIST_HEAD(&task->list_entry);
+	task->pa_base = pkt->pa_base;
+	task->thread = thread;
+	task->pkt = pkt;
+
+	if (list_empty(&thread->task_busy_list)) {
+		WARN_ON(clk_enable(cmdq->clock) < 0);
+		WARN_ON(cmdq_thread_reset(cmdq, thread) < 0);
+
+		writel(task->pa_base, thread->base + CMDQ_THR_CURR_ADDR);
+		writel(task->pa_base + pkt->cmd_buf_size,
+		       thread->base + CMDQ_THR_END_ADDR);
+		writel(thread->priority, thread->base + CMDQ_THR_PRIORITY);
+		writel(CMDQ_THR_IRQ_EN, thread->base + CMDQ_THR_IRQ_ENABLE);
+		writel(CMDQ_THR_ENABLED, thread->base + CMDQ_THR_ENABLE_TASK);
+	} else {
+		WARN_ON(cmdq_thread_suspend(cmdq, thread) < 0);
+		curr_pa = readl(thread->base + CMDQ_THR_CURR_ADDR);
+		end_pa = readl(thread->base + CMDQ_THR_END_ADDR);
+
+		/*
+		 * Atomic execution should remove the following wfe, i.e. only
+		 * wait event at first task, and prevent to pause when running.
+		 */
+		if (thread->atomic_exec) {
+			/* GCE is executing if command is not WFE */
+			if (!cmdq_thread_is_in_wfe(thread)) {
+				cmdq_thread_resume(thread);
+				cmdq_thread_wait_end(thread, end_pa);
+				WARN_ON(cmdq_thread_suspend(cmdq, thread) < 0);
+				/* set to this task directly */
+				writel(task->pa_base,
+				       thread->base + CMDQ_THR_CURR_ADDR);
+			} else {
+				cmdq_task_insert_into_thread(task);
+				cmdq_task_remove_wfe(task);
+				smp_mb(); /* modify jump before enable thread */
+			}
+		} else {
+			/* check boundary */
+			if (curr_pa == end_pa - CMDQ_INST_SIZE ||
+			    curr_pa == end_pa) {
+				/* set to this task directly */
+				writel(task->pa_base,
+				       thread->base + CMDQ_THR_CURR_ADDR);
+			} else {
+				cmdq_task_insert_into_thread(task);
+				smp_mb(); /* modify jump before enable thread */
+			}
+		}
+		writel(task->pa_base + pkt->cmd_buf_size,
+		       thread->base + CMDQ_THR_END_ADDR);
+		cmdq_thread_resume(thread);
+	}
+	list_move_tail(&task->list_entry, &thread->task_busy_list);
+
+	return 0;
+}
+
+static int cmdq_mbox_startup(struct mbox_chan *chan)
+{
+	return 0;
+}
+
+static void cmdq_mbox_shutdown(struct mbox_chan *chan)
+{
+}
+
+static const struct mbox_chan_ops cmdq_mbox_chan_ops = {
+	.send_data = cmdq_mbox_send_data,
+	.startup = cmdq_mbox_startup,
+	.shutdown = cmdq_mbox_shutdown,
+};
+
+static struct mbox_chan *cmdq_xlate(struct mbox_controller *mbox,
+		const struct of_phandle_args *sp)
+{
+	int ind = sp->args[0];
+	struct cmdq_thread *thread;
+
+	if (ind >= mbox->num_chans)
+		return ERR_PTR(-EINVAL);
+
+	thread = (struct cmdq_thread *)mbox->chans[ind].con_priv;
+	thread->priority = sp->args[1];
+	thread->atomic_exec = (sp->args[2] != 0);
+	thread->chan = &mbox->chans[ind];
+
+	return &mbox->chans[ind];
+}
+
+static int cmdq_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct resource *res;
+	struct cmdq *cmdq;
+	int err, i;
+
+	cmdq = devm_kzalloc(dev, sizeof(*cmdq), GFP_KERNEL);
+	if (!cmdq)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	cmdq->base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(cmdq->base)) {
+		dev_err(dev, "failed to ioremap gce\n");
+		return PTR_ERR(cmdq->base);
+	}
+
+	cmdq->irq = platform_get_irq(pdev, 0);
+	if (!cmdq->irq) {
+		dev_err(dev, "failed to get irq\n");
+		return -EINVAL;
+	}
+	err = devm_request_irq(dev, cmdq->irq, cmdq_irq_handler, IRQF_SHARED,
+			       "mtk_cmdq", cmdq);
+	if (err < 0) {
+		dev_err(dev, "failed to register ISR (%d)\n", err);
+		return err;
+	}
+
+	dev_dbg(dev, "cmdq device: addr:0x%p, va:0x%p, irq:%d\n",
+		dev, cmdq->base, cmdq->irq);
+
+	cmdq->clock = devm_clk_get(dev, "gce");
+	if (IS_ERR(cmdq->clock)) {
+		dev_err(dev, "failed to get gce clk\n");
+		return PTR_ERR(cmdq->clock);
+	}
+
+	cmdq->thread_nr = (u32)(unsigned long)of_device_get_match_data(dev);
+	cmdq->mbox.dev = dev;
+	cmdq->mbox.chans = devm_kcalloc(dev, cmdq->thread_nr,
+					sizeof(*cmdq->mbox.chans), GFP_KERNEL);
+	if (!cmdq->mbox.chans)
+		return -ENOMEM;
+
+	cmdq->mbox.num_chans = cmdq->thread_nr;
+	cmdq->mbox.ops = &cmdq_mbox_chan_ops;
+	cmdq->mbox.of_xlate = cmdq_xlate;
+
+	/* make use of TXDONE_BY_ACK */
+	cmdq->mbox.txdone_irq = false;
+	cmdq->mbox.txdone_poll = false;
+
+	cmdq->thread = devm_kcalloc(dev, cmdq->thread_nr,
+					sizeof(*cmdq->thread), GFP_KERNEL);
+	if (!cmdq->thread)
+		return -ENOMEM;
+
+	for (i = 0; i < cmdq->thread_nr; i++) {
+		cmdq->thread[i].base = cmdq->base + CMDQ_THR_BASE +
+				CMDQ_THR_SIZE * i;
+		INIT_LIST_HEAD(&cmdq->thread[i].task_busy_list);
+		cmdq->mbox.chans[i].con_priv = (void *)&cmdq->thread[i];
+	}
+
+	err = mbox_controller_register(&cmdq->mbox);
+	if (err < 0) {
+		dev_err(dev, "failed to register mailbox: %d\n", err);
+		return err;
+	}
+
+	platform_set_drvdata(pdev, cmdq);
+	WARN_ON(clk_prepare(cmdq->clock) < 0);
+
+	cmdq_init(cmdq);
+
+	return 0;
+}
+
+static const struct dev_pm_ops cmdq_pm_ops = {
+	.suspend = cmdq_suspend,
+	.resume = cmdq_resume,
+};
+
+static const struct of_device_id cmdq_of_ids[] = {
+	{.compatible = "mediatek,mt8173-gce", .data = (void *)16},
+	{}
+};
+
+static struct platform_driver cmdq_drv = {
+	.probe = cmdq_probe,
+	.remove = cmdq_remove,
+	.driver = {
+		.name = "mtk_cmdq",
+		.pm = &cmdq_pm_ops,
+		.of_match_table = cmdq_of_ids,
+	}
+};
+
+static int __init cmdq_drv_init(void)
+{
+	return platform_driver_register(&cmdq_drv);
+}
+
+static void __exit cmdq_drv_exit(void)
+{
+	platform_driver_unregister(&cmdq_drv);
+}
+
+subsys_initcall(cmdq_drv_init);
+module_exit(cmdq_drv_exit);
diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h
new file mode 100644
index 000000000000..ccb73422c2fa
--- /dev/null
+++ b/include/linux/mailbox/mtk-cmdq-mailbox.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018 MediaTek Inc.
+ *
+ */
+
+#ifndef __MTK_CMDQ_MAILBOX_H__
+#define __MTK_CMDQ_MAILBOX_H__
+
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#define CMDQ_INST_SIZE			8 /* instruction is 64-bit */
+#define CMDQ_SUBSYS_SHIFT		16
+#define CMDQ_OP_CODE_SHIFT		24
+#define CMDQ_JUMP_PASS			CMDQ_INST_SIZE
+
+#define CMDQ_WFE_UPDATE			BIT(31)
+#define CMDQ_WFE_WAIT			BIT(15)
+#define CMDQ_WFE_WAIT_VALUE		0x1
+
+/*
+ * CMDQ_CODE_MASK:
+ *   set write mask
+ *   format: op mask
+ * CMDQ_CODE_WRITE:
+ *   write value into target register
+ *   format: op subsys address value
+ * CMDQ_CODE_JUMP:
+ *   jump by offset
+ *   format: op offset
+ * CMDQ_CODE_WFE:
+ *   wait for event and clear
+ *   it is just clear if no wait
+ *   format: [wait]  op event update:1 to_wait:1 wait:1
+ *           [clear] op event update:1 to_wait:0 wait:0
+ * CMDQ_CODE_EOC:
+ *   end of command
+ *   format: op irq_flag
+ */
+enum cmdq_code {
+	CMDQ_CODE_MASK = 0x02,
+	CMDQ_CODE_WRITE = 0x04,
+	CMDQ_CODE_JUMP = 0x10,
+	CMDQ_CODE_WFE = 0x20,
+	CMDQ_CODE_EOC = 0x40,
+};
+
+enum cmdq_cb_status {
+	CMDQ_CB_NORMAL = 0,
+	CMDQ_CB_ERROR
+};
+
+struct cmdq_cb_data {
+	enum cmdq_cb_status	sta;
+	void			*data;
+};
+
+typedef void (*cmdq_async_flush_cb)(struct cmdq_cb_data data);
+
+struct cmdq_task_cb {
+	cmdq_async_flush_cb	cb;
+	void			*data;
+};
+
+struct cmdq_pkt {
+	void			*va_base;
+	dma_addr_t		pa_base;
+	size_t			cmd_buf_size; /* command occupied size */
+	size_t			buf_size; /* real buffer size */
+	struct cmdq_task_cb	cb;
+	struct cmdq_task_cb	async_cb;
+	void			*cl;
+};
+
+#endif /* __MTK_CMDQ_MAILBOX_H__ */
-- 
cgit v1.2.3


From 7cca1ed0bb248b8d5768d17f5afe297a832d66c0 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Tue, 31 Jul 2018 20:25:00 +0200
Subject: netfilter: nf_osf: move nf_osf_fingers to non-uapi header file

All warnings (new ones prefixed by >>):

>> ./usr/include/linux/netfilter/nf_osf.h:73: userspace cannot reference function or variable defined in the kernel

Fixes: f9324952088f ("netfilter: nfnetlink_osf: extract nfnetlink_subsystem code from xt_osf.c")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_osf.h      | 2 ++
 include/uapi/linux/netfilter/nf_osf.h | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h
index aee460fcbd31..3e455d6f94d5 100644
--- a/include/linux/netfilter/nf_osf.h
+++ b/include/linux/netfilter/nf_osf.h
@@ -25,6 +25,8 @@ enum osf_fmatch_states {
 	FMATCH_OPT_WRONG,
 };
 
+extern struct list_head nf_osf_fingers[2];
+
 struct nf_osf_finger {
 	struct rcu_head			rcu_head;
 	struct list_head		finger_entry;
diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h
index cc2487ff74f6..3b93fbb9fc24 100644
--- a/include/uapi/linux/netfilter/nf_osf.h
+++ b/include/uapi/linux/netfilter/nf_osf.h
@@ -70,8 +70,6 @@ struct nf_osf_nlmsg {
 	struct tcphdr			tcp;
 };
 
-extern struct list_head nf_osf_fingers[2];
-
 /* Defines for IANA option kinds */
 enum iana_options {
 	OSFOPT_EOL = 0,		/* End of options */
-- 
cgit v1.2.3


From ddba40be59c9be4059288464f8e6f38fbba27495 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Tue, 31 Jul 2018 20:25:01 +0200
Subject: netfilter: nfnetlink_osf: rename nf_osf header file to nfnetlink_osf

The first client of the nf_osf.h userspace header is nft_osf, coming in
this batch, rename it to nfnetlink_osf.h as there are no userspace
clients for this yet, hence this looks consistent with other nfnetlink
subsystem.

Suggested-by: Jan Engelhardt <jengelh@inai.de>
Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_osf.h             |  44 -----------
 include/linux/netfilter/nfnetlink_osf.h      |  44 +++++++++++
 include/uapi/linux/netfilter/nf_osf.h        | 106 ---------------------------
 include/uapi/linux/netfilter/nfnetlink_osf.h | 106 +++++++++++++++++++++++++++
 include/uapi/linux/netfilter/xt_osf.h        |   2 +-
 net/netfilter/nfnetlink_osf.c                |   2 +-
 net/netfilter/nft_osf.c                      |   2 +-
 7 files changed, 153 insertions(+), 153 deletions(-)
 delete mode 100644 include/linux/netfilter/nf_osf.h
 create mode 100644 include/linux/netfilter/nfnetlink_osf.h
 delete mode 100644 include/uapi/linux/netfilter/nf_osf.h
 create mode 100644 include/uapi/linux/netfilter/nfnetlink_osf.h

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h
deleted file mode 100644
index 3e455d6f94d5..000000000000
--- a/include/linux/netfilter/nf_osf.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NFOSF_H
-#define _NFOSF_H
-
-#include <uapi/linux/netfilter/nf_osf.h>
-
-/* Initial window size option state machine: multiple of mss, mtu or
- * plain numeric value. Can also be made as plain numeric value which
- * is not a multiple of specified value.
- */
-enum nf_osf_window_size_options {
-	OSF_WSS_PLAIN   = 0,
-	OSF_WSS_MSS,
-	OSF_WSS_MTU,
-	OSF_WSS_MODULO,
-	OSF_WSS_MAX,
-};
-
-enum osf_fmatch_states {
-	/* Packet does not match the fingerprint */
-	FMATCH_WRONG = 0,
-	/* Packet matches the fingerprint */
-	FMATCH_OK,
-	/* Options do not match the fingerprint, but header does */
-	FMATCH_OPT_WRONG,
-};
-
-extern struct list_head nf_osf_fingers[2];
-
-struct nf_osf_finger {
-	struct rcu_head			rcu_head;
-	struct list_head		finger_entry;
-	struct nf_osf_user_finger	finger;
-};
-
-bool nf_osf_match(const struct sk_buff *skb, u_int8_t family,
-		  int hooknum, struct net_device *in, struct net_device *out,
-		  const struct nf_osf_info *info, struct net *net,
-		  const struct list_head *nf_osf_fingers);
-
-const char *nf_osf_find(const struct sk_buff *skb,
-                        const struct list_head *nf_osf_fingers);
-
-#endif /* _NFOSF_H */
diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h
new file mode 100644
index 000000000000..a7311bc03d3a
--- /dev/null
+++ b/include/linux/netfilter/nfnetlink_osf.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NFOSF_H
+#define _NFOSF_H
+
+#include <uapi/linux/netfilter/nfnetlink_osf.h>
+
+/* Initial window size option state machine: multiple of mss, mtu or
+ * plain numeric value. Can also be made as plain numeric value which
+ * is not a multiple of specified value.
+ */
+enum nf_osf_window_size_options {
+	OSF_WSS_PLAIN   = 0,
+	OSF_WSS_MSS,
+	OSF_WSS_MTU,
+	OSF_WSS_MODULO,
+	OSF_WSS_MAX,
+};
+
+enum osf_fmatch_states {
+	/* Packet does not match the fingerprint */
+	FMATCH_WRONG = 0,
+	/* Packet matches the fingerprint */
+	FMATCH_OK,
+	/* Options do not match the fingerprint, but header does */
+	FMATCH_OPT_WRONG,
+};
+
+extern struct list_head nf_osf_fingers[2];
+
+struct nf_osf_finger {
+	struct rcu_head			rcu_head;
+	struct list_head		finger_entry;
+	struct nf_osf_user_finger	finger;
+};
+
+bool nf_osf_match(const struct sk_buff *skb, u_int8_t family,
+		  int hooknum, struct net_device *in, struct net_device *out,
+		  const struct nf_osf_info *info, struct net *net,
+		  const struct list_head *nf_osf_fingers);
+
+const char *nf_osf_find(const struct sk_buff *skb,
+                        const struct list_head *nf_osf_fingers);
+
+#endif /* _NFOSF_H */
diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h
deleted file mode 100644
index 3b93fbb9fc24..000000000000
--- a/include/uapi/linux/netfilter/nf_osf.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef _NF_OSF_H
-#define _NF_OSF_H
-
-#include <linux/types.h>
-
-#define MAXGENRELEN	32
-
-#define NF_OSF_GENRE	(1 << 0)
-#define NF_OSF_TTL	(1 << 1)
-#define NF_OSF_LOG	(1 << 2)
-#define NF_OSF_INVERT	(1 << 3)
-
-#define NF_OSF_LOGLEVEL_ALL		0	/* log all matched fingerprints */
-#define NF_OSF_LOGLEVEL_FIRST		1	/* log only the first matced fingerprint */
-#define NF_OSF_LOGLEVEL_ALL_KNOWN	2	/* do not log unknown packets */
-
-#define NF_OSF_TTL_TRUE			0	/* True ip and fingerprint TTL comparison */
-
-/* Check if ip TTL is less than fingerprint one */
-#define NF_OSF_TTL_LESS			1
-
-/* Do not compare ip and fingerprint TTL at all */
-#define NF_OSF_TTL_NOCHECK		2
-
-#define NF_OSF_FLAGMASK		(NF_OSF_GENRE | NF_OSF_TTL | \
-				 NF_OSF_LOG | NF_OSF_INVERT)
-/* Wildcard MSS (kind of).
- * It is used to implement a state machine for the different wildcard values
- * of the MSS and window sizes.
- */
-struct nf_osf_wc {
-	__u32	wc;
-	__u32	val;
-};
-
-/* This struct represents IANA options
- * http://www.iana.org/assignments/tcp-parameters
- */
-struct nf_osf_opt {
-	__u16			kind, length;
-	struct nf_osf_wc	wc;
-};
-
-struct nf_osf_info {
-	char	genre[MAXGENRELEN];
-	__u32	len;
-	__u32	flags;
-	__u32	loglevel;
-	__u32	ttl;
-};
-
-struct nf_osf_user_finger {
-	struct nf_osf_wc	wss;
-
-	__u8	ttl, df;
-	__u16	ss, mss;
-	__u16	opt_num;
-
-	char	genre[MAXGENRELEN];
-	char	version[MAXGENRELEN];
-	char	subtype[MAXGENRELEN];
-
-	/* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */
-	struct nf_osf_opt	opt[MAX_IPOPTLEN];
-};
-
-struct nf_osf_nlmsg {
-	struct nf_osf_user_finger	f;
-	struct iphdr			ip;
-	struct tcphdr			tcp;
-};
-
-/* Defines for IANA option kinds */
-enum iana_options {
-	OSFOPT_EOL = 0,		/* End of options */
-	OSFOPT_NOP,		/* NOP */
-	OSFOPT_MSS,		/* Maximum segment size */
-	OSFOPT_WSO,		/* Window scale option */
-	OSFOPT_SACKP,		/* SACK permitted */
-	OSFOPT_SACK,		/* SACK */
-	OSFOPT_ECHO,
-	OSFOPT_ECHOREPLY,
-	OSFOPT_TS,		/* Timestamp option */
-	OSFOPT_POCP,		/* Partial Order Connection Permitted */
-	OSFOPT_POSP,		/* Partial Order Service Profile */
-
-	/* Others are not used in the current OSF */
-	OSFOPT_EMPTY = 255,
-};
-
-enum nf_osf_attr_type {
-	OSF_ATTR_UNSPEC,
-	OSF_ATTR_FINGER,
-	OSF_ATTR_MAX,
-};
-
-/*
- * Add/remove fingerprint from the kernel.
- */
-enum nf_osf_msg_types {
-	OSF_MSG_ADD,
-	OSF_MSG_REMOVE,
-	OSF_MSG_MAX,
-};
-
-#endif /* _NF_OSF_H */
diff --git a/include/uapi/linux/netfilter/nfnetlink_osf.h b/include/uapi/linux/netfilter/nfnetlink_osf.h
new file mode 100644
index 000000000000..3b93fbb9fc24
--- /dev/null
+++ b/include/uapi/linux/netfilter/nfnetlink_osf.h
@@ -0,0 +1,106 @@
+#ifndef _NF_OSF_H
+#define _NF_OSF_H
+
+#include <linux/types.h>
+
+#define MAXGENRELEN	32
+
+#define NF_OSF_GENRE	(1 << 0)
+#define NF_OSF_TTL	(1 << 1)
+#define NF_OSF_LOG	(1 << 2)
+#define NF_OSF_INVERT	(1 << 3)
+
+#define NF_OSF_LOGLEVEL_ALL		0	/* log all matched fingerprints */
+#define NF_OSF_LOGLEVEL_FIRST		1	/* log only the first matced fingerprint */
+#define NF_OSF_LOGLEVEL_ALL_KNOWN	2	/* do not log unknown packets */
+
+#define NF_OSF_TTL_TRUE			0	/* True ip and fingerprint TTL comparison */
+
+/* Check if ip TTL is less than fingerprint one */
+#define NF_OSF_TTL_LESS			1
+
+/* Do not compare ip and fingerprint TTL at all */
+#define NF_OSF_TTL_NOCHECK		2
+
+#define NF_OSF_FLAGMASK		(NF_OSF_GENRE | NF_OSF_TTL | \
+				 NF_OSF_LOG | NF_OSF_INVERT)
+/* Wildcard MSS (kind of).
+ * It is used to implement a state machine for the different wildcard values
+ * of the MSS and window sizes.
+ */
+struct nf_osf_wc {
+	__u32	wc;
+	__u32	val;
+};
+
+/* This struct represents IANA options
+ * http://www.iana.org/assignments/tcp-parameters
+ */
+struct nf_osf_opt {
+	__u16			kind, length;
+	struct nf_osf_wc	wc;
+};
+
+struct nf_osf_info {
+	char	genre[MAXGENRELEN];
+	__u32	len;
+	__u32	flags;
+	__u32	loglevel;
+	__u32	ttl;
+};
+
+struct nf_osf_user_finger {
+	struct nf_osf_wc	wss;
+
+	__u8	ttl, df;
+	__u16	ss, mss;
+	__u16	opt_num;
+
+	char	genre[MAXGENRELEN];
+	char	version[MAXGENRELEN];
+	char	subtype[MAXGENRELEN];
+
+	/* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */
+	struct nf_osf_opt	opt[MAX_IPOPTLEN];
+};
+
+struct nf_osf_nlmsg {
+	struct nf_osf_user_finger	f;
+	struct iphdr			ip;
+	struct tcphdr			tcp;
+};
+
+/* Defines for IANA option kinds */
+enum iana_options {
+	OSFOPT_EOL = 0,		/* End of options */
+	OSFOPT_NOP,		/* NOP */
+	OSFOPT_MSS,		/* Maximum segment size */
+	OSFOPT_WSO,		/* Window scale option */
+	OSFOPT_SACKP,		/* SACK permitted */
+	OSFOPT_SACK,		/* SACK */
+	OSFOPT_ECHO,
+	OSFOPT_ECHOREPLY,
+	OSFOPT_TS,		/* Timestamp option */
+	OSFOPT_POCP,		/* Partial Order Connection Permitted */
+	OSFOPT_POSP,		/* Partial Order Service Profile */
+
+	/* Others are not used in the current OSF */
+	OSFOPT_EMPTY = 255,
+};
+
+enum nf_osf_attr_type {
+	OSF_ATTR_UNSPEC,
+	OSF_ATTR_FINGER,
+	OSF_ATTR_MAX,
+};
+
+/*
+ * Add/remove fingerprint from the kernel.
+ */
+enum nf_osf_msg_types {
+	OSF_MSG_ADD,
+	OSF_MSG_REMOVE,
+	OSF_MSG_MAX,
+};
+
+#endif /* _NF_OSF_H */
diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h
index a90e90c27cef..c56c59605c2b 100644
--- a/include/uapi/linux/netfilter/xt_osf.h
+++ b/include/uapi/linux/netfilter/xt_osf.h
@@ -23,7 +23,7 @@
 #include <linux/types.h>
 #include <linux/ip.h>
 #include <linux/tcp.h>
-#include <linux/netfilter/nf_osf.h>
+#include <linux/netfilter/nfnetlink_osf.h>
 
 #define XT_OSF_GENRE		NF_OSF_GENRE
 #define XT_OSF_INVERT		NF_OSF_INVERT
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index ba0fa11869ce..f9dba62c450f 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -18,7 +18,7 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/x_tables.h>
 #include <net/netfilter/nf_log.h>
-#include <linux/netfilter/nf_osf.h>
+#include <linux/netfilter/nfnetlink_osf.h>
 
 /*
  * Indexed by dont-fragment bit.
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index bdacc4cffba4..9b2f3de7be4f 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -2,7 +2,7 @@
 #include <net/tcp.h>
 
 #include <net/netfilter/nf_tables.h>
-#include <linux/netfilter/nf_osf.h>
+#include <linux/netfilter/nfnetlink_osf.h>
 
 #define OSF_GENRE_SIZE		32
 
-- 
cgit v1.2.3


From 94276fa8a2a4c08ccb2e9d55e88b95dc972ccea3 Mon Sep 17 00:00:00 2001
From: Máté Eckl <ecklm94@gmail.com>
Date: Fri, 3 Aug 2018 13:36:13 +0200
Subject: netfilter: bridge: Expose nf_tables bridge hook priorities through
 uapi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Netfilter exposes standard hook priorities in case of ipv4, ipv6 and
arp but not in case of bridge.

This patch exposes the hook priority values of the bridge family (which are
different from the formerly mentioned) via uapi so that they can be used by
user-space applications just like the others.

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge.h      | 11 -----------
 include/uapi/linux/netfilter_bridge.h | 11 +++++++++++
 net/bridge/br_netfilter_hooks.c       |  1 +
 net/bridge/netfilter/ebtable_filter.c |  1 +
 net/bridge/netfilter/ebtable_nat.c    |  1 +
 5 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index b671fdfd212b..fa0686500970 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -5,17 +5,6 @@
 #include <uapi/linux/netfilter_bridge.h>
 #include <linux/skbuff.h>
 
-enum nf_br_hook_priorities {
-	NF_BR_PRI_FIRST = INT_MIN,
-	NF_BR_PRI_NAT_DST_BRIDGED = -300,
-	NF_BR_PRI_FILTER_BRIDGED = -200,
-	NF_BR_PRI_BRNF = 0,
-	NF_BR_PRI_NAT_DST_OTHER = 100,
-	NF_BR_PRI_FILTER_OTHER = 200,
-	NF_BR_PRI_NAT_SRC = 300,
-	NF_BR_PRI_LAST = INT_MAX,
-};
-
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 
 int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/include/uapi/linux/netfilter_bridge.h b/include/uapi/linux/netfilter_bridge.h
index 12fb77633f83..156ccd089df1 100644
--- a/include/uapi/linux/netfilter_bridge.h
+++ b/include/uapi/linux/netfilter_bridge.h
@@ -26,4 +26,15 @@
 #define NF_BR_BROUTING		5
 #define NF_BR_NUMHOOKS		6
 
+enum nf_br_hook_priorities {
+	NF_BR_PRI_FIRST = INT_MIN,
+	NF_BR_PRI_NAT_DST_BRIDGED = -300,
+	NF_BR_PRI_FILTER_BRIDGED = -200,
+	NF_BR_PRI_BRNF = 0,
+	NF_BR_PRI_NAT_DST_OTHER = 100,
+	NF_BR_PRI_FILTER_OTHER = 200,
+	NF_BR_PRI_NAT_SRC = 300,
+	NF_BR_PRI_LAST = INT_MAX,
+};
+
 #endif /* _UAPI__LINUX_BRIDGE_NETFILTER_H */
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 9b16eaf33819..6e0dc6bcd32a 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -26,6 +26,7 @@
 #include <linux/if_pppox.h>
 #include <linux/ppp_defs.h>
 #include <linux/netfilter_bridge.h>
+#include <uapi/linux/netfilter_bridge.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv6.h>
 #include <linux/netfilter_arp.h>
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index c41da5fac84f..550324c516ee 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/netfilter_bridge/ebtables.h>
+#include <uapi/linux/netfilter_bridge.h>
 #include <linux/module.h>
 
 #define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 08df7406ecb3..c0fb3ca518af 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/netfilter_bridge/ebtables.h>
+#include <uapi/linux/netfilter_bridge.h>
 #include <linux/module.h>
 
 #define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \
-- 
cgit v1.2.3


From c2b6d621c4ffe9936adf7a55c8b1c769672c306f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 28 Jun 2018 15:53:17 -0400
Subject: new primitive: discard_new_inode()

	We don't want open-by-handle picking half-set-up in-core
struct inode from e.g. mkdir() having failed halfway through.
In other words, we don't want such inodes returned by iget_locked()
on their way to extinction.  However, we can't just have them
unhashed - otherwise open-by-handle immediately *after* that would've
ended up creating a new in-core inode over the on-disk one that
is in process of being freed right under us.

	Solution: new flag (I_CREATING) set by insert_inode_locked() and
removed by unlock_new_inode() and a new primitive (discard_new_inode())
to be used by such halfway-through-setup failure exits instead of
unlock_new_inode() / iput() combinations.  That primitive unlocks new
inode, but leaves I_CREATING in place.

	iget_locked() treats finding an I_CREATING inode as failure
(-ESTALE, once we sort out the error propagation).
	insert_inode_locked() treats the same as instant -EBUSY.
	ilookup() treats those as icache miss.

[Fix by Dan Carpenter <dan.carpenter@oracle.com> folded in]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c        |  2 +-
 fs/inode.c         | 45 +++++++++++++++++++++++++++++++++++++++++----
 include/linux/fs.h |  6 +++++-
 3 files changed, 47 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index a7d9e7a4c283..11b753d29409 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1892,7 +1892,7 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
 	spin_lock(&inode->i_lock);
 	__d_instantiate(entry, inode);
 	WARN_ON(!(inode->i_state & I_NEW));
-	inode->i_state &= ~I_NEW;
+	inode->i_state &= ~I_NEW & ~I_CREATING;
 	smp_mb();
 	wake_up_bit(&inode->i_state, __I_NEW);
 	spin_unlock(&inode->i_lock);
diff --git a/fs/inode.c b/fs/inode.c
index 2c300e981796..6cd2e7ba9f4d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -804,6 +804,10 @@ repeat:
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
+		if (unlikely(inode->i_state & I_CREATING)) {
+			spin_unlock(&inode->i_lock);
+			return ERR_PTR(-ESTALE);
+		}
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
 		return inode;
@@ -831,6 +835,10 @@ repeat:
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
+		if (unlikely(inode->i_state & I_CREATING)) {
+			spin_unlock(&inode->i_lock);
+			return ERR_PTR(-ESTALE);
+		}
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
 		return inode;
@@ -961,13 +969,26 @@ void unlock_new_inode(struct inode *inode)
 	lockdep_annotate_inode_mutex_key(inode);
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode->i_state & I_NEW));
-	inode->i_state &= ~I_NEW;
+	inode->i_state &= ~I_NEW & ~I_CREATING;
 	smp_mb();
 	wake_up_bit(&inode->i_state, __I_NEW);
 	spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(unlock_new_inode);
 
+void discard_new_inode(struct inode *inode)
+{
+	lockdep_annotate_inode_mutex_key(inode);
+	spin_lock(&inode->i_lock);
+	WARN_ON(!(inode->i_state & I_NEW));
+	inode->i_state &= ~I_NEW;
+	smp_mb();
+	wake_up_bit(&inode->i_state, __I_NEW);
+	spin_unlock(&inode->i_lock);
+	iput(inode);
+}
+EXPORT_SYMBOL(discard_new_inode);
+
 /**
  * lock_two_nondirectories - take two i_mutexes on non-directory objects
  *
@@ -1039,6 +1060,8 @@ again:
 		 * Use the old inode instead of the preallocated one.
 		 */
 		spin_unlock(&inode_hash_lock);
+		if (IS_ERR(old))
+			return NULL;
 		wait_on_inode(old);
 		if (unlikely(inode_unhashed(old))) {
 			iput(old);
@@ -1128,6 +1151,8 @@ again:
 	inode = find_inode_fast(sb, head, ino);
 	spin_unlock(&inode_hash_lock);
 	if (inode) {
+		if (IS_ERR(inode))
+			return NULL;
 		wait_on_inode(inode);
 		if (unlikely(inode_unhashed(inode))) {
 			iput(inode);
@@ -1165,6 +1190,8 @@ again:
 		 */
 		spin_unlock(&inode_hash_lock);
 		destroy_inode(inode);
+		if (IS_ERR(old))
+			return NULL;
 		inode = old;
 		wait_on_inode(inode);
 		if (unlikely(inode_unhashed(inode))) {
@@ -1282,7 +1309,7 @@ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
 	inode = find_inode(sb, head, test, data);
 	spin_unlock(&inode_hash_lock);
 
-	return inode;
+	return IS_ERR(inode) ? NULL : inode;
 }
 EXPORT_SYMBOL(ilookup5_nowait);
 
@@ -1338,6 +1365,8 @@ again:
 	spin_unlock(&inode_hash_lock);
 
 	if (inode) {
+		if (IS_ERR(inode))
+			return NULL;
 		wait_on_inode(inode);
 		if (unlikely(inode_unhashed(inode))) {
 			iput(inode);
@@ -1421,12 +1450,17 @@ int insert_inode_locked(struct inode *inode)
 		}
 		if (likely(!old)) {
 			spin_lock(&inode->i_lock);
-			inode->i_state |= I_NEW;
+			inode->i_state |= I_NEW | I_CREATING;
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode->i_lock);
 			spin_unlock(&inode_hash_lock);
 			return 0;
 		}
+		if (unlikely(old->i_state & I_CREATING)) {
+			spin_unlock(&old->i_lock);
+			spin_unlock(&inode_hash_lock);
+			return -EBUSY;
+		}
 		__iget(old);
 		spin_unlock(&old->i_lock);
 		spin_unlock(&inode_hash_lock);
@@ -1443,7 +1477,10 @@ EXPORT_SYMBOL(insert_inode_locked);
 int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 		int (*test)(struct inode *, void *), void *data)
 {
-	struct inode *old = inode_insert5(inode, hashval, test, NULL, data);
+	struct inode *old;
+
+	inode->i_state |= I_CREATING;
+	old = inode_insert5(inode, hashval, test, NULL, data);
 
 	if (old != inode) {
 		iput(old);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5c91108846db..a42600565925 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2016,6 +2016,8 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
  * I_OVL_INUSE		Used by overlayfs to get exclusive ownership on upper
  *			and work dirs among overlayfs mounts.
  *
+ * I_CREATING		New object's inode in the middle of setting up.
+ *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
 #define I_DIRTY_SYNC		(1 << 0)
@@ -2036,7 +2038,8 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 #define __I_DIRTY_TIME_EXPIRED	12
 #define I_DIRTY_TIME_EXPIRED	(1 << __I_DIRTY_TIME_EXPIRED)
 #define I_WB_SWITCH		(1 << 13)
-#define I_OVL_INUSE			(1 << 14)
+#define I_OVL_INUSE		(1 << 14)
+#define I_CREATING		(1 << 15)
 
 #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
@@ -2919,6 +2922,7 @@ extern void lockdep_annotate_inode_mutex_key(struct inode *inode);
 static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { };
 #endif
 extern void unlock_new_inode(struct inode *);
+extern void discard_new_inode(struct inode *);
 extern unsigned int get_next_ino(void);
 extern void evict_inodes(struct super_block *sb);
 
-- 
cgit v1.2.3


From 5bef915104f32c9d0bb5df6e86a98e31cb524e9a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 29 Jun 2018 19:36:57 -0400
Subject: new helper: inode_fake_hash()

open-coded in a quite a few places...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfs/inode.c     |  2 +-
 fs/jfs/jfs_imap.c  |  8 +-------
 fs/jfs/super.c     |  2 +-
 fs/xfs/xfs_iops.c  |  2 +-
 include/linux/fs.h | 11 +++++++++++
 5 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2a16111d312f..a2dfa1b2a89c 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -541,7 +541,7 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
 	HFS_I(inode)->rsrc_inode = dir;
 	HFS_I(dir)->rsrc_inode = inode;
 	igrab(dir);
-	hlist_add_fake(&inode->i_hash);
+	inode_fake_hash(inode);
 	mark_inode_dirty(inode);
 	dont_mount(dentry);
 out:
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f36ef68905a7..93e8c590ff5c 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -491,13 +491,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
 	/* release the page */
 	release_metapage(mp);
 
-	/*
-	 * __mark_inode_dirty expects inodes to be hashed.  Since we don't
-	 * want special inodes in the fileset inode space, we make them
-	 * appear hashed, but do not put on any lists.  hlist_del()
-	 * will work fine and require no locking.
-	 */
-	hlist_add_fake(&ip->i_hash);
+	inode_fake_hash(ip);
 
 	return (ip);
 }
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 1b9264fd54b6..5403ece57dba 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -581,7 +581,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	inode->i_ino = 0;
 	inode->i_size = i_size_read(sb->s_bdev->bd_inode);
 	inode->i_mapping->a_ops = &jfs_metapage_aops;
-	hlist_add_fake(&inode->i_hash);
+	inode_fake_hash(inode);
 	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
 
 	sbi->direct_inode = inode;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 0fa29f39d658..3a75de777843 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1253,7 +1253,7 @@ xfs_setup_inode(
 
 	inode_sb_list_add(inode);
 	/* make the inode look hashed for the writeback code */
-	hlist_add_fake(&inode->i_hash);
+	inode_fake_hash(inode);
 
 	inode->i_uid    = xfs_uid_to_kuid(ip->i_d.di_uid);
 	inode->i_gid    = xfs_gid_to_kgid(ip->i_d.di_gid);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a42600565925..43941e230e2b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -684,6 +684,17 @@ static inline int inode_unhashed(struct inode *inode)
 	return hlist_unhashed(&inode->i_hash);
 }
 
+/*
+ * __mark_inode_dirty expects inodes to be hashed.  Since we don't
+ * want special inodes in the fileset inode space, we make them
+ * appear hashed, but do not put on any lists.  hlist_del()
+ * will work fine and require no locking.
+ */
+static inline void inode_fake_hash(struct inode *inode)
+{
+	hlist_add_fake(&inode->i_hash);
+}
+
 /*
  * inode->i_mutex nesting subclasses for the lock validator:
  *
-- 
cgit v1.2.3


From 91305f2812624c0cf7ccbb44133b66d3b24676e4 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 1 Aug 2018 18:05:54 +0800
Subject: ptp_qoriq: support automatic configuration for ptp timer

This patch is to support automatic configuration for ptp timer.
If required ptp dts properties are not provided, driver could
try to calculate a set of default configurations to initialize
the ptp timer. This makes the driver work for many boards which
don't have the required ptp dts properties in current kernel.
Also the users could set dts properties by themselves according
to their requirement.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_qoriq.c       | 111 ++++++++++++++++++++++++++++++++++++++++--
 include/linux/fsl/ptp_qoriq.h |   6 ++-
 2 files changed, 113 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c
index a14c317b5a38..095c18532dc7 100644
--- a/drivers/ptp/ptp_qoriq.c
+++ b/drivers/ptp/ptp_qoriq.c
@@ -29,6 +29,7 @@
 #include <linux/of_platform.h>
 #include <linux/timex.h>
 #include <linux/slab.h>
+#include <linux/clk.h>
 
 #include <linux/fsl/ptp_qoriq.h>
 
@@ -317,6 +318,105 @@ static const struct ptp_clock_info ptp_qoriq_caps = {
 	.enable		= ptp_qoriq_enable,
 };
 
+/**
+ * qoriq_ptp_nominal_freq - calculate nominal frequency according to
+ *			    reference clock frequency
+ *
+ * @clk_src: reference clock frequency
+ *
+ * The nominal frequency is the desired clock frequency.
+ * It should be less than the reference clock frequency.
+ * It should be a factor of 1000MHz.
+ *
+ * Return the nominal frequency
+ */
+static u32 qoriq_ptp_nominal_freq(u32 clk_src)
+{
+	u32 remainder = 0;
+
+	clk_src /= 1000000;
+	remainder = clk_src % 100;
+	if (remainder) {
+		clk_src -= remainder;
+		clk_src += 100;
+	}
+
+	do {
+		clk_src -= 100;
+
+	} while (1000 % clk_src);
+
+	return clk_src * 1000000;
+}
+
+/**
+ * qoriq_ptp_auto_config - calculate a set of default configurations
+ *
+ * @qoriq_ptp: pointer to qoriq_ptp
+ * @node: pointer to device_node
+ *
+ * If below dts properties are not provided, this function will be
+ * called to calculate a set of default configurations for them.
+ *   "fsl,tclk-period"
+ *   "fsl,tmr-prsc"
+ *   "fsl,tmr-add"
+ *   "fsl,tmr-fiper1"
+ *   "fsl,tmr-fiper2"
+ *   "fsl,max-adj"
+ *
+ * Return 0 if success
+ */
+static int qoriq_ptp_auto_config(struct qoriq_ptp *qoriq_ptp,
+				 struct device_node *node)
+{
+	struct clk *clk;
+	u64 freq_comp;
+	u64 max_adj;
+	u32 nominal_freq;
+	u32 clk_src = 0;
+
+	qoriq_ptp->cksel = DEFAULT_CKSEL;
+
+	clk = of_clk_get(node, 0);
+	if (!IS_ERR(clk)) {
+		clk_src = clk_get_rate(clk);
+		clk_put(clk);
+	}
+
+	if (clk_src <= 100000000UL) {
+		pr_err("error reference clock value, or lower than 100MHz\n");
+		return -EINVAL;
+	}
+
+	nominal_freq = qoriq_ptp_nominal_freq(clk_src);
+	if (!nominal_freq)
+		return -EINVAL;
+
+	qoriq_ptp->tclk_period = 1000000000UL / nominal_freq;
+	qoriq_ptp->tmr_prsc = DEFAULT_TMR_PRSC;
+
+	/* Calculate initial frequency compensation value for TMR_ADD register.
+	 * freq_comp = ceil(2^32 / freq_ratio)
+	 * freq_ratio = reference_clock_freq / nominal_freq
+	 */
+	freq_comp = ((u64)1 << 32) * nominal_freq;
+	if (do_div(freq_comp, clk_src))
+		freq_comp++;
+
+	qoriq_ptp->tmr_add = freq_comp;
+	qoriq_ptp->tmr_fiper1 = DEFAULT_FIPER1_PERIOD - qoriq_ptp->tclk_period;
+	qoriq_ptp->tmr_fiper2 = DEFAULT_FIPER2_PERIOD - qoriq_ptp->tclk_period;
+
+	/* max_adj = 1000000000 * (freq_ratio - 1.0) - 1
+	 * freq_ratio = reference_clock_freq / nominal_freq
+	 */
+	max_adj = 1000000000ULL * (clk_src - nominal_freq);
+	max_adj = max_adj / nominal_freq - 1;
+	qoriq_ptp->caps.max_adj = max_adj;
+
+	return 0;
+}
+
 static int qoriq_ptp_probe(struct platform_device *dev)
 {
 	struct device_node *node = dev->dev.of_node;
@@ -332,7 +432,7 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 	if (!qoriq_ptp)
 		goto no_memory;
 
-	err = -ENODEV;
+	err = -EINVAL;
 
 	qoriq_ptp->caps = ptp_qoriq_caps;
 
@@ -351,10 +451,14 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 				 "fsl,tmr-fiper2", &qoriq_ptp->tmr_fiper2) ||
 	    of_property_read_u32(node,
 				 "fsl,max-adj", &qoriq_ptp->caps.max_adj)) {
-		pr_err("device tree node missing required elements\n");
-		goto no_node;
+		pr_warn("device tree node missing required elements, try automatic configuration\n");
+
+		if (qoriq_ptp_auto_config(qoriq_ptp, node))
+			goto no_config;
 	}
 
+	err = -ENODEV;
+
 	qoriq_ptp->irq = platform_get_irq(dev, 0);
 
 	if (qoriq_ptp->irq < 0) {
@@ -436,6 +540,7 @@ no_ioremap:
 	release_resource(qoriq_ptp->rsrc);
 no_resource:
 	free_irq(qoriq_ptp->irq, qoriq_ptp);
+no_config:
 no_node:
 	kfree(qoriq_ptp);
 no_memory:
diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h
index dc3dac40f069..c1f003aadcce 100644
--- a/include/linux/fsl/ptp_qoriq.h
+++ b/include/linux/fsl/ptp_qoriq.h
@@ -127,9 +127,13 @@ struct qoriq_ptp_registers {
 
 
 #define DRIVER		"ptp_qoriq"
-#define DEFAULT_CKSEL	1
 #define N_EXT_TS	2
 
+#define DEFAULT_CKSEL		1
+#define DEFAULT_TMR_PRSC	2
+#define DEFAULT_FIPER1_PERIOD	1000000000
+#define DEFAULT_FIPER2_PERIOD	100000
+
 struct qoriq_ptp {
 	void __iomem *base;
 	struct qoriq_ptp_registers regs;
-- 
cgit v1.2.3


From 385114dec8a49b5e5945e77ba7de6356106713f4 Mon Sep 17 00:00:00 2001
From: Peter Oskolkov <posk@google.com>
Date: Thu, 2 Aug 2018 23:34:38 +0000
Subject: net: modify skb_rbtree_purge to return the truesize of all purged
 skbs.

Tested: see the next patch is the series.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 net/core/skbuff.c      | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fd3cb1b247df..47848367c816 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2585,7 +2585,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
 		kfree_skb(skb);
 }
 
-void skb_rbtree_purge(struct rb_root *root);
+unsigned int skb_rbtree_purge(struct rb_root *root);
 
 void *netdev_alloc_frag(unsigned int fragsz);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 51b0a9126e12..8d574a88125d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2858,23 +2858,27 @@ EXPORT_SYMBOL(skb_queue_purge);
 /**
  *	skb_rbtree_purge - empty a skb rbtree
  *	@root: root of the rbtree to empty
+ *	Return value: the sum of truesizes of all purged skbs.
  *
  *	Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
  *	the list and one reference dropped. This function does not take
  *	any lock. Synchronization should be handled by the caller (e.g., TCP
  *	out-of-order queue is protected by the socket lock).
  */
-void skb_rbtree_purge(struct rb_root *root)
+unsigned int skb_rbtree_purge(struct rb_root *root)
 {
 	struct rb_node *p = rb_first(root);
+	unsigned int sum = 0;
 
 	while (p) {
 		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
 
 		p = rb_next(p);
 		rb_erase(&skb->rbnode, root);
+		sum += skb->truesize;
 		kfree_skb(skb);
 	}
+	return sum;
 }
 
 /**
-- 
cgit v1.2.3


From fa0f527358bd900ef92f925878ed6bfbd51305cc Mon Sep 17 00:00:00 2001
From: Peter Oskolkov <posk@google.com>
Date: Thu, 2 Aug 2018 23:34:39 +0000
Subject: ip: use rb trees for IP frag queue.

Similar to TCP OOO RX queue, it makes sense to use rb trees to store
IP fragments, so that OOO fragments are inserted faster.

Tested:

- a follow-up patch contains a rather comprehensive ip defrag
  self-test (functional)
- ran neper `udp_stream -c -H <host> -F 100 -l 300 -T 20`:
    netstat --statistics
    Ip:
        282078937 total packets received
        0 forwarded
        0 incoming packets discarded
        946760 incoming packets delivered
        18743456 requests sent out
        101 fragments dropped after timeout
        282077129 reassemblies required
        944952 packets reassembled ok
        262734239 packet reassembles failed
   (The numbers/stats above are somewhat better re:
    reassemblies vs a kernel without this patchset. More
    comprehensive performance testing TBD).

Reported-by: Jann Horn <jannh@google.com>
Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h                  |   9 +-
 include/net/inet_frag.h                 |   3 +-
 net/ipv4/inet_fragment.c                |  16 +--
 net/ipv4/ip_fragment.c                  | 182 ++++++++++++++++++--------------
 net/ipv6/netfilter/nf_conntrack_reasm.c |   1 +
 net/ipv6/reassembly.c                   |   1 +
 6 files changed, 121 insertions(+), 91 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 47848367c816..7ebdf158a795 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -676,13 +676,16 @@ struct sk_buff {
 				 * UDP receive path is one user.
 				 */
 				unsigned long		dev_scratch;
-				int			ip_defrag_offset;
 			};
 		};
-		struct rb_node		rbnode; /* used in netem & tcp stack */
+		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
 		struct list_head	list;
 	};
-	struct sock		*sk;
+
+	union {
+		struct sock		*sk;
+		int			ip_defrag_offset;
+	};
 
 	union {
 		ktime_t		tstamp;
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index f4272a29dc44..b86d14528188 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -75,7 +75,8 @@ struct inet_frag_queue {
 	struct timer_list	timer;
 	spinlock_t		lock;
 	refcount_t		refcnt;
-	struct sk_buff		*fragments;
+	struct sk_buff		*fragments;  /* Used in IPv6. */
+	struct rb_root		rb_fragments; /* Used in IPv4. */
 	struct sk_buff		*fragments_tail;
 	ktime_t			stamp;
 	int			len;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index ccd140e4082d..6d258a5669e7 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -137,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
 	fp = q->fragments;
 	nf = q->net;
 	f = nf->f;
-	while (fp) {
-		struct sk_buff *xp = fp->next;
-
-		sum_truesize += fp->truesize;
-		kfree_skb(fp);
-		fp = xp;
+	if (fp) {
+		do {
+			struct sk_buff *xp = fp->next;
+
+			sum_truesize += fp->truesize;
+			kfree_skb(fp);
+			fp = xp;
+		} while (fp);
+	} else {
+		sum_truesize = skb_rbtree_purge(&q->rb_fragments);
 	}
 	sum = sum_truesize + f->qsize;
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 960bf5eab59f..0e8f8de77e71 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
 	const struct iphdr *iph;
-	struct sk_buff *head;
+	struct sk_buff *head = NULL;
 	struct net *net;
 	struct ipq *qp;
 	int err;
@@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t)
 
 	ipq_kill(qp);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
-
-	head = qp->q.fragments;
-
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-	if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+	if (!qp->q.flags & INET_FRAG_FIRST_IN)
 		goto out;
 
+	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
+	 * pull the head out of the tree in order to be able to
+	 * deal with head->dev.
+	 */
+	if (qp->q.fragments) {
+		head = qp->q.fragments;
+		qp->q.fragments = head->next;
+	} else {
+		head = skb_rb_first(&qp->q.rb_fragments);
+		if (!head)
+			goto out;
+		rb_erase(&head->rbnode, &qp->q.rb_fragments);
+		memset(&head->rbnode, 0, sizeof(head->rbnode));
+		barrier();
+	}
+	if (head == qp->q.fragments_tail)
+		qp->q.fragments_tail = NULL;
+
+	sub_frag_mem_limit(qp->q.net, head->truesize);
+
 	head->dev = dev_get_by_index_rcu(net, qp->iif);
 	if (!head->dev)
 		goto out;
@@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t)
 	    (skb_rtable(head)->rt_type != RTN_LOCAL))
 		goto out;
 
-	skb_get(head);
 	spin_unlock(&qp->q.lock);
 	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
-	kfree_skb(head);
 	goto out_rcu_unlock;
 
 out:
 	spin_unlock(&qp->q.lock);
 out_rcu_unlock:
 	rcu_read_unlock();
+	if (head)
+		kfree_skb(head);
 	ipq_put(qp);
 }
 
@@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp)
 	end = atomic_inc_return(&peer->rid);
 	qp->rid = end;
 
-	rc = qp->q.fragments && (end - start) > max;
+	rc = qp->q.fragments_tail && (end - start) > max;
 
 	if (rc) {
 		struct net *net;
@@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *qp)
 
 static int ip_frag_reinit(struct ipq *qp)
 {
-	struct sk_buff *fp;
 	unsigned int sum_truesize = 0;
 
 	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
@@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp)
 		return -ETIMEDOUT;
 	}
 
-	fp = qp->q.fragments;
-	do {
-		struct sk_buff *xp = fp->next;
-
-		sum_truesize += fp->truesize;
-		kfree_skb(fp);
-		fp = xp;
-	} while (fp);
+	sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
 	sub_frag_mem_limit(qp->q.net, sum_truesize);
 
 	qp->q.flags = 0;
 	qp->q.len = 0;
 	qp->q.meat = 0;
 	qp->q.fragments = NULL;
+	qp->q.rb_fragments = RB_ROOT;
 	qp->q.fragments_tail = NULL;
 	qp->iif = 0;
 	qp->ecn = 0;
@@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp)
 static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
 	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
-	struct sk_buff *prev, *next;
+	struct rb_node **rbn, *parent;
+	struct sk_buff *skb1;
 	struct net_device *dev;
 	unsigned int fragsize;
 	int flags, offset;
@@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	if (err)
 		goto err;
 
-	/* Find out which fragments are in front and at the back of us
-	 * in the chain of fragments so far.  We must know where to put
-	 * this fragment, right?
-	 */
-	prev = qp->q.fragments_tail;
-	if (!prev || prev->ip_defrag_offset < offset) {
-		next = NULL;
-		goto found;
-	}
-	prev = NULL;
-	for (next = qp->q.fragments; next != NULL; next = next->next) {
-		if (next->ip_defrag_offset >= offset)
-			break;	/* bingo! */
-		prev = next;
-	}
+	/* Note : skb->rbnode and skb->dev share the same location. */
+	dev = skb->dev;
+	/* Makes sure compiler wont do silly aliasing games */
+	barrier();
 
-found:
 	/* RFC5722, Section 4, amended by Errata ID : 3089
 	 *                          When reassembling an IPv6 datagram, if
 	 *   one or more its constituent fragments is determined to be an
 	 *   overlapping fragment, the entire datagram (and any constituent
 	 *   fragments) MUST be silently discarded.
 	 *
-	 * We do the same here for IPv4.
+	 * We do the same here for IPv4 (and increment an snmp counter).
 	 */
 
-	/* Is there an overlap with the previous fragment? */
-	if (prev &&
-	    (prev->ip_defrag_offset + prev->len) > offset)
-		goto discard_qp;
-
-	/* Is there an overlap with the next fragment? */
-	if (next && next->ip_defrag_offset < end)
-		goto discard_qp;
+	/* Find out where to put this fragment.  */
+	skb1 = qp->q.fragments_tail;
+	if (!skb1) {
+		/* This is the first fragment we've received. */
+		rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
+		qp->q.fragments_tail = skb;
+	} else if ((skb1->ip_defrag_offset + skb1->len) < end) {
+		/* This is the common/special case: skb goes to the end. */
+		/* Detect and discard overlaps. */
+		if (offset < (skb1->ip_defrag_offset + skb1->len))
+			goto discard_qp;
+		/* Insert after skb1. */
+		rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+		qp->q.fragments_tail = skb;
+	} else {
+		/* Binary search. Note that skb can become the first fragment, but
+		 * not the last (covered above). */
+		rbn = &qp->q.rb_fragments.rb_node;
+		do {
+			parent = *rbn;
+			skb1 = rb_to_skb(parent);
+			if (end <= skb1->ip_defrag_offset)
+				rbn = &parent->rb_left;
+			else if (offset >= skb1->ip_defrag_offset + skb1->len)
+				rbn = &parent->rb_right;
+			else /* Found an overlap with skb1. */
+				goto discard_qp;
+		} while (*rbn);
+		/* Here we have parent properly set, and rbn pointing to
+		 * one of its NULL left/right children. Insert skb. */
+		rb_link_node(&skb->rbnode, parent, rbn);
+	}
+	rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
 
-	/* Note : skb->ip_defrag_offset and skb->dev share the same location */
-	dev = skb->dev;
 	if (dev)
 		qp->iif = dev->ifindex;
-	/* Makes sure compiler wont do silly aliasing games */
-	barrier();
 	skb->ip_defrag_offset = offset;
 
-	/* Insert this fragment in the chain of fragments. */
-	skb->next = next;
-	if (!next)
-		qp->q.fragments_tail = skb;
-	if (prev)
-		prev->next = skb;
-	else
-		qp->q.fragments = skb;
-
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
 	qp->ecn |= ecn;
@@ -414,7 +425,7 @@ found:
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		err = ip_frag_reasm(qp, prev, dev);
+		err = ip_frag_reasm(qp, skb, dev);
 		skb->_skb_refdst = orefdst;
 		return err;
 	}
@@ -431,15 +442,15 @@ err:
 	return err;
 }
 
-
 /* Build a new IP datagram from all its fragments. */
-
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 			 struct net_device *dev)
 {
 	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
 	struct iphdr *iph;
-	struct sk_buff *fp, *head = qp->q.fragments;
+	struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
+	struct sk_buff **nextp; /* To build frag_list. */
+	struct rb_node *rbn;
 	int len;
 	int ihlen;
 	int err;
@@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		goto out_fail;
 	}
 	/* Make the one we just received the head. */
-	if (prev) {
-		head = prev->next;
-		fp = skb_clone(head, GFP_ATOMIC);
+	if (head != skb) {
+		fp = skb_clone(skb, GFP_ATOMIC);
 		if (!fp)
 			goto out_nomem;
-
-		fp->next = head->next;
-		if (!fp->next)
+		rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+		if (qp->q.fragments_tail == skb)
 			qp->q.fragments_tail = fp;
-		prev->next = fp;
-
-		skb_morph(head, qp->q.fragments);
-		head->next = qp->q.fragments->next;
-
-		consume_skb(qp->q.fragments);
-		qp->q.fragments = head;
+		skb_morph(skb, head);
+		rb_replace_node(&head->rbnode, &skb->rbnode,
+				&qp->q.rb_fragments);
+		consume_skb(head);
+		head = skb;
 	}
 
-	WARN_ON(!head);
 	WARN_ON(head->ip_defrag_offset != 0);
 
 	/* Allocate a new buffer for the datagram. */
@@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		clone = alloc_skb(0, GFP_ATOMIC);
 		if (!clone)
 			goto out_nomem;
-		clone->next = head->next;
-		head->next = clone;
 		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
 		skb_frag_list_init(head);
 		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
 			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
 		clone->len = clone->data_len = head->data_len - plen;
-		head->data_len -= clone->len;
-		head->len -= clone->len;
+		skb->truesize += clone->truesize;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
 		add_frag_mem_limit(qp->q.net, clone->truesize);
+		skb_shinfo(head)->frag_list = clone;
+		nextp = &clone->next;
+	} else {
+		nextp = &skb_shinfo(head)->frag_list;
 	}
 
-	skb_shinfo(head)->frag_list = head->next;
 	skb_push(head, head->data - skb_network_header(head));
 
-	for (fp=head->next; fp; fp = fp->next) {
+	/* Traverse the tree in order, to build frag_list. */
+	rbn = rb_next(&head->rbnode);
+	rb_erase(&head->rbnode, &qp->q.rb_fragments);
+	while (rbn) {
+		struct rb_node *rbnext = rb_next(rbn);
+		fp = rb_to_skb(rbn);
+		rb_erase(rbn, &qp->q.rb_fragments);
+		rbn = rbnext;
+		*nextp = fp;
+		nextp = &fp->next;
+		fp->prev = NULL;
+		memset(&fp->rbnode, 0, sizeof(fp->rbnode));
 		head->data_len += fp->len;
 		head->len += fp->len;
 		if (head->ip_summed != fp->ip_summed)
@@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	}
 	sub_frag_mem_limit(qp->q.net, head->truesize);
 
+	*nextp = NULL;
 	head->next = NULL;
+	head->prev = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
 	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
+	qp->q.rb_fragments = RB_ROOT;
 	qp->q.fragments_tail = NULL;
 	return 0;
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 0610bdab721c..38d69ef516d5 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -463,6 +463,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev,  struct net_devic
 					  head->csum);
 
 	fq->q.fragments = NULL;
+	fq->q.rb_fragments = RB_ROOT;
 	fq->q.fragments_tail = NULL;
 
 	return true;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 6edd2ac8ae4b..b4e558ab39fa 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -405,6 +405,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
 	rcu_read_unlock();
 	fq->q.fragments = NULL;
+	fq->q.rb_fragments = RB_ROOT;
 	fq->q.fragments_tail = NULL;
 	return 1;
 
-- 
cgit v1.2.3


From 55f2503c3b69328735e88031ff8d6ba291bd952b Mon Sep 17 00:00:00 2001
From: Pingfan Liu <kernelfans@gmail.com>
Date: Tue, 31 Jul 2018 16:51:32 +0800
Subject: PM / reboot: Eliminate race between reboot and suspend

At present, "systemctl suspend" and "shutdown" can run in parrallel. A
system can suspend after devices_shutdown(), and resume. Then the shutdown
task goes on to power off. This causes many devices are not really shut
off. Hence replacing reboot_mutex with system_transition_mutex (renamed
from pm_mutex) to achieve the exclusion. The renaming of pm_mutex as
system_transition_mutex can be better to reflect the purpose of the mutex.

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/freezing-of-tasks.txt      | 12 ++++++------
 Documentation/power/suspend-and-cpuhotplug.txt |  6 +++---
 include/linux/suspend.h                        |  2 +-
 kernel/freezer.c                               |  4 +++-
 kernel/power/hibernate.c                       | 15 ++++++++-------
 kernel/power/main.c                            | 12 ++++++------
 kernel/power/suspend.c                         |  4 ++--
 kernel/power/user.c                            |  4 ++--
 kernel/reboot.c                                |  6 +++---
 mm/page_alloc.c                                | 11 ++++++-----
 10 files changed, 40 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
index af005770e767..cd283190855a 100644
--- a/Documentation/power/freezing-of-tasks.txt
+++ b/Documentation/power/freezing-of-tasks.txt
@@ -204,26 +204,26 @@ VI. Are there any precautions to be taken to prevent freezing failures?
 
 Yes, there are.
 
-First of all, grabbing the 'pm_mutex' lock to mutually exclude a piece of code
+First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code
 from system-wide sleep such as suspend/hibernation is not encouraged.
 If possible, that piece of code must instead hook onto the suspend/hibernation
 notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code
 (kernel/cpu.c) for an example.
 
-However, if that is not feasible, and grabbing 'pm_mutex' is deemed necessary,
-it is strongly discouraged to directly call mutex_[un]lock(&pm_mutex) since
+However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary,
+it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since
 that could lead to freezing failures, because if the suspend/hibernate code
-successfully acquired the 'pm_mutex' lock, and hence that other entity failed
+successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed
 to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE
 state. As a consequence, the freezer would not be able to freeze that task,
 leading to freezing failure.
 
 However, the [un]lock_system_sleep() APIs are safe to use in this scenario,
 since they ask the freezer to skip freezing this task, since it is anyway
-"frozen enough" as it is blocked on 'pm_mutex', which will be released
+"frozen enough" as it is blocked on 'system_transition_mutex', which will be released
 only after the entire suspend/hibernation sequence is complete.
 So, to summarize, use [un]lock_system_sleep() instead of directly using
-mutex_[un]lock(&pm_mutex). That would prevent freezing failures.
+mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures.
 
 V. Miscellaneous
 /sys/power/pm_freeze_timeout controls how long it will cost at most to freeze
diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt
index 6f55eb960a6d..a8751b8df10e 100644
--- a/Documentation/power/suspend-and-cpuhotplug.txt
+++ b/Documentation/power/suspend-and-cpuhotplug.txt
@@ -32,7 +32,7 @@ More details follow:
                                     sysfs file
                                         |
                                         v
-                               Acquire pm_mutex lock
+                               Acquire system_transition_mutex lock
                                         |
                                         v
                              Send PM_SUSPEND_PREPARE
@@ -96,10 +96,10 @@ execution during resume):
 
 * thaw tasks
 * send PM_POST_SUSPEND notifications
-* Release pm_mutex lock.
+* Release system_transition_mutex lock.
 
 
-It is to be noted here that the pm_mutex lock is acquired at the very
+It is to be noted here that the system_transition_mutex lock is acquired at the very
 beginning, when we are just starting out to suspend, and then released only
 after the entire cycle is complete (i.e., suspend + resume).
 
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 440b62f7502e..5a28ac9284f0 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -414,7 +414,7 @@ static inline bool hibernation_available(void) { return false; }
 #define PM_RESTORE_PREPARE	0x0005 /* Going to restore a saved image */
 #define PM_POST_RESTORE		0x0006 /* Restore failed */
 
-extern struct mutex pm_mutex;
+extern struct mutex system_transition_mutex;
 
 #ifdef CONFIG_PM_SLEEP
 void save_processor_state(void);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 6f56a9e219fa..b162b74611e4 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -15,7 +15,9 @@
 atomic_t system_freezing_cnt = ATOMIC_INIT(0);
 EXPORT_SYMBOL(system_freezing_cnt);
 
-/* indicate whether PM freezing is in effect, protected by pm_mutex */
+/* indicate whether PM freezing is in effect, protected by
+ * system_transition_mutex
+ */
 bool pm_freezing;
 bool pm_nosig_freezing;
 
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 38f3f96b4b12..abef759de7c8 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -338,7 +338,7 @@ static int create_image(int platform_mode)
  * hibernation_snapshot - Quiesce devices and create a hibernation image.
  * @platform_mode: If set, use platform driver to prepare for the transition.
  *
- * This routine must be called with pm_mutex held.
+ * This routine must be called with system_transition_mutex held.
  */
 int hibernation_snapshot(int platform_mode)
 {
@@ -500,8 +500,9 @@ static int resume_target_kernel(bool platform_mode)
  * hibernation_restore - Quiesce devices and restore from a hibernation image.
  * @platform_mode: If set, use platform driver to prepare for the transition.
  *
- * This routine must be called with pm_mutex held.  If it is successful, control
- * reappears in the restored target kernel in hibernation_snapshot().
+ * This routine must be called with system_transition_mutex held.  If it is
+ * successful, control reappears in the restored target kernel in
+ * hibernation_snapshot().
  */
 int hibernation_restore(int platform_mode)
 {
@@ -806,13 +807,13 @@ static int software_resume(void)
 	 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
 	 * is configured into the kernel. Since the regular hibernate
 	 * trigger path is via sysfs which takes a buffer mutex before
-	 * calling hibernate functions (which take pm_mutex) this can
-	 * cause lockdep to complain about a possible ABBA deadlock
+	 * calling hibernate functions (which take system_transition_mutex)
+	 * this can cause lockdep to complain about a possible ABBA deadlock
 	 * which cannot happen since we're in the boot code here and
 	 * sysfs can't be invoked yet. Therefore, we use a subclass
 	 * here to avoid lockdep complaining.
 	 */
-	mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
+	mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING);
 
 	if (swsusp_resume_device)
 		goto Check_image;
@@ -900,7 +901,7 @@ static int software_resume(void)
 	atomic_inc(&snapshot_device_available);
 	/* For success case, the suspend path will release the lock */
  Unlock:
-	mutex_unlock(&pm_mutex);
+	mutex_unlock(&system_transition_mutex);
 	pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
 	return error;
  Close_Finish:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d9706da10930..35b50823d83b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -15,17 +15,16 @@
 #include <linux/workqueue.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/suspend.h>
 
 #include "power.h"
 
-DEFINE_MUTEX(pm_mutex);
-
 #ifdef CONFIG_PM_SLEEP
 
 void lock_system_sleep(void)
 {
 	current->flags |= PF_FREEZER_SKIP;
-	mutex_lock(&pm_mutex);
+	mutex_lock(&system_transition_mutex);
 }
 EXPORT_SYMBOL_GPL(lock_system_sleep);
 
@@ -37,8 +36,9 @@ void unlock_system_sleep(void)
 	 *
 	 * Reason:
 	 * Fundamentally, we just don't need it, because freezing condition
-	 * doesn't come into effect until we release the pm_mutex lock,
-	 * since the freezer always works with pm_mutex held.
+	 * doesn't come into effect until we release the
+	 * system_transition_mutex lock, since the freezer always works with
+	 * system_transition_mutex held.
 	 *
 	 * More importantly, in the case of hibernation,
 	 * unlock_system_sleep() gets called in snapshot_read() and
@@ -47,7 +47,7 @@ void unlock_system_sleep(void)
 	 * enter the refrigerator, thus causing hibernation to lockup.
 	 */
 	current->flags &= ~PF_FREEZER_SKIP;
-	mutex_unlock(&pm_mutex);
+	mutex_unlock(&system_transition_mutex);
 }
 EXPORT_SYMBOL_GPL(unlock_system_sleep);
 
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 87331565e505..9e13afe65a14 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -556,7 +556,7 @@ static int enter_state(suspend_state_t state)
 	} else if (!valid_state(state)) {
 		return -EINVAL;
 	}
-	if (!mutex_trylock(&pm_mutex))
+	if (!mutex_trylock(&system_transition_mutex))
 		return -EBUSY;
 
 	if (state == PM_SUSPEND_TO_IDLE)
@@ -590,7 +590,7 @@ static int enter_state(suspend_state_t state)
 	pm_pr_dbg("Finishing wakeup.\n");
 	suspend_finish();
  Unlock:
-	mutex_unlock(&pm_mutex);
+	mutex_unlock(&system_transition_mutex);
 	return error;
 }
 
diff --git a/kernel/power/user.c b/kernel/power/user.c
index abd225550271..2d8b60a3c86b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -216,7 +216,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!mutex_trylock(&pm_mutex))
+	if (!mutex_trylock(&system_transition_mutex))
 		return -EBUSY;
 
 	lock_device_hotplug();
@@ -394,7 +394,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 	}
 
 	unlock_device_hotplug();
-	mutex_unlock(&pm_mutex);
+	mutex_unlock(&system_transition_mutex);
 
 	return error;
 }
diff --git a/kernel/reboot.c b/kernel/reboot.c
index e4ced883d8de..8fb44dec9ad7 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -294,7 +294,7 @@ void kernel_power_off(void)
 }
 EXPORT_SYMBOL_GPL(kernel_power_off);
 
-static DEFINE_MUTEX(reboot_mutex);
+DEFINE_MUTEX(system_transition_mutex);
 
 /*
  * Reboot system call: for obvious reasons only root may call it,
@@ -338,7 +338,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
 		cmd = LINUX_REBOOT_CMD_HALT;
 
-	mutex_lock(&reboot_mutex);
+	mutex_lock(&system_transition_mutex);
 	switch (cmd) {
 	case LINUX_REBOOT_CMD_RESTART:
 		kernel_restart(NULL);
@@ -389,7 +389,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		ret = -EINVAL;
 		break;
 	}
-	mutex_unlock(&reboot_mutex);
+	mutex_unlock(&system_transition_mutex);
 	return ret;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1521100f1e63..e47f9e467408 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -155,16 +155,17 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype)
  * The following functions are used by the suspend/hibernate code to temporarily
  * change gfp_allowed_mask in order to avoid using I/O during memory allocations
  * while devices are suspended.  To avoid races with the suspend/hibernate code,
- * they should always be called with pm_mutex held (gfp_allowed_mask also should
- * only be modified with pm_mutex held, unless the suspend/hibernate code is
- * guaranteed not to run in parallel with that modification).
+ * they should always be called with system_transition_mutex held
+ * (gfp_allowed_mask also should only be modified with system_transition_mutex
+ * held, unless the suspend/hibernate code is guaranteed not to run in parallel
+ * with that modification).
  */
 
 static gfp_t saved_gfp_mask;
 
 void pm_restore_gfp_mask(void)
 {
-	WARN_ON(!mutex_is_locked(&pm_mutex));
+	WARN_ON(!mutex_is_locked(&system_transition_mutex));
 	if (saved_gfp_mask) {
 		gfp_allowed_mask = saved_gfp_mask;
 		saved_gfp_mask = 0;
@@ -173,7 +174,7 @@ void pm_restore_gfp_mask(void)
 
 void pm_restrict_gfp_mask(void)
 {
-	WARN_ON(!mutex_is_locked(&pm_mutex));
+	WARN_ON(!mutex_is_locked(&system_transition_mutex));
 	WARN_ON(saved_gfp_mask);
 	saved_gfp_mask = gfp_allowed_mask;
 	gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
-- 
cgit v1.2.3


From bc2d8d262cba5736332cbc866acb11b1c5748aa9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Aug 2018 08:19:57 +0200
Subject: cpu/hotplug: Fix SMT supported evaluation

Josh reported that the late SMT evaluation in cpu_smt_state_init() sets
cpu_smt_control to CPU_SMT_NOT_SUPPORTED in case that 'nosmt' was supplied
on the kernel command line as it cannot differentiate between SMT disabled
by BIOS and SMT soft disable via 'nosmt'. That wreckages the state and
makes the sysfs interface unusable.

Rework this so that during bringup of the non boot CPUs the availability of
SMT is determined in cpu_smt_allowed(). If a newly booted CPU is not a
'primary' thread then set the local cpu_smt_available marker and evaluate
this explicitely right after the initial SMP bringup has finished.

SMT evaulation on x86 is a trainwreck as the firmware has all the
information _before_ booting the kernel, but there is no interface to query
it.

Fixes: 73d5e2b47264 ("cpu/hotplug: detect SMT disabled by BIOS")
Reported-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/bugs.c |  2 +-
 include/linux/cpu.h        |  2 ++
 kernel/cpu.c               | 41 ++++++++++++++++++++++++++++-------------
 kernel/smp.c               |  2 ++
 4 files changed, 33 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 86f4549b7c9f..be79b7599dda 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -63,7 +63,7 @@ void __init check_bugs(void)
 	 * identify_boot_cpu() initialized SMT support information, let the
 	 * core code know.
 	 */
-	cpu_smt_check_topology();
+	cpu_smt_check_topology_early();
 
 	if (!IS_ENABLED(CONFIG_SMP)) {
 		pr_info("CPU: ");
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index ccdf3a67ce56..b216bd5bfd20 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -178,10 +178,12 @@ enum cpuhp_smt_control {
 #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
 extern enum cpuhp_smt_control cpu_smt_control;
 extern void cpu_smt_disable(bool force);
+extern void cpu_smt_check_topology_early(void);
 extern void cpu_smt_check_topology(void);
 #else
 # define cpu_smt_control		(CPU_SMT_ENABLED)
 static inline void cpu_smt_disable(bool force) { }
+static inline void cpu_smt_check_topology_early(void) { }
 static inline void cpu_smt_check_topology(void) { }
 #endif
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2bc3d2f5b2a5..9d2512dd263c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -347,6 +347,8 @@ EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
 enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
 EXPORT_SYMBOL_GPL(cpu_smt_control);
 
+static bool cpu_smt_available __read_mostly;
+
 void __init cpu_smt_disable(bool force)
 {
 	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
@@ -363,14 +365,28 @@ void __init cpu_smt_disable(bool force)
 
 /*
  * The decision whether SMT is supported can only be done after the full
- * CPU identification. Called from architecture code.
+ * CPU identification. Called from architecture code before non boot CPUs
+ * are brought up.
  */
-void __init cpu_smt_check_topology(void)
+void __init cpu_smt_check_topology_early(void)
 {
 	if (!topology_smt_supported())
 		cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
 }
 
+/*
+ * If SMT was disabled by BIOS, detect it here, after the CPUs have been
+ * brought online. This ensures the smt/l1tf sysfs entries are consistent
+ * with reality. cpu_smt_available is set to true during the bringup of non
+ * boot CPUs when a SMT sibling is detected. Note, this may overwrite
+ * cpu_smt_control's previous setting.
+ */
+void __init cpu_smt_check_topology(void)
+{
+	if (!cpu_smt_available)
+		cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+}
+
 static int __init smt_cmdline_disable(char *str)
 {
 	cpu_smt_disable(str && !strcmp(str, "force"));
@@ -380,10 +396,18 @@ early_param("nosmt", smt_cmdline_disable);
 
 static inline bool cpu_smt_allowed(unsigned int cpu)
 {
-	if (cpu_smt_control == CPU_SMT_ENABLED)
+	if (topology_is_primary_thread(cpu))
 		return true;
 
-	if (topology_is_primary_thread(cpu))
+	/*
+	 * If the CPU is not a 'primary' thread and the booted_once bit is
+	 * set then the processor has SMT support. Store this information
+	 * for the late check of SMT support in cpu_smt_check_topology().
+	 */
+	if (per_cpu(cpuhp_state, cpu).booted_once)
+		cpu_smt_available = true;
+
+	if (cpu_smt_control == CPU_SMT_ENABLED)
 		return true;
 
 	/*
@@ -2125,15 +2149,6 @@ static const struct attribute_group cpuhp_smt_attr_group = {
 
 static int __init cpu_smt_state_init(void)
 {
-	/*
-	 * If SMT was disabled by BIOS, detect it here, after the CPUs have
-	 * been brought online.  This ensures the smt/l1tf sysfs entries are
-	 * consistent with reality.  Note this may overwrite cpu_smt_control's
-	 * previous setting.
-	 */
-	if (topology_max_smt_threads() == 1)
-		cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
-
 	return sysfs_create_group(&cpu_subsys.dev_root->kobj,
 				  &cpuhp_smt_attr_group);
 }
diff --git a/kernel/smp.c b/kernel/smp.c
index 084c8b3a2681..d86eec5f51c1 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -584,6 +584,8 @@ void __init smp_init(void)
 		num_nodes, (num_nodes > 1 ? "s" : ""),
 		num_cpus,  (num_cpus  > 1 ? "s" : ""));
 
+	/* Final decision about SMT support */
+	cpu_smt_check_topology();
 	/* Any cleanup work */
 	smp_cpus_done(setup_max_cpus);
 }
-- 
cgit v1.2.3


From 6fdecfe32f903d9f5f35ee4464fd32ede470dcad Mon Sep 17 00:00:00 2001
From: Arun Parameswaran <arun.parameswaran@broadcom.com>
Date: Tue, 7 Aug 2018 10:02:44 -0700
Subject: net: phy: Add support for Broadcom Omega internal Combo GPHY

Add support for the Broadcom Omega SoC internal Combo Ethernet
GPHY to the bcm7xxx phy driver.

Signed-off-by: Arun Parameswaran <arun.parameswaran@broadcom.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/bcm7xxx.c | 2 ++
 include/linux/brcmphy.h   | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c
index 01d2ff2f6241..b2b6307d64a4 100644
--- a/drivers/net/phy/bcm7xxx.c
+++ b/drivers/net/phy/bcm7xxx.c
@@ -229,6 +229,7 @@ static int bcm7xxx_28nm_config_init(struct phy_device *phydev)
 	phy_read(phydev, MII_BMSR);
 
 	switch (rev) {
+	case 0xa0:
 	case 0xb0:
 		ret = bcm7xxx_28nm_b0_afe_config_init(phydev);
 		break;
@@ -659,6 +660,7 @@ static struct phy_driver bcm7xxx_driver[] = {
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7439, "Broadcom BCM7439"),
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7439_2, "Broadcom BCM7439 (2)"),
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7445, "Broadcom BCM7445"),
+	BCM7XXX_28NM_GPHY(PHY_ID_BCM_OMEGA, "Broadcom Omega Combo GPHY"),
 	BCM7XXX_40NM_EPHY(PHY_ID_BCM7346, "Broadcom BCM7346"),
 	BCM7XXX_40NM_EPHY(PHY_ID_BCM7362, "Broadcom BCM7362"),
 	BCM7XXX_40NM_EPHY(PHY_ID_BCM7425, "Broadcom BCM7425"),
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index daa9234a9baf..949e9af8d9d6 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -45,6 +45,7 @@
 #define PHY_ID_BCM7445			0x600d8510
 
 #define PHY_ID_BCM_CYGNUS		0xae025200
+#define PHY_ID_BCM_OMEGA		0xae025100
 
 #define PHY_BCM_OUI_MASK		0xfffffc00
 #define PHY_BCM_OUI_1			0x00206000
-- 
cgit v1.2.3


From 8b92d0e3d400390660a26ef7f475524700fb86cf Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.com>
Date: Wed, 8 Aug 2018 08:35:29 +0200
Subject: nvme.h: fixup ANA group descriptor format

ANA Phase 3 draft had the 'reserved' field in the group descriptor
format set to '23:17' (so that the first namespace identifier started
at byte 24), but that got move with the approved TP to '31:17'
(so that the first namespace identifier started at byte 32).

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 64c9175723de..a661861e9d56 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -446,7 +446,7 @@ struct nvme_ana_group_desc {
 	__le32	nnsids;
 	__le64	chgcnt;
 	__u8	state;
-	__u8	rsvd17[7];
+	__u8	rsvd17[15];
 	__le32	nsids[];
 };
 
-- 
cgit v1.2.3


From 93045d5942da60801e71764597d448cf37a798c1 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 7 Aug 2018 23:01:05 -0700
Subject: nvme.h: add support for ns write protect definitions

Add various definitions from NVMe 1.3 TP 4005.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index a661861e9d56..68e91ef5494c 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -259,7 +259,7 @@ struct nvme_id_ctrl {
 	__le16			awun;
 	__le16			awupf;
 	__u8			nvscc;
-	__u8			rsvd531;
+	__u8			nwpc;
 	__le16			acwu;
 	__u8			rsvd534[2];
 	__le32			sgls;
@@ -320,7 +320,9 @@ struct nvme_id_ns {
 	__u8			nvmcap[16];
 	__u8			rsvd64[28];
 	__le32			anagrpid;
-	__u8			rsvd96[8];
+	__u8			rsvd96[3];
+	__u8			nsattr;
+	__u8			rsvd100[4];
 	__u8			nguid[16];
 	__u8			eui64[8];
 	struct nvme_lbaf	lbaf[16];
@@ -794,6 +796,7 @@ enum {
 	NVME_FEAT_HOST_ID	= 0x81,
 	NVME_FEAT_RESV_MASK	= 0x82,
 	NVME_FEAT_RESV_PERSIST	= 0x83,
+	NVME_FEAT_WRITE_PROTECT	= 0x84,
 	NVME_LOG_ERROR		= 0x01,
 	NVME_LOG_SMART		= 0x02,
 	NVME_LOG_FW_SLOT	= 0x03,
@@ -807,6 +810,14 @@ enum {
 	NVME_FWACT_ACTV		= (2 << 3),
 };
 
+/* NVMe Namespace Write Protect State */
+enum {
+	NVME_NS_NO_WRITE_PROTECT = 0,
+	NVME_NS_WRITE_PROTECT,
+	NVME_NS_WRITE_PROTECT_POWER_CYCLE,
+	NVME_NS_WRITE_PROTECT_PERMANENT,
+};
+
 #define NVME_MAX_CHANGED_NAMESPACES	1024
 
 struct nvme_identify {
@@ -1153,6 +1164,8 @@ enum {
 	NVME_SC_SGL_INVALID_OFFSET	= 0x16,
 	NVME_SC_SGL_INVALID_SUBTYPE	= 0x17,
 
+	NVME_SC_NS_WRITE_PROTECTED	= 0x20,
+
 	NVME_SC_LBA_RANGE		= 0x80,
 	NVME_SC_CAP_EXCEEDED		= 0x81,
 	NVME_SC_NS_NOT_READY		= 0x82,
-- 
cgit v1.2.3


From 212dfd909ea8b630e5d6fa4d25aeec9c4b4b14a5 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Tue, 7 Aug 2018 22:06:52 +0200
Subject: netfilter: nfnetlink_osf: add missing enum in nfnetlink_osf uapi
 header

xt_osf_window_size_options was originally part of
include/uapi/linux/netfilter/xt_osf.h, restore it.

Fixes: bfb15f2a95cb ("netfilter: extract Passive OS fingerprint infrastructure from xt_osf")
Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink_osf.h      | 12 ------------
 include/uapi/linux/netfilter/nfnetlink_osf.h | 12 ++++++++++++
 include/uapi/linux/netfilter/xt_osf.h        |  1 +
 3 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h
index a7311bc03d3a..ecf7dab81e9e 100644
--- a/include/linux/netfilter/nfnetlink_osf.h
+++ b/include/linux/netfilter/nfnetlink_osf.h
@@ -4,18 +4,6 @@
 
 #include <uapi/linux/netfilter/nfnetlink_osf.h>
 
-/* Initial window size option state machine: multiple of mss, mtu or
- * plain numeric value. Can also be made as plain numeric value which
- * is not a multiple of specified value.
- */
-enum nf_osf_window_size_options {
-	OSF_WSS_PLAIN   = 0,
-	OSF_WSS_MSS,
-	OSF_WSS_MTU,
-	OSF_WSS_MODULO,
-	OSF_WSS_MAX,
-};
-
 enum osf_fmatch_states {
 	/* Packet does not match the fingerprint */
 	FMATCH_WRONG = 0,
diff --git a/include/uapi/linux/netfilter/nfnetlink_osf.h b/include/uapi/linux/netfilter/nfnetlink_osf.h
index 3b93fbb9fc24..76a3527df5dd 100644
--- a/include/uapi/linux/netfilter/nfnetlink_osf.h
+++ b/include/uapi/linux/netfilter/nfnetlink_osf.h
@@ -88,6 +88,18 @@ enum iana_options {
 	OSFOPT_EMPTY = 255,
 };
 
+/* Initial window size option state machine: multiple of mss, mtu or
+ * plain numeric value. Can also be made as plain numeric value which
+ * is not a multiple of specified value.
+ */
+enum nf_osf_window_size_options {
+	OSF_WSS_PLAIN	= 0,
+	OSF_WSS_MSS,
+	OSF_WSS_MTU,
+	OSF_WSS_MODULO,
+	OSF_WSS_MAX,
+};
+
 enum nf_osf_attr_type {
 	OSF_ATTR_UNSPEC,
 	OSF_ATTR_FINGER,
diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h
index c56c59605c2b..24102b5286ec 100644
--- a/include/uapi/linux/netfilter/xt_osf.h
+++ b/include/uapi/linux/netfilter/xt_osf.h
@@ -46,6 +46,7 @@
 #define xt_osf_finger		nf_osf_finger
 #define xt_osf_nlmsg		nf_osf_nlmsg
 
+#define xt_osf_window_size_options	nf_osf_window_size_options
 #define xt_osf_attr_type	nf_osf_attr_type
 #define xt_osf_msg_types	nf_osf_msg_types
 
-- 
cgit v1.2.3


From 342ac8448f1fb213908656ae5581d0f37a5954e8 Mon Sep 17 00:00:00 2001
From: Denis Drozdov <denisd@mellanox.com>
Date: Wed, 8 Aug 2018 16:23:48 -0700
Subject: net/mlx5: Use max_num_eqs for calculation of required MSIX vectors

New firmware has defined new HCA capability field called "max_num_eqs",
that is the number of available EQs after subtracting reserved FW EQs.

Before this capability the FW reported the EQ number in "log_max_eqs",
the reported value also contained FW reserved EQs, but the driver might
be failing to load on 320 cpus systems due to the fact that FW
reserved EQs were not available to the driver.

Now the driver has to obtain max_num_eqs value from new FW to get real
number of EQs available.

Signed-off-by: Denis Drozdov <denisd@mellanox.com>
Reviewed-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 +++-
 include/linux/mlx5/mlx5_ifc.h                  | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 03b9c6733eed..cf3e4a659052 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -323,7 +323,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
 {
 	struct mlx5_priv *priv = &dev->priv;
 	struct mlx5_eq_table *table = &priv->eq_table;
-	int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq);
+	int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
+		      MLX5_CAP_GEN(dev, max_num_eqs) :
+		      1 << MLX5_CAP_GEN(dev, log_max_eq);
 	int nvec;
 	int err;
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 60c2308fe062..63e1ce4c4826 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1133,7 +1133,10 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         general_obj_types[0x40];
 
-	u8         reserved_at_440[0x40];
+	u8         reserved_at_440[0x20];
+
+	u8         reserved_at_460[0x10];
+	u8         max_num_eqs[0x10];
 
 	u8         reserved_at_480[0x3];
 	u8         log_max_l2_table[0x5];
-- 
cgit v1.2.3


From cc9c82a8668cf98748bfbaa83c5c092d5115c25b Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Wed, 8 Aug 2018 16:23:49 -0700
Subject: net/mlx5: Rename modify/query_vport state related enums

Modify and query vport state commands share the same admin_state and
op_mod values, rename the enums to fit them both.

In addition, remove the esw prefix from the admin state enum as this
also applied for vnic.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c     | 12 ++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c      |  8 ++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c     | 10 +++++-----
 include/linux/mlx5/device.h                           |  6 +++---
 include/linux/mlx5/mlx5_ifc.h                         |  4 ++--
 6 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index a2fb21ca5767..2731ba2d8049 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -228,7 +228,7 @@ static void mlx5e_update_carrier(struct mlx5e_priv *priv)
 	u8 port_state;
 
 	port_state = mlx5_query_vport_state(mdev,
-					    MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT,
+					    MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT,
 					    0);
 
 	if (port_state == VPORT_STATE_UP) {
@@ -3916,9 +3916,9 @@ static int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
 static int mlx5_vport_link2ifla(u8 esw_link)
 {
 	switch (esw_link) {
-	case MLX5_ESW_VPORT_ADMIN_STATE_DOWN:
+	case MLX5_VPORT_ADMIN_STATE_DOWN:
 		return IFLA_VF_LINK_STATE_DISABLE;
-	case MLX5_ESW_VPORT_ADMIN_STATE_UP:
+	case MLX5_VPORT_ADMIN_STATE_UP:
 		return IFLA_VF_LINK_STATE_ENABLE;
 	}
 	return IFLA_VF_LINK_STATE_AUTO;
@@ -3928,11 +3928,11 @@ static int mlx5_ifla_link2vport(u8 ifla_link)
 {
 	switch (ifla_link) {
 	case IFLA_VF_LINK_STATE_DISABLE:
-		return MLX5_ESW_VPORT_ADMIN_STATE_DOWN;
+		return MLX5_VPORT_ADMIN_STATE_DOWN;
 	case IFLA_VF_LINK_STATE_ENABLE:
-		return MLX5_ESW_VPORT_ADMIN_STATE_UP;
+		return MLX5_VPORT_ADMIN_STATE_UP;
 	}
-	return MLX5_ESW_VPORT_ADMIN_STATE_AUTO;
+	return MLX5_VPORT_ADMIN_STATE_AUTO;
 }
 
 static int mlx5e_set_vf_link_state(struct net_device *dev, int vf,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 8e3c5b4b90ab..c9cc9747d21d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -698,8 +698,8 @@ static int mlx5e_rep_open(struct net_device *dev)
 		goto unlock;
 
 	if (!mlx5_modify_vport_admin_state(priv->mdev,
-					   MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT,
-					   rep->vport, MLX5_ESW_VPORT_ADMIN_STATE_UP))
+					   MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
+					   rep->vport, MLX5_VPORT_ADMIN_STATE_UP))
 		netif_carrier_on(dev);
 
 unlock:
@@ -716,8 +716,8 @@ static int mlx5e_rep_close(struct net_device *dev)
 
 	mutex_lock(&priv->state_lock);
 	mlx5_modify_vport_admin_state(priv->mdev,
-				      MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT,
-				      rep->vport, MLX5_ESW_VPORT_ADMIN_STATE_DOWN);
+				      MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
+				      rep->vport, MLX5_VPORT_ADMIN_STATE_DOWN);
 	ret = mlx5e_close_locked(dev);
 	mutex_unlock(&priv->state_lock);
 	return ret;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
index 4d316cc9b008..35ded91203f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
@@ -74,7 +74,7 @@ static int mlx5e_test_link_state(struct mlx5e_priv *priv)
 	if (!netif_carrier_ok(priv->netdev))
 		return 1;
 
-	port_state = mlx5_query_vport_state(priv->mdev, MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, 0);
+	port_state = mlx5_query_vport_state(priv->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0);
 	return port_state == VPORT_STATE_UP ? 0 : 1;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 40dba9e8af92..cd0760a8d45a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1469,7 +1469,7 @@ static void esw_apply_vport_conf(struct mlx5_eswitch *esw,
 		return;
 
 	mlx5_modify_vport_admin_state(esw->dev,
-				      MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT,
+				      MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
 				      vport_num,
 				      vport->info.link_state);
 	mlx5_modify_nic_vport_mac_address(esw->dev, vport_num, vport->info.mac);
@@ -1582,9 +1582,9 @@ static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num)
 	esw_vport_disable_qos(esw, vport_num);
 	if (vport_num && esw->mode == SRIOV_LEGACY) {
 		mlx5_modify_vport_admin_state(esw->dev,
-					      MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT,
+					      MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
 					      vport_num,
-					      MLX5_ESW_VPORT_ADMIN_STATE_DOWN);
+					      MLX5_VPORT_ADMIN_STATE_DOWN);
 		esw_vport_disable_egress_acl(esw, vport);
 		esw_vport_disable_ingress_acl(esw, vport);
 		esw_vport_destroy_drop_counters(vport);
@@ -1736,7 +1736,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 		struct mlx5_vport *vport = &esw->vports[vport_num];
 
 		vport->vport = vport_num;
-		vport->info.link_state = MLX5_ESW_VPORT_ADMIN_STATE_AUTO;
+		vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO;
 		vport->dev = dev;
 		INIT_WORK(&vport->vport_change_handler,
 			  esw_vport_change_handler);
@@ -1860,7 +1860,7 @@ int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw,
 	evport = &esw->vports[vport];
 
 	err = mlx5_modify_vport_admin_state(esw->dev,
-					    MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT,
+					    MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
 					    vport, link_state);
 	if (err) {
 		mlx5_core_warn(esw->dev,
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index d489494b0a84..11fa4e66afc5 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -946,9 +946,9 @@ enum {
 };
 
 enum {
-	MLX5_ESW_VPORT_ADMIN_STATE_DOWN  = 0x0,
-	MLX5_ESW_VPORT_ADMIN_STATE_UP    = 0x1,
-	MLX5_ESW_VPORT_ADMIN_STATE_AUTO  = 0x2,
+	MLX5_VPORT_ADMIN_STATE_DOWN  = 0x0,
+	MLX5_VPORT_ADMIN_STATE_UP    = 0x1,
+	MLX5_VPORT_ADMIN_STATE_AUTO  = 0x2,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 63e1ce4c4826..6ead9c1a5396 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -3764,8 +3764,8 @@ struct mlx5_ifc_query_vport_state_out_bits {
 };
 
 enum {
-	MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT  = 0x0,
-	MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT   = 0x1,
+	MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT  = 0x0,
+	MLX5_VPORT_STATE_OP_MOD_ESW_VPORT   = 0x1,
 };
 
 struct mlx5_ifc_query_vport_state_in_bits {
-- 
cgit v1.2.3


From 29d8ebd44de8999e292dbb4de76744d4a41039ec Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.com>
Date: Wed, 8 Aug 2018 16:23:51 -0700
Subject: net/mlx5: Remove unused mlx5_query_vport_admin_state

mlx5_query_vport_admin_state() is not used anywhere. Remove it.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/vport.c | 10 ----------
 include/linux/mlx5/vport.h                      |  2 --
 2 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 7eecd5b07bb1..fa6fa171b94c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -64,16 +64,6 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
 }
 EXPORT_SYMBOL_GPL(mlx5_query_vport_state);
 
-u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
-{
-	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {0};
-
-	_mlx5_query_vport_state(mdev, opmod, vport, out, sizeof(out));
-
-	return MLX5_GET(query_vport_state_out, out, admin_state);
-}
-EXPORT_SYMBOL_GPL(mlx5_query_vport_admin_state);
-
 int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 				  u16 vport, u8 state)
 {
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 9208cb8809ac..7e7c6dfcfb09 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -43,8 +43,6 @@ enum {
 };
 
 u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport);
-u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
-				u16 vport);
 int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 				  u16 vport, u8 state);
 int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
-- 
cgit v1.2.3


From 74fe7b551f3385fa585d92616c85b3a575b2b2cb Mon Sep 17 00:00:00 2001
From: Crestez Dan Leonard <leonard.crestez@intel.com>
Date: Tue, 7 Aug 2018 17:52:17 +0300
Subject: regmap: Add regmap_noinc_read API

The regmap API usually assumes that bulk read operations will read a
range of registers but some I2C/SPI devices have certain registers for
which a such a read operation will return data from an internal FIFO
instead. Add an explicit API to support bulk read without range semantics.

Some linux drivers use regmap_bulk_read or regmap_raw_read for such
registers, for example mpu6050 or bmi150 from IIO. This only happens to
work because when caching is disabled a single regmap read op will map
to a single bus read op (as desired). This breaks if caching is enabled and
reg+1 happens to be a cacheable register.

Without regmap support refactoring a driver to enable regmap caching
requires separate I2C and SPI paths. This is exactly what regmap is
supposed to help avoid.

Suggested-by: Jonathan Cameron <jic23@kernel.org>
Signed-off-by: Crestez Dan Leonard <leonard.crestez@intel.com>
Signed-off-by: Stefan Popa <stefan.popa@analog.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/internal.h |  3 ++
 drivers/base/regmap/regmap.c   | 79 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/regmap.h         | 19 ++++++++++
 3 files changed, 100 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index 53785e0e297a..a6bf34d6394e 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -94,10 +94,12 @@ struct regmap {
 	bool (*readable_reg)(struct device *dev, unsigned int reg);
 	bool (*volatile_reg)(struct device *dev, unsigned int reg);
 	bool (*precious_reg)(struct device *dev, unsigned int reg);
+	bool (*readable_noinc_reg)(struct device *dev, unsigned int reg);
 	const struct regmap_access_table *wr_table;
 	const struct regmap_access_table *rd_table;
 	const struct regmap_access_table *volatile_table;
 	const struct regmap_access_table *precious_table;
+	const struct regmap_access_table *rd_noinc_table;
 
 	int (*reg_read)(void *context, unsigned int reg, unsigned int *val);
 	int (*reg_write)(void *context, unsigned int reg, unsigned int val);
@@ -181,6 +183,7 @@ bool regmap_writeable(struct regmap *map, unsigned int reg);
 bool regmap_readable(struct regmap *map, unsigned int reg);
 bool regmap_volatile(struct regmap *map, unsigned int reg);
 bool regmap_precious(struct regmap *map, unsigned int reg);
+bool regmap_readable_noinc(struct regmap *map, unsigned int reg);
 
 int _regmap_write(struct regmap *map, unsigned int reg,
 		  unsigned int val);
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 3bc84885eb91..0360a90ad6b6 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -168,6 +168,17 @@ bool regmap_precious(struct regmap *map, unsigned int reg)
 	return false;
 }
 
+bool regmap_readable_noinc(struct regmap *map, unsigned int reg)
+{
+	if (map->readable_noinc_reg)
+		return map->readable_noinc_reg(map->dev, reg);
+
+	if (map->rd_noinc_table)
+		return regmap_check_range_table(map, reg, map->rd_noinc_table);
+
+	return true;
+}
+
 static bool regmap_volatile_range(struct regmap *map, unsigned int reg,
 	size_t num)
 {
@@ -766,10 +777,12 @@ struct regmap *__regmap_init(struct device *dev,
 	map->rd_table = config->rd_table;
 	map->volatile_table = config->volatile_table;
 	map->precious_table = config->precious_table;
+	map->rd_noinc_table = config->rd_noinc_table;
 	map->writeable_reg = config->writeable_reg;
 	map->readable_reg = config->readable_reg;
 	map->volatile_reg = config->volatile_reg;
 	map->precious_reg = config->precious_reg;
+	map->readable_noinc_reg = config->readable_noinc_reg;
 	map->cache_type = config->cache_type;
 
 	spin_lock_init(&map->async_lock);
@@ -1285,6 +1298,7 @@ int regmap_reinit_cache(struct regmap *map, const struct regmap_config *config)
 	map->readable_reg = config->readable_reg;
 	map->volatile_reg = config->volatile_reg;
 	map->precious_reg = config->precious_reg;
+	map->readable_noinc_reg = config->readable_noinc_reg;
 	map->cache_type = config->cache_type;
 
 	regmap_debugfs_init(map, config->name);
@@ -2564,7 +2578,70 @@ int regmap_raw_read(struct regmap *map, unsigned int reg, void *val,
 EXPORT_SYMBOL_GPL(regmap_raw_read);
 
 /**
- * regmap_field_read() - Read a value to a single register field
+ * regmap_noinc_read(): Read data from a register without incrementing the
+ *			register number
+ *
+ * @map: Register map to read from
+ * @reg: Register to read from
+ * @val: Pointer to data buffer
+ * @val_len: Length of output buffer in bytes.
+ *
+ * The regmap API usually assumes that bulk bus read operations will read a
+ * range of registers. Some devices have certain registers for which a read
+ * operation read will read from an internal FIFO.
+ *
+ * The target register must be volatile but registers after it can be
+ * completely unrelated cacheable registers.
+ *
+ * This will attempt multiple reads as required to read val_len bytes.
+ *
+ * A value of zero will be returned on success, a negative errno will be
+ * returned in error cases.
+ */
+int regmap_noinc_read(struct regmap *map, unsigned int reg,
+		      void *val, size_t val_len)
+{
+	size_t read_len;
+	int ret;
+
+	if (!map->bus)
+		return -EINVAL;
+	if (!map->bus->read)
+		return -ENOTSUPP;
+	if (val_len % map->format.val_bytes)
+		return -EINVAL;
+	if (!IS_ALIGNED(reg, map->reg_stride))
+		return -EINVAL;
+	if (val_len == 0)
+		return -EINVAL;
+
+	map->lock(map->lock_arg);
+
+	if (!regmap_volatile(map, reg) || !regmap_readable_noinc(map, reg)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	while (val_len) {
+		if (map->max_raw_read && map->max_raw_read < val_len)
+			read_len = map->max_raw_read;
+		else
+			read_len = val_len;
+		ret = _regmap_raw_read(map, reg, val, read_len);
+		if (ret)
+			goto out_unlock;
+		val = ((u8 *)val) + read_len;
+		val_len -= read_len;
+	}
+
+out_unlock:
+	map->unlock(map->lock_arg);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(regmap_noinc_read);
+
+/**
+ * regmap_field_read(): Read a value to a single register field
  *
  * @field: Register field to read from
  * @val: Pointer to store read value
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 4f38068ffb71..19df946499d2 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -268,6 +268,13 @@ typedef void (*regmap_unlock)(void *);
  *                field is NULL but precious_table (see below) is not, the
  *                check is performed on such table (a register is precious if
  *                it belongs to one of the ranges specified by precious_table).
+ * @readable_noinc_reg: Optional callback returning true if the register
+ *			supports multiple read operations without incrementing
+ *			the register number. If this field is NULL but
+ *			rd_noinc_table (see below) is not, the check is
+ *			performed on such table (a register is no increment
+ *			readable if it belongs to one of the ranges specified
+ *			by rd_noinc_table).
  * @disable_locking: This regmap is either protected by external means or
  *                   is guaranteed not be be accessed from multiple threads.
  *                   Don't use any locking mechanisms.
@@ -295,6 +302,7 @@ typedef void (*regmap_unlock)(void *);
  * @rd_table:     As above, for read access.
  * @volatile_table: As above, for volatile registers.
  * @precious_table: As above, for precious registers.
+ * @rd_noinc_table: As above, for no increment readable registers.
  * @reg_defaults: Power on reset values for registers (for use with
  *                register cache support).
  * @num_reg_defaults: Number of elements in reg_defaults.
@@ -344,6 +352,7 @@ struct regmap_config {
 	bool (*readable_reg)(struct device *dev, unsigned int reg);
 	bool (*volatile_reg)(struct device *dev, unsigned int reg);
 	bool (*precious_reg)(struct device *dev, unsigned int reg);
+	bool (*readable_noinc_reg)(struct device *dev, unsigned int reg);
 
 	bool disable_locking;
 	regmap_lock lock;
@@ -360,6 +369,7 @@ struct regmap_config {
 	const struct regmap_access_table *rd_table;
 	const struct regmap_access_table *volatile_table;
 	const struct regmap_access_table *precious_table;
+	const struct regmap_access_table *rd_noinc_table;
 	const struct reg_default *reg_defaults;
 	unsigned int num_reg_defaults;
 	enum regcache_type cache_type;
@@ -946,6 +956,8 @@ int regmap_raw_write_async(struct regmap *map, unsigned int reg,
 int regmap_read(struct regmap *map, unsigned int reg, unsigned int *val);
 int regmap_raw_read(struct regmap *map, unsigned int reg,
 		    void *val, size_t val_len);
+int regmap_noinc_read(struct regmap *map, unsigned int reg,
+		      void *val, size_t val_len);
 int regmap_bulk_read(struct regmap *map, unsigned int reg, void *val,
 		     size_t val_count);
 int regmap_update_bits_base(struct regmap *map, unsigned int reg,
@@ -1196,6 +1208,13 @@ static inline int regmap_raw_read(struct regmap *map, unsigned int reg,
 	return -EINVAL;
 }
 
+static inline int regmap_noinc_read(struct regmap *map, unsigned int reg,
+				    void *val, size_t val_len)
+{
+	WARN_ONCE(1, "regmap API is disabled");
+	return -EINVAL;
+}
+
 static inline int regmap_bulk_read(struct regmap *map, unsigned int reg,
 				   void *val, size_t val_count)
 {
-- 
cgit v1.2.3


From b1f4267cc5448d20ae0c515a74141e74365e78a3 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Thu, 9 Aug 2018 07:47:28 -0700
Subject: block: Remove two superfluous #include directives

Commit 12f5b9314545 ("blk-mq: Remove generation seqeunce") removed the
only seqcount_t and u64_stats_sync instances from <linux/blkdev.h> but
did not remove the corresponding #include directives. Since these
include directives are no longer needed, remove them.

Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Jianchao Wang <jianchao.w.wang@oracle.com>
Cc: Hannes Reinecke <hare@suse.com>,
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 050d599f5ea9..d6869e0e2b64 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -27,8 +27,6 @@
 #include <linux/percpu-refcount.h>
 #include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
-#include <linux/seqlock.h>
-#include <linux/u64_stats_sync.h>
 
 struct module;
 struct scsi_ioctl_command;
-- 
cgit v1.2.3


From 6bad9b210a228d2fe0e0efe26d9b115348529cee Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Thu, 9 Aug 2018 07:53:36 -0700
Subject: blkcg: Introduce blkg_root_lookup()

This new function will be used in a later patch to verify whether a
queue has been dissociated from the cgroup controller before being
released.

Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Alexandru Moise <00moses.alexander00@gmail.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index f7b910768306..1361cfc9b878 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -341,6 +341,23 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
 	return __blkg_lookup(blkcg, q, false);
 }
 
+/**
+ * blkg_lookup - look up blkg for the specified request queue
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for @q at the root level. See also blkg_lookup().
+ */
+static inline struct blkcg_gq *blkg_root_lookup(struct request_queue *q)
+{
+	struct blkcg_gq *blkg;
+
+	rcu_read_lock();
+	blkg = blkg_lookup(&blkcg_root, q);
+	rcu_read_unlock();
+
+	return blkg;
+}
+
 /**
  * blkg_to_pdata - get policy private data
  * @blkg: blkg of interest
@@ -864,6 +881,7 @@ static inline bool blk_cgroup_congested(void) { return false; }
 static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }
 
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+static inline struct blkcg_gq *blkg_root_lookup(struct request_queue *q) { return NULL; }
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
 static inline void blkcg_exit_queue(struct request_queue *q) { }
-- 
cgit v1.2.3


From 209b43759d65b2cc99ce7757249aacc82b03c4e2 Mon Sep 17 00:00:00 2001
From: Michael Büsch <m@bues.ch>
Date: Tue, 31 Jul 2018 22:15:09 +0200
Subject: ssb: Remove SSB_WARN_ON, SSB_BUG_ON and SSB_DEBUG

Use the standard WARN_ON instead.
If a small kernel is desired, WARN_ON can be disabled globally.

Also remove SSB_DEBUG. Besides WARN_ON it only adds a tiny debug check.
Include this check unconditionally.

Signed-off-by: Michael Buesch <m@bues.ch>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 arch/mips/configs/bcm47xx_defconfig |  1 -
 arch/powerpc/configs/wii_defconfig  |  1 -
 drivers/ssb/Kconfig                 |  9 ---------
 drivers/ssb/driver_chipcommon.c     |  8 ++++----
 drivers/ssb/driver_chipcommon_pmu.c | 10 +++++-----
 drivers/ssb/driver_gpio.c           |  4 ++--
 drivers/ssb/driver_pcicore.c        |  6 +++---
 drivers/ssb/embedded.c              | 10 +++++-----
 drivers/ssb/host_soc.c              | 12 ++++++------
 drivers/ssb/main.c                  | 38 +++++++++++++++++--------------------
 drivers/ssb/pci.c                   | 19 ++++++-------------
 drivers/ssb/pcmcia.c                | 14 +++++++-------
 drivers/ssb/scan.c                  |  4 ++--
 drivers/ssb/sdio.c                  | 12 ++++++------
 drivers/ssb/ssb_private.h           |  9 ---------
 include/linux/ssb/ssb.h             |  2 --
 16 files changed, 63 insertions(+), 96 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/configs/bcm47xx_defconfig b/arch/mips/configs/bcm47xx_defconfig
index fad8e964f14c..ba800a892384 100644
--- a/arch/mips/configs/bcm47xx_defconfig
+++ b/arch/mips/configs/bcm47xx_defconfig
@@ -66,7 +66,6 @@ CONFIG_HW_RANDOM=y
 CONFIG_GPIO_SYSFS=y
 CONFIG_WATCHDOG=y
 CONFIG_BCM47XX_WDT=y
-CONFIG_SSB_DEBUG=y
 CONFIG_SSB_DRIVER_GIGE=y
 CONFIG_BCMA_DRIVER_GMAC_CMN=y
 CONFIG_USB=y
diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig
index 10940533da71..f5c366b02828 100644
--- a/arch/powerpc/configs/wii_defconfig
+++ b/arch/powerpc/configs/wii_defconfig
@@ -78,7 +78,6 @@ CONFIG_GPIO_HLWD=y
 CONFIG_POWER_RESET=y
 CONFIG_POWER_RESET_GPIO=y
 # CONFIG_HWMON is not set
-CONFIG_SSB_DEBUG=y
 CONFIG_FB=y
 # CONFIG_VGA_CONSOLE is not set
 CONFIG_FRAMEBUFFER_CONSOLE=y
diff --git a/drivers/ssb/Kconfig b/drivers/ssb/Kconfig
index 6c438c819eb9..df30e1323252 100644
--- a/drivers/ssb/Kconfig
+++ b/drivers/ssb/Kconfig
@@ -89,15 +89,6 @@ config SSB_HOST_SOC
 
 	  If unsure, say N
 
-config SSB_DEBUG
-	bool "SSB debugging"
-	depends on SSB
-	help
-	  This turns on additional runtime checks and debugging
-	  messages. Turn this on for SSB troubleshooting.
-
-	  If unsure, say N
-
 config SSB_SERIAL
 	bool
 	depends on SSB
diff --git a/drivers/ssb/driver_chipcommon.c b/drivers/ssb/driver_chipcommon.c
index 48050c6fd847..99a4656d113d 100644
--- a/drivers/ssb/driver_chipcommon.c
+++ b/drivers/ssb/driver_chipcommon.c
@@ -56,7 +56,7 @@ void ssb_chipco_set_clockmode(struct ssb_chipcommon *cc,
 
 	if (cc->capabilities & SSB_CHIPCO_CAP_PMU)
 		return; /* PMU controls clockmode, separated function needed */
-	SSB_WARN_ON(ccdev->id.revision >= 20);
+	WARN_ON(ccdev->id.revision >= 20);
 
 	/* chipcommon cores prior to rev6 don't support dynamic clock control */
 	if (ccdev->id.revision < 6)
@@ -111,7 +111,7 @@ void ssb_chipco_set_clockmode(struct ssb_chipcommon *cc,
 		}
 		break;
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 }
 
@@ -164,7 +164,7 @@ static int chipco_pctl_clockfreqlimit(struct ssb_chipcommon *cc, int get_max)
 			divisor = 32;
 			break;
 		default:
-			SSB_WARN_ON(1);
+			WARN_ON(1);
 		}
 	} else if (cc->dev->id.revision < 10) {
 		switch (clocksrc) {
@@ -277,7 +277,7 @@ static void calc_fast_powerup_delay(struct ssb_chipcommon *cc)
 	minfreq = chipco_pctl_clockfreqlimit(cc, 0);
 	pll_on_delay = chipco_read32(cc, SSB_CHIPCO_PLLONDELAY);
 	tmp = (((pll_on_delay + 2) * 1000000) + (minfreq - 1)) / minfreq;
-	SSB_WARN_ON(tmp & ~0xFFFF);
+	WARN_ON(tmp & ~0xFFFF);
 
 	cc->fast_pwrup_delay = tmp;
 }
diff --git a/drivers/ssb/driver_chipcommon_pmu.c b/drivers/ssb/driver_chipcommon_pmu.c
index e28682a53cdf..0f60e90ded26 100644
--- a/drivers/ssb/driver_chipcommon_pmu.c
+++ b/drivers/ssb/driver_chipcommon_pmu.c
@@ -128,7 +128,7 @@ static void ssb_pmu0_pllinit_r0(struct ssb_chipcommon *cc,
 			      ~(1 << SSB_PMURES_5354_BB_PLL_PU));
 		break;
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 	for (i = 1500; i; i--) {
 		tmp = chipco_read32(cc, SSB_CHIPCO_CLKCTLST);
@@ -265,7 +265,7 @@ static void ssb_pmu1_pllinit_r0(struct ssb_chipcommon *cc,
 		buffer_strength = 0x222222;
 		break;
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 	for (i = 1500; i; i--) {
 		tmp = chipco_read32(cc, SSB_CHIPCO_CLKCTLST);
@@ -501,7 +501,7 @@ static void ssb_pmu_resources_init(struct ssb_chipcommon *cc)
 					      ~(depend_tab[i].depend));
 				break;
 			default:
-				SSB_WARN_ON(1);
+				WARN_ON(1);
 			}
 		}
 	}
@@ -568,12 +568,12 @@ void ssb_pmu_set_ldo_voltage(struct ssb_chipcommon *cc,
 			mask = 0x3F;
 			break;
 		default:
-			SSB_WARN_ON(1);
+			WARN_ON(1);
 			return;
 		}
 		break;
 	case 0x4312:
-		if (SSB_WARN_ON(id != LDO_PAREF))
+		if (WARN_ON(id != LDO_PAREF))
 			return;
 		addr = 0;
 		shift = 21;
diff --git a/drivers/ssb/driver_gpio.c b/drivers/ssb/driver_gpio.c
index 6ce4abf7d473..e809dae4c470 100644
--- a/drivers/ssb/driver_gpio.c
+++ b/drivers/ssb/driver_gpio.c
@@ -461,7 +461,7 @@ int ssb_gpio_init(struct ssb_bus *bus)
 	else if (ssb_extif_available(&bus->extif))
 		return ssb_gpio_extif_init(bus);
 	else
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 
 	return -1;
 }
@@ -473,7 +473,7 @@ int ssb_gpio_unregister(struct ssb_bus *bus)
 		gpiochip_remove(&bus->gpio);
 		return 0;
 	} else {
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 
 	return -1;
diff --git a/drivers/ssb/driver_pcicore.c b/drivers/ssb/driver_pcicore.c
index ae80b3171523..6a5622e0ded5 100644
--- a/drivers/ssb/driver_pcicore.c
+++ b/drivers/ssb/driver_pcicore.c
@@ -115,7 +115,7 @@ static int ssb_extpci_read_config(struct ssb_pcicore *pc,
 	u32 addr, val;
 	void __iomem *mmio;
 
-	SSB_WARN_ON(!pc->hostmode);
+	WARN_ON(!pc->hostmode);
 	if (unlikely(len != 1 && len != 2 && len != 4))
 		goto out;
 	addr = get_cfgspace_addr(pc, bus, dev, func, off);
@@ -161,7 +161,7 @@ static int ssb_extpci_write_config(struct ssb_pcicore *pc,
 	u32 addr, val = 0;
 	void __iomem *mmio;
 
-	SSB_WARN_ON(!pc->hostmode);
+	WARN_ON(!pc->hostmode);
 	if (unlikely(len != 1 && len != 2 && len != 4))
 		goto out;
 	addr = get_cfgspace_addr(pc, bus, dev, func, off);
@@ -702,7 +702,7 @@ int ssb_pcicore_dev_irqvecs_enable(struct ssb_pcicore *pc,
 		/* Calculate the "coremask" for the device. */
 		coremask = (1 << dev->core_index);
 
-		SSB_WARN_ON(bus->bustype != SSB_BUSTYPE_PCI);
+		WARN_ON(bus->bustype != SSB_BUSTYPE_PCI);
 		err = pci_read_config_dword(bus->host_pci, SSB_PCI_IRQMASK, &tmp);
 		if (err)
 			goto out;
diff --git a/drivers/ssb/embedded.c b/drivers/ssb/embedded.c
index c39ddf8988c1..8254ed25e063 100644
--- a/drivers/ssb/embedded.c
+++ b/drivers/ssb/embedded.c
@@ -77,7 +77,7 @@ u32 ssb_gpio_in(struct ssb_bus *bus, u32 mask)
 	else if (ssb_extif_available(&bus->extif))
 		res = ssb_extif_gpio_in(&bus->extif, mask);
 	else
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	spin_unlock_irqrestore(&bus->gpio_lock, flags);
 
 	return res;
@@ -95,7 +95,7 @@ u32 ssb_gpio_out(struct ssb_bus *bus, u32 mask, u32 value)
 	else if (ssb_extif_available(&bus->extif))
 		res = ssb_extif_gpio_out(&bus->extif, mask, value);
 	else
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	spin_unlock_irqrestore(&bus->gpio_lock, flags);
 
 	return res;
@@ -113,7 +113,7 @@ u32 ssb_gpio_outen(struct ssb_bus *bus, u32 mask, u32 value)
 	else if (ssb_extif_available(&bus->extif))
 		res = ssb_extif_gpio_outen(&bus->extif, mask, value);
 	else
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	spin_unlock_irqrestore(&bus->gpio_lock, flags);
 
 	return res;
@@ -145,7 +145,7 @@ u32 ssb_gpio_intmask(struct ssb_bus *bus, u32 mask, u32 value)
 	else if (ssb_extif_available(&bus->extif))
 		res = ssb_extif_gpio_intmask(&bus->extif, mask, value);
 	else
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	spin_unlock_irqrestore(&bus->gpio_lock, flags);
 
 	return res;
@@ -163,7 +163,7 @@ u32 ssb_gpio_polarity(struct ssb_bus *bus, u32 mask, u32 value)
 	else if (ssb_extif_available(&bus->extif))
 		res = ssb_extif_gpio_polarity(&bus->extif, mask, value);
 	else
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	spin_unlock_irqrestore(&bus->gpio_lock, flags);
 
 	return res;
diff --git a/drivers/ssb/host_soc.c b/drivers/ssb/host_soc.c
index eadaedf466f6..3b438480515c 100644
--- a/drivers/ssb/host_soc.c
+++ b/drivers/ssb/host_soc.c
@@ -61,7 +61,7 @@ static void ssb_host_soc_block_read(struct ssb_device *dev, void *buffer,
 	case sizeof(u16): {
 		__le16 *buf = buffer;
 
-		SSB_WARN_ON(count & 1);
+		WARN_ON(count & 1);
 		while (count) {
 			*buf = (__force __le16)__raw_readw(addr);
 			buf++;
@@ -72,7 +72,7 @@ static void ssb_host_soc_block_read(struct ssb_device *dev, void *buffer,
 	case sizeof(u32): {
 		__le32 *buf = buffer;
 
-		SSB_WARN_ON(count & 3);
+		WARN_ON(count & 3);
 		while (count) {
 			*buf = (__force __le32)__raw_readl(addr);
 			buf++;
@@ -81,7 +81,7 @@ static void ssb_host_soc_block_read(struct ssb_device *dev, void *buffer,
 		break;
 	}
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 }
 #endif /* CONFIG_SSB_BLOCKIO */
@@ -134,7 +134,7 @@ static void ssb_host_soc_block_write(struct ssb_device *dev, const void *buffer,
 	case sizeof(u16): {
 		const __le16 *buf = buffer;
 
-		SSB_WARN_ON(count & 1);
+		WARN_ON(count & 1);
 		while (count) {
 			__raw_writew((__force u16)(*buf), addr);
 			buf++;
@@ -145,7 +145,7 @@ static void ssb_host_soc_block_write(struct ssb_device *dev, const void *buffer,
 	case sizeof(u32): {
 		const __le32 *buf = buffer;
 
-		SSB_WARN_ON(count & 3);
+		WARN_ON(count & 3);
 		while (count) {
 			__raw_writel((__force u32)(*buf), addr);
 			buf++;
@@ -154,7 +154,7 @@ static void ssb_host_soc_block_write(struct ssb_device *dev, const void *buffer,
 		break;
 	}
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 }
 #endif /* CONFIG_SSB_BLOCKIO */
diff --git a/drivers/ssb/main.c b/drivers/ssb/main.c
index 9da56d2fdbd3..0a26984acb2c 100644
--- a/drivers/ssb/main.c
+++ b/drivers/ssb/main.c
@@ -209,7 +209,7 @@ int ssb_devices_freeze(struct ssb_bus *bus, struct ssb_freeze_context *ctx)
 
 	memset(ctx, 0, sizeof(*ctx));
 	ctx->bus = bus;
-	SSB_WARN_ON(bus->nr_devices > ARRAY_SIZE(ctx->device_frozen));
+	WARN_ON(bus->nr_devices > ARRAY_SIZE(ctx->device_frozen));
 
 	for (i = 0; i < bus->nr_devices; i++) {
 		sdev = ssb_device_get(&bus->devices[i]);
@@ -220,7 +220,7 @@ int ssb_devices_freeze(struct ssb_bus *bus, struct ssb_freeze_context *ctx)
 			continue;
 		}
 		sdrv = drv_to_ssb_drv(sdev->dev->driver);
-		if (SSB_WARN_ON(!sdrv->remove))
+		if (WARN_ON(!sdrv->remove))
 			continue;
 		sdrv->remove(sdev);
 		ctx->device_frozen[i] = 1;
@@ -248,10 +248,10 @@ int ssb_devices_thaw(struct ssb_freeze_context *ctx)
 			continue;
 		sdev = &bus->devices[i];
 
-		if (SSB_WARN_ON(!sdev->dev || !sdev->dev->driver))
+		if (WARN_ON(!sdev->dev || !sdev->dev->driver))
 			continue;
 		sdrv = drv_to_ssb_drv(sdev->dev->driver);
-		if (SSB_WARN_ON(!sdrv || !sdrv->probe))
+		if (WARN_ON(!sdrv || !sdrv->probe))
 			continue;
 
 		err = sdrv->probe(sdev, &sdev->id);
@@ -861,13 +861,13 @@ u32 ssb_calc_clock_rate(u32 plltype, u32 n, u32 m)
 	case SSB_PLLTYPE_2: /* 48Mhz, 4 dividers */
 		n1 += SSB_CHIPCO_CLK_T2_BIAS;
 		n2 += SSB_CHIPCO_CLK_T2_BIAS;
-		SSB_WARN_ON(!((n1 >= 2) && (n1 <= 7)));
-		SSB_WARN_ON(!((n2 >= 5) && (n2 <= 23)));
+		WARN_ON(!((n1 >= 2) && (n1 <= 7)));
+		WARN_ON(!((n2 >= 5) && (n2 <= 23)));
 		break;
 	case SSB_PLLTYPE_5: /* 25Mhz, 4 dividers */
 		return 100000000;
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 
 	switch (plltype) {
@@ -916,9 +916,9 @@ u32 ssb_calc_clock_rate(u32 plltype, u32 n, u32 m)
 		m1 += SSB_CHIPCO_CLK_T2_BIAS;
 		m2 += SSB_CHIPCO_CLK_T2M2_BIAS;
 		m3 += SSB_CHIPCO_CLK_T2_BIAS;
-		SSB_WARN_ON(!((m1 >= 2) && (m1 <= 7)));
-		SSB_WARN_ON(!((m2 >= 3) && (m2 <= 10)));
-		SSB_WARN_ON(!((m3 >= 2) && (m3 <= 7)));
+		WARN_ON(!((m1 >= 2) && (m1 <= 7)));
+		WARN_ON(!((m2 >= 3) && (m2 <= 10)));
+		WARN_ON(!((m3 >= 2) && (m3 <= 7)));
 
 		if (!(mc & SSB_CHIPCO_CLK_T2MC_M1BYP))
 			clock /= m1;
@@ -928,7 +928,7 @@ u32 ssb_calc_clock_rate(u32 plltype, u32 n, u32 m)
 			clock /= m3;
 		return clock;
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 	return 0;
 }
@@ -1169,9 +1169,7 @@ int ssb_bus_may_powerdown(struct ssb_bus *bus)
 	if (err)
 		goto error;
 out:
-#ifdef CONFIG_SSB_DEBUG
 	bus->powered_up = 0;
-#endif
 	return err;
 error:
 	pr_err("Bus powerdown failed\n");
@@ -1188,9 +1186,7 @@ int ssb_bus_powerup(struct ssb_bus *bus, bool dynamic_pctl)
 	if (err)
 		goto error;
 
-#ifdef CONFIG_SSB_DEBUG
 	bus->powered_up = 1;
-#endif
 
 	mode = dynamic_pctl ? SSB_CLKMODE_DYNAMIC : SSB_CLKMODE_FAST;
 	ssb_chipco_set_clockmode(&bus->chipco, mode);
@@ -1242,15 +1238,15 @@ u32 ssb_admatch_base(u32 adm)
 		base = (adm & SSB_ADM_BASE0);
 		break;
 	case SSB_ADM_TYPE1:
-		SSB_WARN_ON(adm & SSB_ADM_NEG); /* unsupported */
+		WARN_ON(adm & SSB_ADM_NEG); /* unsupported */
 		base = (adm & SSB_ADM_BASE1);
 		break;
 	case SSB_ADM_TYPE2:
-		SSB_WARN_ON(adm & SSB_ADM_NEG); /* unsupported */
+		WARN_ON(adm & SSB_ADM_NEG); /* unsupported */
 		base = (adm & SSB_ADM_BASE2);
 		break;
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 
 	return base;
@@ -1266,15 +1262,15 @@ u32 ssb_admatch_size(u32 adm)
 		size = ((adm & SSB_ADM_SZ0) >> SSB_ADM_SZ0_SHIFT);
 		break;
 	case SSB_ADM_TYPE1:
-		SSB_WARN_ON(adm & SSB_ADM_NEG); /* unsupported */
+		WARN_ON(adm & SSB_ADM_NEG); /* unsupported */
 		size = ((adm & SSB_ADM_SZ1) >> SSB_ADM_SZ1_SHIFT);
 		break;
 	case SSB_ADM_TYPE2:
-		SSB_WARN_ON(adm & SSB_ADM_NEG); /* unsupported */
+		WARN_ON(adm & SSB_ADM_NEG); /* unsupported */
 		size = ((adm & SSB_ADM_SZ2) >> SSB_ADM_SZ2_SHIFT);
 		break;
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 	size = (1 << (size + 1));
 
diff --git a/drivers/ssb/pci.c b/drivers/ssb/pci.c
index ad4308529eba..84807a9b4b13 100644
--- a/drivers/ssb/pci.c
+++ b/drivers/ssb/pci.c
@@ -946,7 +946,6 @@ out:
 	return err;
 }
 
-#ifdef CONFIG_SSB_DEBUG
 static int ssb_pci_assert_buspower(struct ssb_bus *bus)
 {
 	if (likely(bus->powered_up))
@@ -960,12 +959,6 @@ static int ssb_pci_assert_buspower(struct ssb_bus *bus)
 
 	return -ENODEV;
 }
-#else /* DEBUG */
-static inline int ssb_pci_assert_buspower(struct ssb_bus *bus)
-{
-	return 0;
-}
-#endif /* DEBUG */
 
 static u8 ssb_pci_read8(struct ssb_device *dev, u16 offset)
 {
@@ -1024,15 +1017,15 @@ static void ssb_pci_block_read(struct ssb_device *dev, void *buffer,
 		ioread8_rep(addr, buffer, count);
 		break;
 	case sizeof(u16):
-		SSB_WARN_ON(count & 1);
+		WARN_ON(count & 1);
 		ioread16_rep(addr, buffer, count >> 1);
 		break;
 	case sizeof(u32):
-		SSB_WARN_ON(count & 3);
+		WARN_ON(count & 3);
 		ioread32_rep(addr, buffer, count >> 2);
 		break;
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 
 	return;
@@ -1098,15 +1091,15 @@ static void ssb_pci_block_write(struct ssb_device *dev, const void *buffer,
 		iowrite8_rep(addr, buffer, count);
 		break;
 	case sizeof(u16):
-		SSB_WARN_ON(count & 1);
+		WARN_ON(count & 1);
 		iowrite16_rep(addr, buffer, count >> 1);
 		break;
 	case sizeof(u32):
-		SSB_WARN_ON(count & 3);
+		WARN_ON(count & 3);
 		iowrite32_rep(addr, buffer, count >> 2);
 		break;
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 }
 #endif /* CONFIG_SSB_BLOCKIO */
diff --git a/drivers/ssb/pcmcia.c b/drivers/ssb/pcmcia.c
index 20f63cc88e70..567013f8a8be 100644
--- a/drivers/ssb/pcmcia.c
+++ b/drivers/ssb/pcmcia.c
@@ -169,7 +169,7 @@ int ssb_pcmcia_switch_segment(struct ssb_bus *bus, u8 seg)
 	int err;
 	u8 val;
 
-	SSB_WARN_ON((seg != 0) && (seg != 1));
+	WARN_ON((seg != 0) && (seg != 1));
 	while (1) {
 		err = ssb_pcmcia_cfg_write(bus, SSB_PCMCIA_MEMSEG, seg);
 		if (err)
@@ -299,7 +299,7 @@ static void ssb_pcmcia_block_read(struct ssb_device *dev, void *buffer,
 	case sizeof(u16): {
 		__le16 *buf = buffer;
 
-		SSB_WARN_ON(count & 1);
+		WARN_ON(count & 1);
 		while (count) {
 			*buf = (__force __le16)__raw_readw(addr);
 			buf++;
@@ -310,7 +310,7 @@ static void ssb_pcmcia_block_read(struct ssb_device *dev, void *buffer,
 	case sizeof(u32): {
 		__le16 *buf = buffer;
 
-		SSB_WARN_ON(count & 3);
+		WARN_ON(count & 3);
 		while (count) {
 			*buf = (__force __le16)__raw_readw(addr);
 			buf++;
@@ -321,7 +321,7 @@ static void ssb_pcmcia_block_read(struct ssb_device *dev, void *buffer,
 		break;
 	}
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 unlock:
 	spin_unlock_irqrestore(&bus->bar_lock, flags);
@@ -399,7 +399,7 @@ static void ssb_pcmcia_block_write(struct ssb_device *dev, const void *buffer,
 	case sizeof(u16): {
 		const __le16 *buf = buffer;
 
-		SSB_WARN_ON(count & 1);
+		WARN_ON(count & 1);
 		while (count) {
 			__raw_writew((__force u16)(*buf), addr);
 			buf++;
@@ -410,7 +410,7 @@ static void ssb_pcmcia_block_write(struct ssb_device *dev, const void *buffer,
 	case sizeof(u32): {
 		const __le16 *buf = buffer;
 
-		SSB_WARN_ON(count & 3);
+		WARN_ON(count & 3);
 		while (count) {
 			__raw_writew((__force u16)(*buf), addr);
 			buf++;
@@ -421,7 +421,7 @@ static void ssb_pcmcia_block_write(struct ssb_device *dev, const void *buffer,
 		break;
 	}
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 unlock:
 	mmiowb();
diff --git a/drivers/ssb/scan.c b/drivers/ssb/scan.c
index 71fbb4b3eb6a..6ceee98ed6ff 100644
--- a/drivers/ssb/scan.c
+++ b/drivers/ssb/scan.c
@@ -210,7 +210,7 @@ void ssb_iounmap(struct ssb_bus *bus)
 #ifdef CONFIG_SSB_PCIHOST
 		pci_iounmap(bus->host_pci, bus->mmio);
 #else
-		SSB_BUG_ON(1); /* Can't reach this code. */
+		WARN_ON(1); /* Can't reach this code. */
 #endif
 		break;
 	case SSB_BUSTYPE_SDIO:
@@ -236,7 +236,7 @@ static void __iomem *ssb_ioremap(struct ssb_bus *bus,
 #ifdef CONFIG_SSB_PCIHOST
 		mmio = pci_iomap(bus->host_pci, 0, ~0UL);
 #else
-		SSB_BUG_ON(1); /* Can't reach this code. */
+		WARN_ON(1); /* Can't reach this code. */
 #endif
 		break;
 	case SSB_BUSTYPE_SDIO:
diff --git a/drivers/ssb/sdio.c b/drivers/ssb/sdio.c
index 1aedc5fbb62f..7fe0afb42234 100644
--- a/drivers/ssb/sdio.c
+++ b/drivers/ssb/sdio.c
@@ -316,18 +316,18 @@ static void ssb_sdio_block_read(struct ssb_device *dev, void *buffer,
 		break;
 	}
 	case sizeof(u16): {
-		SSB_WARN_ON(count & 1);
+		WARN_ON(count & 1);
 		error = sdio_readsb(bus->host_sdio, buffer, offset, count);
 		break;
 	}
 	case sizeof(u32): {
-		SSB_WARN_ON(count & 3);
+		WARN_ON(count & 3);
 		offset |= SBSDIO_SB_ACCESS_2_4B_FLAG;	/* 32 bit data access */
 		error = sdio_readsb(bus->host_sdio, buffer, offset, count);
 		break;
 	}
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 	if (!error)
 		goto out;
@@ -423,18 +423,18 @@ static void ssb_sdio_block_write(struct ssb_device *dev, const void *buffer,
 				     (void *)buffer, count);
 		break;
 	case sizeof(u16):
-		SSB_WARN_ON(count & 1);
+		WARN_ON(count & 1);
 		error = sdio_writesb(bus->host_sdio, offset,
 				     (void *)buffer, count);
 		break;
 	case sizeof(u32):
-		SSB_WARN_ON(count & 3);
+		WARN_ON(count & 3);
 		offset |= SBSDIO_SB_ACCESS_2_4B_FLAG;	/* 32 bit data access */
 		error = sdio_writesb(bus->host_sdio, offset,
 				     (void *)buffer, count);
 		break;
 	default:
-		SSB_WARN_ON(1);
+		WARN_ON(1);
 	}
 	if (!error)
 		goto out;
diff --git a/drivers/ssb/ssb_private.h b/drivers/ssb/ssb_private.h
index 885e278c4487..5f31bdfbe77f 100644
--- a/drivers/ssb/ssb_private.h
+++ b/drivers/ssb/ssb_private.h
@@ -9,15 +9,6 @@
 #include <linux/types.h>
 #include <linux/bcm47xx_wdt.h>
 
-#ifdef CONFIG_SSB_DEBUG
-# define SSB_WARN_ON(x)		WARN_ON(x)
-# define SSB_BUG_ON(x)		BUG_ON(x)
-#else
-static inline int __ssb_do_nothing(int x) { return x; }
-# define SSB_WARN_ON(x)		__ssb_do_nothing(unlikely(!!(x)))
-# define SSB_BUG_ON(x)		__ssb_do_nothing(unlikely(!!(x)))
-#endif
-
 
 /* pci.c */
 #ifdef CONFIG_SSB_PCIHOST
diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h
index 3b43655cabe6..0d5a2691e7e9 100644
--- a/include/linux/ssb/ssb.h
+++ b/include/linux/ssb/ssb.h
@@ -499,11 +499,9 @@ struct ssb_bus {
 
 	/* Internal-only stuff follows. Do not touch. */
 	struct list_head list;
-#ifdef CONFIG_SSB_DEBUG
 	/* Is the bus already powered up? */
 	bool powered_up;
 	int power_warn_count;
-#endif /* DEBUG */
 };
 
 enum ssb_quirks {
-- 
cgit v1.2.3


From 624c0f0239f04cce78c86afa95eb2841c84fbab1 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Thu, 9 Aug 2018 15:38:38 +0200
Subject: phylink: add helper for configuring 2500BaseX modes

Add a helper for MAC drivers to use in their validate callback to deal
with 2500BaseX vs 1000BaseX modes, where the hardware supports both
but it is not possible to automatically select between them.

This helper defaults to 1000BaseX, as that is the 802.3 standard, and
will allow users to select 2500BaseX either by forcing the speed if
AN is disabled, or by changing the advertising mask if AN is enabled.
Disabling AN is not recommended as it is only the speed that we're
interested in controlling, not the duplex or pause mode parameters.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 30 ++++++++++++++++++++++++++++++
 include/linux/phylink.h   |  1 +
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index af4dc4425be2..3ba5cf2a8a5f 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1688,4 +1688,34 @@ static const struct sfp_upstream_ops sfp_phylink_ops = {
 	.disconnect_phy = phylink_sfp_disconnect_phy,
 };
 
+/* Helpers for MAC drivers */
+
+/**
+ * phylink_helper_basex_speed() - 1000BaseX/2500BaseX helper
+ * @state: a pointer to a &struct phylink_link_state
+ *
+ * Inspect the interface mode, advertising mask or forced speed and
+ * decide whether to run at 2.5Gbit or 1Gbit appropriately, switching
+ * the interface mode to suit.  @state->interface is appropriately
+ * updated, and the advertising mask has the "other" baseX_Full flag
+ * cleared.
+ */
+void phylink_helper_basex_speed(struct phylink_link_state *state)
+{
+	if (phy_interface_mode_is_8023z(state->interface)) {
+		bool want_2500 = state->an_enabled ?
+			phylink_test(state->advertising, 2500baseX_Full) :
+			state->speed == SPEED_2500;
+
+		if (want_2500) {
+			phylink_clear(state->advertising, 1000baseX_Full);
+			state->interface = PHY_INTERFACE_MODE_2500BASEX;
+		} else {
+			phylink_clear(state->advertising, 2500baseX_Full);
+			state->interface = PHY_INTERFACE_MODE_1000BASEX;
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(phylink_helper_basex_speed);
+
 MODULE_LICENSE("GPL");
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 50eeae025f1e..021fc6595856 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -234,5 +234,6 @@ int phylink_mii_ioctl(struct phylink *, struct ifreq *, int);
 #define phylink_test(bm, mode)	__phylink_do_bit(test_bit, bm, mode)
 
 void phylink_set_port_modes(unsigned long *bits);
+void phylink_helper_basex_speed(struct phylink_link_state *state);
 
 #endif
-- 
cgit v1.2.3


From 48ae5554a076c1bca31448d60263e4038def9f6f Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Wed, 8 Aug 2018 09:04:29 +0100
Subject: net: stmmac: Add XGMAC 2.10 HWIF entry

Add a new entry to HWIF table for XGMAC 2.10. For now we fill it with
empty callbacks which will be added in posterior patches.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h | 14 +++++++------
 drivers/net/ethernet/stmicro/stmmac/hwif.c   | 31 ++++++++++++++++++++++++++--
 include/linux/stmmac.h                       |  1 +
 3 files changed, 38 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 78fd0f8b8e81..3fb81acbd274 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -36,12 +36,14 @@
 #include "mmc.h"
 
 /* Synopsys Core versions */
-#define	DWMAC_CORE_3_40	0x34
-#define	DWMAC_CORE_3_50	0x35
-#define	DWMAC_CORE_4_00	0x40
-#define DWMAC_CORE_4_10	0x41
-#define DWMAC_CORE_5_00 0x50
-#define DWMAC_CORE_5_10 0x51
+#define	DWMAC_CORE_3_40		0x34
+#define	DWMAC_CORE_3_50		0x35
+#define	DWMAC_CORE_4_00		0x40
+#define DWMAC_CORE_4_10		0x41
+#define DWMAC_CORE_5_00		0x50
+#define DWMAC_CORE_5_10		0x51
+#define DWXGMAC_CORE_2_10	0x21
+
 #define STMMAC_CHAN0	0	/* Always supported and default for all chips */
 
 /* These need to be power of two, and >= 4 */
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
index 1f50e83cafb2..24f5ff175aa4 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -72,6 +72,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 static const struct stmmac_hwif_entry {
 	bool gmac;
 	bool gmac4;
+	bool xgmac;
 	u32 min_id;
 	const struct stmmac_regs_off regs;
 	const void *desc;
@@ -87,6 +88,7 @@ static const struct stmmac_hwif_entry {
 	{
 		.gmac = false,
 		.gmac4 = false,
+		.xgmac = false,
 		.min_id = 0,
 		.regs = {
 			.ptp_off = PTP_GMAC3_X_OFFSET,
@@ -103,6 +105,7 @@ static const struct stmmac_hwif_entry {
 	}, {
 		.gmac = true,
 		.gmac4 = false,
+		.xgmac = false,
 		.min_id = 0,
 		.regs = {
 			.ptp_off = PTP_GMAC3_X_OFFSET,
@@ -119,6 +122,7 @@ static const struct stmmac_hwif_entry {
 	}, {
 		.gmac = false,
 		.gmac4 = true,
+		.xgmac = false,
 		.min_id = 0,
 		.regs = {
 			.ptp_off = PTP_GMAC4_OFFSET,
@@ -135,6 +139,7 @@ static const struct stmmac_hwif_entry {
 	}, {
 		.gmac = false,
 		.gmac4 = true,
+		.xgmac = false,
 		.min_id = DWMAC_CORE_4_00,
 		.regs = {
 			.ptp_off = PTP_GMAC4_OFFSET,
@@ -151,6 +156,7 @@ static const struct stmmac_hwif_entry {
 	}, {
 		.gmac = false,
 		.gmac4 = true,
+		.xgmac = false,
 		.min_id = DWMAC_CORE_4_10,
 		.regs = {
 			.ptp_off = PTP_GMAC4_OFFSET,
@@ -167,6 +173,7 @@ static const struct stmmac_hwif_entry {
 	}, {
 		.gmac = false,
 		.gmac4 = true,
+		.xgmac = false,
 		.min_id = DWMAC_CORE_5_10,
 		.regs = {
 			.ptp_off = PTP_GMAC4_OFFSET,
@@ -180,11 +187,29 @@ static const struct stmmac_hwif_entry {
 		.tc = &dwmac510_tc_ops,
 		.setup = dwmac4_setup,
 		.quirks = NULL,
-	}
+	}, {
+		.gmac = false,
+		.gmac4 = false,
+		.xgmac = true,
+		.min_id = DWXGMAC_CORE_2_10,
+		.regs = {
+			.ptp_off = 0,
+			.mmc_off = 0,
+		},
+		.desc = NULL,
+		.dma = NULL,
+		.mac = NULL,
+		.hwtimestamp = NULL,
+		.mode = NULL,
+		.tc = NULL,
+		.setup = NULL,
+		.quirks = NULL,
+	},
 };
 
 int stmmac_hwif_init(struct stmmac_priv *priv)
 {
+	bool needs_xgmac = priv->plat->has_xgmac;
 	bool needs_gmac4 = priv->plat->has_gmac4;
 	bool needs_gmac = priv->plat->has_gmac;
 	const struct stmmac_hwif_entry *entry;
@@ -195,7 +220,7 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 
 	if (needs_gmac) {
 		id = stmmac_get_id(priv, GMAC_VERSION);
-	} else if (needs_gmac4) {
+	} else if (needs_gmac4 || needs_xgmac) {
 		id = stmmac_get_id(priv, GMAC4_VERSION);
 	} else {
 		id = 0;
@@ -229,6 +254,8 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 			continue;
 		if (needs_gmac4 ^ entry->gmac4)
 			continue;
+		if (needs_xgmac ^ entry->xgmac)
+			continue;
 		/* Use synopsys_id var because some setups can override this */
 		if (priv->synopsys_id < entry->min_id)
 			continue;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 32feac5bbd75..c43e9a01b892 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -190,5 +190,6 @@ struct plat_stmmacenet_data {
 	bool tso_en;
 	int mac_port_sel_speed;
 	bool en_tx_lpi_clockgating;
+	int has_xgmac;
 };
 #endif
-- 
cgit v1.2.3


From 2d2917f7747805a1f4188672f308d82a8ba01700 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Thu, 9 Aug 2018 14:04:14 -0600
Subject: PCI: Export pcie_has_flr()

pcie_flr() suggests pcie_has_flr() to ensure that PCIe FLR support is
present prior to calling.  pcie_flr() is exported while pcie_has_flr()
is not.  Resolve this.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.c   | 3 ++-
 include/linux/pci.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index d123c2b173da..85e5b80a69a7 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4039,7 +4039,7 @@ static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout)
  * Returns true if the device advertises support for PCIe function level
  * resets.
  */
-static bool pcie_has_flr(struct pci_dev *dev)
+bool pcie_has_flr(struct pci_dev *dev)
 {
 	u32 cap;
 
@@ -4049,6 +4049,7 @@ static bool pcie_has_flr(struct pci_dev *dev)
 	pcie_capability_read_dword(dev, PCI_EXP_DEVCAP, &cap);
 	return cap & PCI_EXP_DEVCAP_FLR;
 }
+EXPORT_SYMBOL_GPL(pcie_has_flr);
 
 /**
  * pcie_flr - initiate a PCIe function level reset
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 307b336496f6..bace761deff2 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1090,6 +1090,7 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
 			     enum pci_bus_speed *speed,
 			     enum pcie_link_width *width);
 void pcie_print_link_status(struct pci_dev *dev);
+bool pcie_has_flr(struct pci_dev *dev);
 int pcie_flr(struct pci_dev *dev);
 int __pci_reset_function_locked(struct pci_dev *dev);
 int pci_reset_function(struct pci_dev *dev);
-- 
cgit v1.2.3


From 5e7baf0fcb2a3aef7329f3c7543d4695a46bd321 Mon Sep 17 00:00:00 2001
From: Manish Chopra <manish.chopra@cavium.com>
Date: Thu, 9 Aug 2018 11:13:49 -0700
Subject: qed/qede: Multi CoS support.

This patch adds support for tc mqprio offload,
using this different traffic classes on the adapter
can be utilized based on configured priority to tc map.

For example -

tc qdisc add dev eth0 root mqprio num_tc 4 map 0 1 2 3

This will cause SKBs with priority 0,1,2,3 to transmit
over tc 0,1,2,3 hardware queues respectively.

Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c        |   9 +-
 drivers/net/ethernet/qlogic/qed/qed_main.c      |   5 +-
 drivers/net/ethernet/qlogic/qede/qede.h         |  13 +++
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |  48 ++++++--
 drivers/net/ethernet/qlogic/qede/qede_fp.c      |  29 +++--
 drivers/net/ethernet/qlogic/qede/qede_main.c    | 139 +++++++++++++++++++-----
 include/linux/qed/qed_eth_if.h                  |   6 +
 7 files changed, 200 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 5ede6408649d..82a1bd1f8a8c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -2188,16 +2188,17 @@ out:
 static int qed_fill_eth_dev_info(struct qed_dev *cdev,
 				 struct qed_dev_eth_info *info)
 {
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
 	int i;
 
 	memset(info, 0, sizeof(*info));
 
-	info->num_tc = 1;
-
 	if (IS_PF(cdev)) {
 		int max_vf_vlan_filters = 0;
 		int max_vf_mac_filters = 0;
 
+		info->num_tc = p_hwfn->hw_info.num_hw_tc;
+
 		if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) {
 			u16 num_queues = 0;
 
@@ -2248,6 +2249,8 @@ static int qed_fill_eth_dev_info(struct qed_dev *cdev,
 	} else {
 		u16 total_cids = 0;
 
+		info->num_tc = 1;
+
 		/* Determine queues &  XDP support */
 		for_each_hwfn(cdev, i) {
 			struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
@@ -2554,7 +2557,7 @@ static int qed_start_txq(struct qed_dev *cdev,
 
 	rc = qed_eth_tx_queue_start(p_hwfn,
 				    p_hwfn->hw_info.opaque_fid,
-				    p_params, 0,
+				    p_params, p_params->tc,
 				    pbl_addr, pbl_size, ret_params);
 
 	if (rc) {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index dbe81310c0b6..2094d86a7a08 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -948,13 +948,14 @@ static void qed_update_pf_params(struct qed_dev *cdev,
 		params->eth_pf_params.num_arfs_filters = 0;
 
 	/* In case we might support RDMA, don't allow qede to be greedy
-	 * with the L2 contexts. Allow for 64 queues [rx, tx, xdp] per hwfn.
+	 * with the L2 contexts. Allow for 64 queues [rx, tx cos, xdp]
+	 * per hwfn.
 	 */
 	if (QED_IS_RDMA_PERSONALITY(QED_LEADING_HWFN(cdev))) {
 		u16 *num_cons;
 
 		num_cons = &params->eth_pf_params.num_cons;
-		*num_cons = min_t(u16, *num_cons, 192);
+		*num_cons = min_t(u16, *num_cons, QED_MAX_L2_CONS);
 	}
 
 	for (i = 0; i < cdev->num_hwfns; i++) {
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index d7ed0d3dbf71..e90c60a8fb01 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -386,6 +386,15 @@ struct qede_tx_queue {
 #define QEDE_TXQ_XDP_TO_IDX(edev, txq)	((txq)->index - \
 					 QEDE_MAX_TSS_CNT(edev))
 #define QEDE_TXQ_IDX_TO_XDP(edev, idx)	((idx) + QEDE_MAX_TSS_CNT(edev))
+#define QEDE_NDEV_TXQ_ID_TO_FP_ID(edev, idx)	((edev)->fp_num_rx + \
+						 ((idx) % QEDE_TSS_COUNT(edev)))
+#define QEDE_NDEV_TXQ_ID_TO_TXQ_COS(edev, idx)	((idx) / QEDE_TSS_COUNT(edev))
+#define QEDE_TXQ_TO_NDEV_TXQ_ID(edev, txq)	((QEDE_TSS_COUNT(edev) * \
+						 (txq)->cos) + (txq)->index)
+#define QEDE_NDEV_TXQ_ID_TO_TXQ(edev, idx)	\
+	(&((edev)->fp_array[QEDE_NDEV_TXQ_ID_TO_FP_ID(edev, idx)].txq \
+	[QEDE_NDEV_TXQ_ID_TO_TXQ_COS(edev, idx)]))
+#define QEDE_FP_TC0_TXQ(fp)	(&((fp)->txq[0]))
 
 	/* Regular Tx requires skb + metadata for release purpose,
 	 * while XDP requires the pages and the mapped address.
@@ -399,6 +408,8 @@ struct qede_tx_queue {
 
 	/* Slowpath; Should be kept in end [unless missing padding] */
 	void *handle;
+	u16 cos;
+	u16 ndev_txq_id;
 };
 
 #define BD_UNMAP_ADDR(bd)		HILO_U64(le32_to_cpu((bd)->addr.hi), \
@@ -541,5 +552,7 @@ void qede_update_rx_prod(struct qede_dev *edev, struct qede_rx_queue *rxq);
 #define QEDE_RX_HDR_SIZE		256
 #define QEDE_MAX_JUMBO_PACKET_SIZE	9600
 #define	for_each_queue(i) for (i = 0; i < edev->num_queues; i++)
+#define for_each_cos_in_txq(edev, var) \
+	for ((var) = 0; (var) < (edev)->dev_info.num_tc; (var)++)
 
 #endif /* _QEDE_H_ */
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index b37857f3f950..2bd84d6d9b15 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -222,7 +222,7 @@ static void qede_get_strings_stats_txq(struct qede_dev *edev,
 				QEDE_TXQ_XDP_TO_IDX(edev, txq),
 				qede_tqstats_arr[i].string);
 		else
-			sprintf(*buf, "%d: %s", txq->index,
+			sprintf(*buf, "%d_%d: %s", txq->index, txq->cos,
 				qede_tqstats_arr[i].string);
 		*buf += ETH_GSTRING_LEN;
 	}
@@ -262,8 +262,13 @@ static void qede_get_strings_stats(struct qede_dev *edev, u8 *buf)
 		if (fp->type & QEDE_FASTPATH_XDP)
 			qede_get_strings_stats_txq(edev, fp->xdp_tx, &buf);
 
-		if (fp->type & QEDE_FASTPATH_TX)
-			qede_get_strings_stats_txq(edev, fp->txq, &buf);
+		if (fp->type & QEDE_FASTPATH_TX) {
+			int cos;
+
+			for_each_cos_in_txq(edev, cos)
+				qede_get_strings_stats_txq(edev,
+							   &fp->txq[cos], &buf);
+		}
 	}
 
 	/* Account for non-queue statistics */
@@ -338,8 +343,12 @@ static void qede_get_ethtool_stats(struct net_device *dev,
 		if (fp->type & QEDE_FASTPATH_XDP)
 			qede_get_ethtool_stats_txq(fp->xdp_tx, &buf);
 
-		if (fp->type & QEDE_FASTPATH_TX)
-			qede_get_ethtool_stats_txq(fp->txq, &buf);
+		if (fp->type & QEDE_FASTPATH_TX) {
+			int cos;
+
+			for_each_cos_in_txq(edev, cos)
+				qede_get_ethtool_stats_txq(&fp->txq[cos], &buf);
+		}
 	}
 
 	for (i = 0; i < QEDE_NUM_STATS; i++) {
@@ -366,7 +375,8 @@ static int qede_get_sset_count(struct net_device *dev, int stringset)
 				num_stats--;
 
 		/* Account for the Regular Tx statistics */
-		num_stats += QEDE_TSS_COUNT(edev) * QEDE_NUM_TQSTATS;
+		num_stats += QEDE_TSS_COUNT(edev) * QEDE_NUM_TQSTATS *
+				edev->dev_info.num_tc;
 
 		/* Account for the Regular Rx statistics */
 		num_stats += QEDE_RSS_COUNT(edev) * QEDE_NUM_RQSTATS;
@@ -741,9 +751,17 @@ static int qede_get_coalesce(struct net_device *dev,
 		}
 
 		for_each_queue(i) {
+			struct qede_tx_queue *txq;
+
 			fp = &edev->fp_array[i];
+
+			/* All TX queues of given fastpath uses same
+			 * coalescing value, so no need to iterate over
+			 * all TCs, TC0 txq should suffice.
+			 */
 			if (fp->type & QEDE_FASTPATH_TX) {
-				tx_handle = fp->txq->handle;
+				txq = QEDE_FP_TC0_TXQ(fp);
+				tx_handle = txq->handle;
 				break;
 			}
 		}
@@ -801,9 +819,17 @@ static int qede_set_coalesce(struct net_device *dev,
 		}
 
 		if (edev->fp_array[i].type & QEDE_FASTPATH_TX) {
+			struct qede_tx_queue *txq;
+
+			/* All TX queues of given fastpath uses same
+			 * coalescing value, so no need to iterate over
+			 * all TCs, TC0 txq should suffice.
+			 */
+			txq = QEDE_FP_TC0_TXQ(fp);
+
 			rc = edev->ops->common->set_coalesce(edev->cdev,
 							     0, txc,
-							     fp->txq->handle);
+							     txq->handle);
 			if (rc) {
 				DP_INFO(edev,
 					"Set TX coalesce error, rc = %d\n", rc);
@@ -1385,8 +1411,10 @@ static int qede_selftest_transmit_traffic(struct qede_dev *edev,
 	u16 val;
 
 	for_each_queue(i) {
-		if (edev->fp_array[i].type & QEDE_FASTPATH_TX) {
-			txq = edev->fp_array[i].txq;
+		struct qede_fastpath *fp = &edev->fp_array[i];
+
+		if (fp->type & QEDE_FASTPATH_TX) {
+			txq = QEDE_FP_TC0_TXQ(fp);
 			break;
 		}
 	}
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 8c9e95ba9917..1a78027de071 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -408,12 +408,12 @@ static void qede_xdp_tx_int(struct qede_dev *edev, struct qede_tx_queue *txq)
 
 static int qede_tx_int(struct qede_dev *edev, struct qede_tx_queue *txq)
 {
+	unsigned int pkts_compl = 0, bytes_compl = 0;
 	struct netdev_queue *netdev_txq;
 	u16 hw_bd_cons;
-	unsigned int pkts_compl = 0, bytes_compl = 0;
 	int rc;
 
-	netdev_txq = netdev_get_tx_queue(edev->ndev, txq->index);
+	netdev_txq = netdev_get_tx_queue(edev->ndev, txq->ndev_txq_id);
 
 	hw_bd_cons = le16_to_cpu(*txq->hw_cons_ptr);
 	barrier();
@@ -1365,9 +1365,14 @@ static bool qede_poll_is_more_work(struct qede_fastpath *fp)
 		if (qede_txq_has_work(fp->xdp_tx))
 			return true;
 
-	if (likely(fp->type & QEDE_FASTPATH_TX))
-		if (qede_txq_has_work(fp->txq))
-			return true;
+	if (likely(fp->type & QEDE_FASTPATH_TX)) {
+		int cos;
+
+		for_each_cos_in_txq(fp->edev, cos) {
+			if (qede_txq_has_work(&fp->txq[cos]))
+				return true;
+		}
+	}
 
 	return false;
 }
@@ -1382,8 +1387,14 @@ int qede_poll(struct napi_struct *napi, int budget)
 	struct qede_dev *edev = fp->edev;
 	int rx_work_done = 0;
 
-	if (likely(fp->type & QEDE_FASTPATH_TX) && qede_txq_has_work(fp->txq))
-		qede_tx_int(edev, fp->txq);
+	if (likely(fp->type & QEDE_FASTPATH_TX)) {
+		int cos;
+
+		for_each_cos_in_txq(fp->edev, cos) {
+			if (qede_txq_has_work(&fp->txq[cos]))
+				qede_tx_int(edev, &fp->txq[cos]);
+		}
+	}
 
 	if ((fp->type & QEDE_FASTPATH_XDP) && qede_txq_has_work(fp->xdp_tx))
 		qede_xdp_tx_int(edev, fp->xdp_tx);
@@ -1444,8 +1455,8 @@ netdev_tx_t qede_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 
 	/* Get tx-queue context and netdev index */
 	txq_index = skb_get_queue_mapping(skb);
-	WARN_ON(txq_index >= QEDE_TSS_COUNT(edev));
-	txq = edev->fp_array[edev->fp_num_rx + txq_index].txq;
+	WARN_ON(txq_index >= QEDE_TSS_COUNT(edev) * edev->dev_info.num_tc);
+	txq = QEDE_NDEV_TXQ_ID_TO_TXQ(edev, txq_index);
 	netdev_txq = netdev_get_tx_queue(ndev, txq_index);
 
 	WARN_ON(qed_chain_get_elem_left(&txq->tx_pbl) < (MAX_SKB_FRAGS + 1));
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 6a796040a32c..d7299afa902c 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -536,6 +536,43 @@ static int qede_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return 0;
 }
 
+int qede_setup_tc(struct net_device *ndev, u8 num_tc)
+{
+	struct qede_dev *edev = netdev_priv(ndev);
+	int cos, count, offset;
+
+	if (num_tc > edev->dev_info.num_tc)
+		return -EINVAL;
+
+	netdev_reset_tc(ndev);
+	netdev_set_num_tc(ndev, num_tc);
+
+	for_each_cos_in_txq(edev, cos) {
+		count = QEDE_TSS_COUNT(edev);
+		offset = cos * QEDE_TSS_COUNT(edev);
+		netdev_set_tc_queue(ndev, cos, count, offset);
+	}
+
+	return 0;
+}
+
+static int
+qede_setup_tc_offload(struct net_device *dev, enum tc_setup_type type,
+		      void *type_data)
+{
+	struct tc_mqprio_qopt *mqprio;
+
+	switch (type) {
+	case TC_SETUP_QDISC_MQPRIO:
+		mqprio = type_data;
+
+		mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+		return qede_setup_tc(dev, mqprio->num_tc);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static const struct net_device_ops qede_netdev_ops = {
 	.ndo_open = qede_open,
 	.ndo_stop = qede_close,
@@ -568,6 +605,7 @@ static const struct net_device_ops qede_netdev_ops = {
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer = qede_rx_flow_steer,
 #endif
+	.ndo_setup_tc = qede_setup_tc_offload,
 };
 
 static const struct net_device_ops qede_netdev_vf_ops = {
@@ -621,7 +659,8 @@ static struct qede_dev *qede_alloc_etherdev(struct qed_dev *cdev,
 	struct qede_dev *edev;
 
 	ndev = alloc_etherdev_mqs(sizeof(*edev),
-				  info->num_queues, info->num_queues);
+				  info->num_queues * info->num_tc,
+				  info->num_queues);
 	if (!ndev) {
 		pr_err("etherdev allocation failed\n");
 		return NULL;
@@ -830,7 +869,8 @@ static int qede_alloc_fp_array(struct qede_dev *edev)
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
-			fp->txq = kzalloc(sizeof(*fp->txq), GFP_KERNEL);
+			fp->txq = kcalloc(edev->dev_info.num_tc,
+					  sizeof(*fp->txq), GFP_KERNEL);
 			if (!fp->txq)
 				goto err;
 		}
@@ -879,10 +919,15 @@ static void qede_sp_task(struct work_struct *work)
 static void qede_update_pf_params(struct qed_dev *cdev)
 {
 	struct qed_pf_params pf_params;
+	u16 num_cons;
 
 	/* 64 rx + 64 tx + 64 XDP */
 	memset(&pf_params, 0, sizeof(struct qed_pf_params));
-	pf_params.eth_pf_params.num_cons = (MAX_SB_PER_PF_MIMD - 1) * 3;
+
+	/* 1 rx + 1 xdp + max tx cos */
+	num_cons = QED_MIN_L2_CONS;
+
+	pf_params.eth_pf_params.num_cons = (MAX_SB_PER_PF_MIMD - 1) * num_cons;
 
 	/* Same for VFs - make sure they'll have sufficient connections
 	 * to support XDP Tx queues.
@@ -1363,8 +1408,12 @@ static void qede_free_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp)
 	if (fp->type & QEDE_FASTPATH_XDP)
 		qede_free_mem_txq(edev, fp->xdp_tx);
 
-	if (fp->type & QEDE_FASTPATH_TX)
-		qede_free_mem_txq(edev, fp->txq);
+	if (fp->type & QEDE_FASTPATH_TX) {
+		int cos;
+
+		for_each_cos_in_txq(edev, cos)
+			qede_free_mem_txq(edev, &fp->txq[cos]);
+	}
 }
 
 /* This function allocates all memory needed for a single fp (i.e. an entity
@@ -1391,9 +1440,13 @@ static int qede_alloc_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp)
 	}
 
 	if (fp->type & QEDE_FASTPATH_TX) {
-		rc = qede_alloc_mem_txq(edev, fp->txq);
-		if (rc)
-			goto out;
+		int cos;
+
+		for_each_cos_in_txq(edev, cos) {
+			rc = qede_alloc_mem_txq(edev, &fp->txq[cos]);
+			if (rc)
+				goto out;
+		}
 	}
 
 out:
@@ -1466,10 +1519,23 @@ static void qede_init_fp(struct qede_dev *edev)
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
-			fp->txq->index = txq_index++;
-			if (edev->dev_info.is_legacy)
-				fp->txq->is_legacy = 1;
-			fp->txq->dev = &edev->pdev->dev;
+			int cos;
+
+			for_each_cos_in_txq(edev, cos) {
+				struct qede_tx_queue *txq = &fp->txq[cos];
+				u16 ndev_tx_id;
+
+				txq->cos = cos;
+				txq->index = txq_index;
+				ndev_tx_id = QEDE_TXQ_TO_NDEV_TXQ_ID(edev, txq);
+				txq->ndev_txq_id = ndev_tx_id;
+
+				if (edev->dev_info.is_legacy)
+					txq->is_legacy = 1;
+				txq->dev = &edev->pdev->dev;
+			}
+
+			txq_index++;
 		}
 
 		snprintf(fp->name, sizeof(fp->name), "%s-fp-%d",
@@ -1483,7 +1549,9 @@ static int qede_set_real_num_queues(struct qede_dev *edev)
 {
 	int rc = 0;
 
-	rc = netif_set_real_num_tx_queues(edev->ndev, QEDE_TSS_COUNT(edev));
+	rc = netif_set_real_num_tx_queues(edev->ndev,
+					  QEDE_TSS_COUNT(edev) *
+					  edev->dev_info.num_tc);
 	if (rc) {
 		DP_NOTICE(edev, "Failed to set real number of Tx queues\n");
 		return rc;
@@ -1685,9 +1753,13 @@ static int qede_stop_queues(struct qede_dev *edev)
 		fp = &edev->fp_array[i];
 
 		if (fp->type & QEDE_FASTPATH_TX) {
-			rc = qede_drain_txq(edev, fp->txq, true);
-			if (rc)
-				return rc;
+			int cos;
+
+			for_each_cos_in_txq(edev, cos) {
+				rc = qede_drain_txq(edev, &fp->txq[cos], true);
+				if (rc)
+					return rc;
+			}
 		}
 
 		if (fp->type & QEDE_FASTPATH_XDP) {
@@ -1703,9 +1775,13 @@ static int qede_stop_queues(struct qede_dev *edev)
 
 		/* Stop the Tx Queue(s) */
 		if (fp->type & QEDE_FASTPATH_TX) {
-			rc = qede_stop_txq(edev, fp->txq, i);
-			if (rc)
-				return rc;
+			int cos;
+
+			for_each_cos_in_txq(edev, cos) {
+				rc = qede_stop_txq(edev, &fp->txq[cos], i);
+				if (rc)
+					return rc;
+			}
 		}
 
 		/* Stop the Rx Queue */
@@ -1758,6 +1834,7 @@ static int qede_start_txq(struct qede_dev *edev,
 
 	params.p_sb = fp->sb_info;
 	params.sb_idx = sb_idx;
+	params.tc = txq->cos;
 
 	rc = edev->ops->q_tx_start(edev->cdev, rss_id, &params, phys_table,
 				   page_cnt, &ret_params);
@@ -1877,9 +1954,14 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
-			rc = qede_start_txq(edev, fp, fp->txq, i, TX_PI(0));
-			if (rc)
-				goto out;
+			int cos;
+
+			for_each_cos_in_txq(edev, cos) {
+				rc = qede_start_txq(edev, fp, &fp->txq[cos], i,
+						    TX_PI(cos));
+				if (rc)
+					goto out;
+			}
 		}
 	}
 
@@ -1973,6 +2055,7 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode,
 		     bool is_locked)
 {
 	struct qed_link_params link_params;
+	u8 num_tc;
 	int rc;
 
 	DP_INFO(edev, "Starting qede load\n");
@@ -2019,6 +2102,10 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode,
 		goto err4;
 	DP_INFO(edev, "Start VPORT, RXQ and TXQ succeeded\n");
 
+	num_tc = netdev_get_num_tc(edev->ndev);
+	num_tc = num_tc ? num_tc : edev->dev_info.num_tc;
+	qede_setup_tc(edev->ndev, num_tc);
+
 	/* Program un-configured VLANs */
 	qede_configure_vlan_filters(edev);
 
@@ -2143,7 +2230,7 @@ static bool qede_is_txq_full(struct qede_dev *edev, struct qede_tx_queue *txq)
 {
 	struct netdev_queue *netdev_txq;
 
-	netdev_txq = netdev_get_tx_queue(edev->ndev, txq->index);
+	netdev_txq = netdev_get_tx_queue(edev->ndev, txq->ndev_txq_id);
 	if (netif_xmit_stopped(netdev_txq))
 		return true;
 
@@ -2208,9 +2295,11 @@ static void qede_get_eth_tlv_data(void *dev, void *data)
 	for_each_queue(i) {
 		fp = &edev->fp_array[i];
 		if (fp->type & QEDE_FASTPATH_TX) {
-			if (fp->txq->sw_tx_cons != fp->txq->sw_tx_prod)
+			struct qede_tx_queue *txq = QEDE_FP_TC0_TXQ(fp);
+
+			if (txq->sw_tx_cons != txq->sw_tx_prod)
 				etlv->txqs_empty = false;
-			if (qede_is_txq_full(edev, fp->txq))
+			if (qede_is_txq_full(edev, txq))
 				etlv->num_txqs_full++;
 		}
 		if (fp->type & QEDE_FASTPATH_RX) {
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 2978fa4add42..a1310482c4ed 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -39,6 +39,10 @@
 #include <linux/qed/qed_if.h>
 #include <linux/qed/qed_iov_if.h>
 
+/* 64 max queues * (1 rx + 4 tx-cos + 1 xdp) */
+#define QED_MIN_L2_CONS (2 + NUM_PHYS_TCS_4PORT_K2)
+#define QED_MAX_L2_CONS (64 * (QED_MIN_L2_CONS))
+
 struct qed_queue_start_common_params {
 	/* Should always be relative to entity sending this. */
 	u8 vport_id;
@@ -49,6 +53,8 @@ struct qed_queue_start_common_params {
 
 	struct qed_sb_info *p_sb;
 	u8 sb_idx;
+
+	u8 tc;
 };
 
 struct qed_rxq_start_ret_params {
-- 
cgit v1.2.3


From 15693fd37fc3a49da356164ed15b571c175efc34 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 8 Aug 2018 19:40:31 +0800
Subject: net: skbuff.h: fix using plain integer as NULL warning

Fixes the following sparse warning:
./include/linux/skbuff.h:2365:58: warning: Using plain integer as NULL pointer

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7ebdf158a795..7e237a63a70c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2362,7 +2362,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb,
 	if (skb_transport_header_was_set(skb))
 		return;
 
-	if (skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0))
+	if (skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0))
 		skb_set_transport_header(skb, keys.control.thoff);
 	else
 		skb_set_transport_header(skb, offset_hint);
-- 
cgit v1.2.3


From bd2e9567db72e37f7f4b90faa5133bc7365b5f65 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 9 Aug 2018 16:19:52 -0500
Subject: PCI: Hide ACS quirk declarations inside PCI core

Move declarations for these functions:

  pci_dev_specific_acs_enabled()
  pci_dev_specific_enable_acs()

from include/linux/pci.h to drivers/pci/pci.h because nothing outside the
PCI core needs to use them.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.h   | 14 ++++++++++++++
 include/linux/pci.h | 11 -----------
 2 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index c358e7a07f3f..a1224fef3409 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -352,6 +352,20 @@ static inline resource_size_t pci_resource_alignment(struct pci_dev *dev,
 }
 
 void pci_enable_acs(struct pci_dev *dev);
+#ifdef CONFIG_PCI_QUIRKS
+int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags);
+int pci_dev_specific_enable_acs(struct pci_dev *dev);
+#else
+static inline int pci_dev_specific_acs_enabled(struct pci_dev *dev,
+					       u16 acs_flags)
+{
+	return -ENOTTY;
+}
+static inline int pci_dev_specific_enable_acs(struct pci_dev *dev)
+{
+	return -ENOTTY;
+}
+#endif
 
 /* PCI error reporting and recovery */
 void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 340029b2fb38..1e1ed049dd1c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1876,20 +1876,9 @@ enum pci_fixup_pass {
 
 #ifdef CONFIG_PCI_QUIRKS
 void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev);
-int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags);
-int pci_dev_specific_enable_acs(struct pci_dev *dev);
 #else
 static inline void pci_fixup_device(enum pci_fixup_pass pass,
 				    struct pci_dev *dev) { }
-static inline int pci_dev_specific_acs_enabled(struct pci_dev *dev,
-					       u16 acs_flags)
-{
-	return -ENOTTY;
-}
-static inline int pci_dev_specific_enable_acs(struct pci_dev *dev)
-{
-	return -ENOTTY;
-}
 #endif
 
 void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen);
-- 
cgit v1.2.3


From b0768a86585d4d951a30ff565f19598dbbd67897 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Fri, 3 Aug 2018 16:58:09 +0900
Subject: net: Export skb_headers_offset_update

This is needed for veth XDP which does skb_copy_expand()-like operation.

v2:
- Drop skb_copy_header part because it has already been exported now.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skbuff.h | 1 +
 net/core/skbuff.c      | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7ebdf158a795..e93b157f526c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1038,6 +1038,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 }
 
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
+void skb_headers_offset_update(struct sk_buff *skb, int off);
 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8d574a88125d..c996c09d095f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1291,7 +1291,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(skb_clone);
 
-static void skb_headers_offset_update(struct sk_buff *skb, int off)
+void skb_headers_offset_update(struct sk_buff *skb, int off)
 {
 	/* Only adjust this if it actually is csum_start rather than csum */
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
@@ -1305,6 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
 	skb->inner_network_header += off;
 	skb->inner_mac_header += off;
 }
+EXPORT_SYMBOL(skb_headers_offset_update);
 
 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
 {
-- 
cgit v1.2.3


From 0b19cc0a8694d4295383f88bc3441765875a57bc Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Fri, 3 Aug 2018 16:58:15 +0900
Subject: bpf: Make redirect_info accessible from modules

We are going to add kern_flags field in redirect_info for kernel
internal use.
In order to avoid function call to access the flags, make redirect_info
accessible from modules. Also as it is now non-static, add prefix bpf_
to redirect_info.

v6:
- Fix sparse warning around EXPORT_SYMBOL.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h | 10 ++++++++++
 net/core/filter.c      | 29 +++++++++++------------------
 2 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index c73dd7396886..4717af8b95e6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -537,6 +537,16 @@ struct sk_msg_buff {
 	struct list_head list;
 };
 
+struct bpf_redirect_info {
+	u32 ifindex;
+	u32 flags;
+	struct bpf_map *map;
+	struct bpf_map *map_to_flush;
+	unsigned long   map_owner;
+};
+
+DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
+
 /* Compute the linear packet data range [data, data_end) which
  * will be accessed by various program types (cls_bpf, act_bpf,
  * lwt, ...). Subsystems allowing direct data access must (!)
diff --git a/net/core/filter.c b/net/core/filter.c
index 587bbfbd7db3..2de7dd9f2a57 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2082,19 +2082,12 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
 	.arg3_type      = ARG_ANYTHING,
 };
 
-struct redirect_info {
-	u32 ifindex;
-	u32 flags;
-	struct bpf_map *map;
-	struct bpf_map *map_to_flush;
-	unsigned long   map_owner;
-};
-
-static DEFINE_PER_CPU(struct redirect_info, redirect_info);
+DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
+EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
 
 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return TC_ACT_SHOT;
@@ -2107,7 +2100,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
 
 int skb_do_redirect(struct sk_buff *skb)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct net_device *dev;
 
 	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
@@ -3200,7 +3193,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 
 void xdp_do_flush_map(void)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct bpf_map *map = ri->map_to_flush;
 
 	ri->map_to_flush = NULL;
@@ -3245,7 +3238,7 @@ static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog,
 static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 			       struct bpf_prog *xdp_prog)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	unsigned long map_owner = ri->map_owner;
 	struct bpf_map *map = ri->map;
 	u32 index = ri->ifindex;
@@ -3285,7 +3278,7 @@ err:
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 		    struct bpf_prog *xdp_prog)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct net_device *fwd;
 	u32 index = ri->ifindex;
 	int err;
@@ -3317,7 +3310,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 				       struct xdp_buff *xdp,
 				       struct bpf_prog *xdp_prog)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	unsigned long map_owner = ri->map_owner;
 	struct bpf_map *map = ri->map;
 	u32 index = ri->ifindex;
@@ -3368,7 +3361,7 @@ err:
 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 			    struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	u32 index = ri->ifindex;
 	struct net_device *fwd;
 	int err = 0;
@@ -3399,7 +3392,7 @@ EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
 
 BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
 	if (unlikely(flags))
 		return XDP_ABORTED;
@@ -3423,7 +3416,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
 BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags,
 	   unsigned long, map_owner)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
 	if (unlikely(flags))
 		return XDP_ABORTED;
-- 
cgit v1.2.3


From 2539650fadbf63a431e76535a9de7bff6ea5e409 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Fri, 3 Aug 2018 16:58:16 +0900
Subject: xdp: Helpers for disabling napi_direct of xdp_return_frame

We need some mechanism to disable napi_direct on calling
xdp_return_frame_rx_napi() from some context.
When veth gets support of XDP_REDIRECT, it will redirects packets which
are redirected from other devices. On redirection veth will reuse
xdp_mem_info of the redirection source device to make return_frame work.
But in this case .ndo_xdp_xmit() called from veth redirection uses
xdp_mem_info which is not guarded by NAPI, because the .ndo_xdp_xmit()
is not called directly from the rxq which owns the xdp_mem_info.

This approach introduces a flag in bpf_redirect_info to indicate that
napi_direct should be disabled even when _rx_napi variant is used as
well as helper functions to use it.

A NAPI handler who wants to use this flag needs to call
xdp_set_return_frame_no_direct() before processing packets, and call
xdp_clear_return_frame_no_direct() after xdp_do_flush_map() before
exiting NAPI.

v4:
- Use bpf_redirect_info for storing the flag instead of xdp_mem_info to
  avoid per-frame copy cost.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h | 25 +++++++++++++++++++++++++
 net/core/xdp.c         |  6 ++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4717af8b95e6..2b072dab32c0 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -543,10 +543,14 @@ struct bpf_redirect_info {
 	struct bpf_map *map;
 	struct bpf_map *map_to_flush;
 	unsigned long   map_owner;
+	u32 kern_flags;
 };
 
 DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
 
+/* flags for bpf_redirect_info kern_flags */
+#define BPF_RI_F_RF_NO_DIRECT	BIT(0)	/* no napi_direct on return_frame */
+
 /* Compute the linear packet data range [data, data_end) which
  * will be accessed by various program types (cls_bpf, act_bpf,
  * lwt, ...). Subsystems allowing direct data access must (!)
@@ -775,6 +779,27 @@ static inline bool bpf_dump_raw_ok(void)
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 
+static inline bool xdp_return_frame_no_direct(void)
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
+}
+
+static inline void xdp_set_return_frame_no_direct(void)
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT;
+}
+
+static inline void xdp_clear_return_frame_no_direct(void)
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
+}
+
 static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
 				 unsigned int pktlen)
 {
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 57285383ed00..3dd99e1c04f5 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -330,10 +330,12 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
-		if (xa)
+		if (xa) {
+			napi_direct &= !xdp_return_frame_no_direct();
 			page_pool_put_page(xa->page_pool, page, napi_direct);
-		else
+		} else {
 			put_page(page);
+		}
 		rcu_read_unlock();
 		break;
 	case MEM_TYPE_PAGE_SHARED:
-- 
cgit v1.2.3


From c9fbb2d25295a566b97d62e6904741e8e1702d83 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Fri, 10 Aug 2018 10:47:43 +0200
Subject: net: Provide stub for __netif_set_xps_queue if there is no CONFIG_XPS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Building virtio_net driver without CONFIG_XPS fails with:

    drivers/net/virtio_net.c: In function ‘virtnet_set_affinity’:
    drivers/net/virtio_net.c:1910:3: error: implicit declaration of function ‘__netif_set_xps_queue’ [-Werror=implicit-function-declaration]
       __netif_set_xps_queue(vi->dev, mask, i, false);
       ^
Fixes: 4d99f6602cb5 ("net: allow to call netif_reset_xps_queues() under cpus_read_lock")
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 282e2e95ad5b..ca5ab98053c8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3412,6 +3412,13 @@ static inline int netif_set_xps_queue(struct net_device *dev,
 {
 	return 0;
 }
+
+static inline int __netif_set_xps_queue(struct net_device *dev,
+					const unsigned long *mask,
+					u16 index, bool is_rxqs_map)
+{
+	return 0;
+}
 #endif
 
 /**
-- 
cgit v1.2.3


From d799a4de0a250f1bdd99765bb8e55a5e2f469a1f Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 3 Aug 2018 00:52:18 +0200
Subject: gpio: mmio: Fix up inverted direction registers

The bgpio_init() takes one of two arguments to specify a register
to set the direction of the GPIO line: either dirout that
indicates that a 1 in the bit in that register sets the
corresponding line to output, or dirin which indicates that
a 1 in the bit in that register sets the corresponding line to
input. Conversely setting the bit to 0 on these will turn the
line into input and output respectively. One of these can
be defined but not both.

This means that a platform that sets a bit to 1 for output
only defines dirout and a platform that sets a bit to 0 for
output only defines dirin. In short this defines the polarity
of the direction register.

Both can also be left as NULL meaning the GPIO chip is either
input only or output only.

Tomer Maimon discovered that for get/set chips (those where the
get and set registers are defined but no separate clear register,
and specifying BGPIOF_READ_OUTPUT_REG_SET so that we say we
want to read the output value from the SET register)
we are unconditionally reading the value from the SET register
when the direction bit is 1 and from the DAT register when the
direction bit is 0, not taking the direction bit polarity into
account.

It would be expected that when the direction bit is inverted
(dirin is defined but not dirout) we read the current value from
the DAT register when the bit is 1 and from the SET register
when the bit is 0.

Currently only some versions of ATH79, brcmstb, some versions of
CLP711x, GE, IOP and Loongson use the dirin mode (a 1 in the
register means input). They are unaffected because
BGPIOF_READ_OUTPUT_REG_SET is not set on any of them. (They
do not read back the SET register to figure out the output
value.) So this is no regression with current drivers.

However the behaviour is wrong and does not work with Tomer's
new driver where he needs to use the BGIOF_READ_OUTPUT_REG_SET.
This fixes the above issue by:

- Instead of defining separate functions for the inverted case,
  set up a flag in the gpio_chip that indicates that the
  direction is inverted.
- Remove the special inverted functions for setting
  input/output and getting the direction, rely on the flag
  instead.
- Respect this flag in bgpio_get_set() and
  bgpio_get_set_multiple()

Reported-by: Tomer Maimon <tmaimon77@gmail.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-mmio.c    | 108 ++++++++++++++++++++++++++------------------
 include/linux/gpio/driver.h |   3 ++
 2 files changed, 66 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c
index 7b14d6280e44..935292a30c99 100644
--- a/drivers/gpio/gpio-mmio.c
+++ b/drivers/gpio/gpio-mmio.c
@@ -136,8 +136,20 @@ static unsigned long bgpio_line2mask(struct gpio_chip *gc, unsigned int line)
 static int bgpio_get_set(struct gpio_chip *gc, unsigned int gpio)
 {
 	unsigned long pinmask = bgpio_line2mask(gc, gpio);
+	bool dir = !!(gc->bgpio_dir & pinmask);
 
-	if (gc->bgpio_dir & pinmask)
+	/*
+	 * If the direction is OUT we read the value from the SET
+	 * register, and if the direction is IN we read the value
+	 * from the DAT register.
+	 *
+	 * If the direction bits are inverted, naturally this gets
+	 * inverted too.
+	 */
+	if (gc->bgpio_dir_inverted)
+		dir = !dir;
+
+	if (dir)
 		return !!(gc->read_reg(gc->reg_set) & pinmask);
 	else
 		return !!(gc->read_reg(gc->reg_dat) & pinmask);
@@ -157,8 +169,13 @@ static int bgpio_get_set_multiple(struct gpio_chip *gc, unsigned long *mask,
 	*bits &= ~*mask;
 
 	/* Exploit the fact that we know which directions are set */
-	set_mask = *mask & gc->bgpio_dir;
-	get_mask = *mask & ~gc->bgpio_dir;
+	if (gc->bgpio_dir_inverted) {
+		set_mask = *mask & ~gc->bgpio_dir;
+		get_mask = *mask & gc->bgpio_dir;
+	} else {
+		set_mask = *mask & gc->bgpio_dir;
+		get_mask = *mask & ~gc->bgpio_dir;
+	}
 
 	if (set_mask)
 		*bits |= gc->read_reg(gc->reg_set) & set_mask;
@@ -359,7 +376,10 @@ static int bgpio_dir_in(struct gpio_chip *gc, unsigned int gpio)
 
 	spin_lock_irqsave(&gc->bgpio_lock, flags);
 
-	gc->bgpio_dir &= ~bgpio_line2mask(gc, gpio);
+	if (gc->bgpio_dir_inverted)
+		gc->bgpio_dir |= bgpio_line2mask(gc, gpio);
+	else
+		gc->bgpio_dir &= ~bgpio_line2mask(gc, gpio);
 	gc->write_reg(gc->reg_dir, gc->bgpio_dir);
 
 	spin_unlock_irqrestore(&gc->bgpio_lock, flags);
@@ -370,7 +390,10 @@ static int bgpio_dir_in(struct gpio_chip *gc, unsigned int gpio)
 static int bgpio_get_dir(struct gpio_chip *gc, unsigned int gpio)
 {
 	/* Return 0 if output, 1 of input */
-	return !(gc->read_reg(gc->reg_dir) & bgpio_line2mask(gc, gpio));
+	if (gc->bgpio_dir_inverted)
+		return !!(gc->read_reg(gc->reg_dir) & bgpio_line2mask(gc, gpio));
+	else
+		return !(gc->read_reg(gc->reg_dir) & bgpio_line2mask(gc, gpio));
 }
 
 static int bgpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
@@ -381,37 +404,10 @@ static int bgpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
 
 	spin_lock_irqsave(&gc->bgpio_lock, flags);
 
-	gc->bgpio_dir |= bgpio_line2mask(gc, gpio);
-	gc->write_reg(gc->reg_dir, gc->bgpio_dir);
-
-	spin_unlock_irqrestore(&gc->bgpio_lock, flags);
-
-	return 0;
-}
-
-static int bgpio_dir_in_inv(struct gpio_chip *gc, unsigned int gpio)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&gc->bgpio_lock, flags);
-
-	gc->bgpio_dir |= bgpio_line2mask(gc, gpio);
-	gc->write_reg(gc->reg_dir, gc->bgpio_dir);
-
-	spin_unlock_irqrestore(&gc->bgpio_lock, flags);
-
-	return 0;
-}
-
-static int bgpio_dir_out_inv(struct gpio_chip *gc, unsigned int gpio, int val)
-{
-	unsigned long flags;
-
-	gc->set(gc, gpio, val);
-
-	spin_lock_irqsave(&gc->bgpio_lock, flags);
-
-	gc->bgpio_dir &= ~bgpio_line2mask(gc, gpio);
+	if (gc->bgpio_dir_inverted)
+		gc->bgpio_dir &= ~bgpio_line2mask(gc, gpio);
+	else
+		gc->bgpio_dir |= bgpio_line2mask(gc, gpio);
 	gc->write_reg(gc->reg_dir, gc->bgpio_dir);
 
 	spin_unlock_irqrestore(&gc->bgpio_lock, flags);
@@ -419,12 +415,6 @@ static int bgpio_dir_out_inv(struct gpio_chip *gc, unsigned int gpio, int val)
 	return 0;
 }
 
-static int bgpio_get_dir_inv(struct gpio_chip *gc, unsigned int gpio)
-{
-	/* Return 0 if output, 1 if input */
-	return !!(gc->read_reg(gc->reg_dir) & bgpio_line2mask(gc, gpio));
-}
-
 static int bgpio_setup_accessors(struct device *dev,
 				 struct gpio_chip *gc,
 				 bool byte_be)
@@ -560,9 +550,10 @@ static int bgpio_setup_direction(struct gpio_chip *gc,
 		gc->get_direction = bgpio_get_dir;
 	} else if (dirin) {
 		gc->reg_dir = dirin;
-		gc->direction_output = bgpio_dir_out_inv;
-		gc->direction_input = bgpio_dir_in_inv;
-		gc->get_direction = bgpio_get_dir_inv;
+		gc->direction_output = bgpio_dir_out;
+		gc->direction_input = bgpio_dir_in;
+		gc->get_direction = bgpio_get_dir;
+		gc->bgpio_dir_inverted = true;
 	} else {
 		if (flags & BGPIOF_NO_OUTPUT)
 			gc->direction_output = bgpio_dir_out_err;
@@ -582,6 +573,33 @@ static int bgpio_request(struct gpio_chip *chip, unsigned gpio_pin)
 	return -EINVAL;
 }
 
+/**
+ * bgpio_init() - Initialize generic GPIO accessor functions
+ * @gc: the GPIO chip to set up
+ * @dev: the parent device of the new GPIO chip (compulsory)
+ * @sz: the size (width) of the MMIO registers in bytes, typically 1, 2 or 4
+ * @dat: MMIO address for the register to READ the value of the GPIO lines, it
+ *	is expected that a 1 in the corresponding bit in this register means the
+ *	line is asserted
+ * @set: MMIO address for the register to SET the value of the GPIO lines, it is
+ *	expected that we write the line with 1 in this register to drive the GPIO line
+ *	high.
+ * @clr: MMIO address for the register to CLEAR the value of the GPIO lines, it is
+ *	expected that we write the line with 1 in this register to drive the GPIO line
+ *	low. It is allowed to leave this address as NULL, in that case the SET register
+ *	will be assumed to also clear the GPIO lines, by actively writing the line
+ *	with 0.
+ * @dirout: MMIO address for the register to set the line as OUTPUT. It is assumed
+ *	that setting a line to 1 in this register will turn that line into an
+ *	output line. Conversely, setting the line to 0 will turn that line into
+ *	an input. Either this or @dirin can be defined, but never both.
+ * @dirin: MMIO address for the register to set this line as INPUT. It is assumed
+ *	that setting a line to 1 in this register will turn that line into an
+ *	input line. Conversely, setting the line to 0 will turn that line into
+ *	an output. Either this or @dirout can be defined, but never both.
+ * @flags: Different flags that will affect the behaviour of the device, such as
+ *	endianness etc.
+ */
 int bgpio_init(struct gpio_chip *gc, struct device *dev,
 	       unsigned long sz, void __iomem *dat, void __iomem *set,
 	       void __iomem *clr, void __iomem *dirout, void __iomem *dirin,
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 5382b5183b7e..0ea328e71ec9 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -201,6 +201,8 @@ static inline struct gpio_irq_chip *to_gpio_irq_chip(struct irq_chip *chip)
  * @reg_set: output set register (out=high) for generic GPIO
  * @reg_clr: output clear register (out=low) for generic GPIO
  * @reg_dir: direction setting register for generic GPIO
+ * @bgpio_dir_inverted: indicates that the direction register is inverted
+ *	(gpiolib private state variable)
  * @bgpio_bits: number of register bits used for a generic GPIO i.e.
  *	<register width> * 8
  * @bgpio_lock: used to lock chip->bgpio_data. Also, this is needed to keep
@@ -267,6 +269,7 @@ struct gpio_chip {
 	void __iomem *reg_set;
 	void __iomem *reg_clr;
 	void __iomem *reg_dir;
+	bool bgpio_dir_inverted;
 	int bgpio_bits;
 	spinlock_t bgpio_lock;
 	unsigned long bgpio_data;
-- 
cgit v1.2.3


From 5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 8 Aug 2018 01:01:24 -0700
Subject: bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY

This patch introduces a new map type BPF_MAP_TYPE_REUSEPORT_SOCKARRAY.

To unleash the full potential of a bpf prog, it is essential for the
userspace to be capable of directly setting up a bpf map which can then
be consumed by the bpf prog to make decision.  In this case, decide which
SO_REUSEPORT sk to serve the incoming request.

By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total control
and visibility on where a SO_REUSEPORT sk should be located in a bpf map.
The later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that
the bpf prog can directly select a sk from the bpf map.  That will
raise the programmability of the bpf prog attached to a reuseport
group (a group of sk serving the same IP:PORT).

For example, in UDP, the bpf prog can peek into the payload (e.g.
through the "data" pointer introduced in the later patch) to learn
the application level's connection information and then decide which sk
to pick from a bpf map.  The userspace can tightly couple the sk's location
in a bpf map with the application logic in generating the UDP payload's
connection information.  This connection info contact/API stays within the
userspace.

Also, when used with map-in-map, the userspace can switch the
old-server-process's inner map to a new-server-process's inner map
in one call "bpf_map_update_elem(outer_map, &index, &new_reuseport_array)".
The bpf prog will then direct incoming requests to the new process instead
of the old process.  The old process can finish draining the pending
requests (e.g. by "accept()") before closing the old-fds.  [Note that
deleting a fd from a bpf map does not necessary mean the fd is closed]

During map_update_elem(),
Only SO_REUSEPORT sk (i.e. which has already been added
to a reuse->socks[]) can be used.  That means a SO_REUSEPORT sk that is
"bind()" for UDP or "bind()+listen()" for TCP.  These conditions are
ensured in "reuseport_array_update_check()".

A SO_REUSEPORT sk can only be added once to a map (i.e. the
same sk cannot be added twice even to the same map).  SO_REUSEPORT
already allows another sk to be created for the same IP:PORT.
There is no need to re-create a similar usage in the BPF side.

When a SO_REUSEPORT is deleted from the "reuse->socks[]" (e.g. "close()"),
it will notify the bpf map to remove it from the map also.  It is
done through "bpf_sk_reuseport_detach()" and it will only be called
if >=1 of the "reuse->sock[]" has ever been added to a bpf map.

The map_update()/map_delete() has to be in-sync with the
"reuse->socks[]".  Hence, the same "reuseport_lock" used
by "reuse->socks[]" has to be used here also. Care has
been taken to ensure the lock is only acquired when the
adding sk passes some strict tests. and
freeing the map does not require the reuseport_lock.

The reuseport_array will also support lookup from the syscall
side.  It will return a sock_gen_cookie().  The sock_gen_cookie()
is on-demand (i.e. a sk's cookie is not generated until the very
first map_lookup_elem()).

The lookup cookie is 64bits but it goes against the logical userspace
expectation on 32bits sizeof(fd) (and as other fd based bpf maps do also).
It may catch user in surprise if we enforce value_size=8 while
userspace still pass a 32bits fd during update.  Supporting different
value_size between lookup and update seems unintuitive also.

We also need to consider what if other existing fd based maps want
to return 64bits value from syscall's lookup in the future.
Hence, reuseport_array supports both value_size 4 and 8, and
assuming user will usually use value_size=4.  The syscall's lookup
will return ENOSPC on value_size=4.  It will will only
return 64bits value from sock_gen_cookie() when user consciously
choose value_size=8 (as a signal that lookup is desired) which then
requires a 64bits value in both lookup and update.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h          |  28 ++++
 include/linux/bpf_types.h    |   3 +
 include/uapi/linux/bpf.h     |   1 +
 kernel/bpf/Makefile          |   3 +
 kernel/bpf/arraymap.c        |   2 +-
 kernel/bpf/reuseport_array.c | 363 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c         |   6 +
 net/core/sock_reuseport.c    |   8 +
 8 files changed, 413 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/reuseport_array.c

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd8790d2c6ed..db11662faea6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -524,6 +524,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 }
 
 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
+int array_map_alloc_check(union bpf_attr *attr);
 
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
@@ -769,6 +770,33 @@ static inline void __xsk_map_flush(struct bpf_map *map)
 }
 #endif
 
+#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
+void bpf_sk_reuseport_detach(struct sock *sk);
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+				       void *value);
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags);
+#else
+static inline void bpf_sk_reuseport_detach(struct sock *sk)
+{
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
+						     void *key, void *value)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map,
+						     void *key, void *value,
+						     u64 map_flags)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BPF_SYSCALL */
+#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */
+
 /* verifier prototypes for helper functions called from eBPF programs */
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index add08be53b6f..14fd6c02d258 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -60,4 +60,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #if defined(CONFIG_XDP_SOCKETS)
 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
 #endif
+#ifdef CONFIG_INET
+BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
+#endif
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dd5758dc35d3..40f584bc7da0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -126,6 +126,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_XSKMAP,
 	BPF_MAP_TYPE_SOCKHASH,
 	BPF_MAP_TYPE_CGROUP_STORAGE,
+	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e8906cbad81f..0488b8258321 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -23,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
 obj-$(CONFIG_CGROUP_BPF) += cgroup.o
+ifeq ($(CONFIG_INET),y)
+obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
+endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2aa55d030c77..f6ca3e712831 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 }
 
 /* Called from syscall */
-static int array_map_alloc_check(union bpf_attr *attr)
+int array_map_alloc_check(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
new file mode 100644
index 000000000000..18e225de80ff
--- /dev/null
+++ b/kernel/bpf/reuseport_array.c
@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Facebook
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/sock_diag.h>
+#include <net/sock_reuseport.h>
+
+struct reuseport_array {
+	struct bpf_map map;
+	struct sock __rcu *ptrs[];
+};
+
+static struct reuseport_array *reuseport_array(struct bpf_map *map)
+{
+	return (struct reuseport_array *)map;
+}
+
+/* The caller must hold the reuseport_lock */
+void bpf_sk_reuseport_detach(struct sock *sk)
+{
+	struct sock __rcu **socks;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	socks = sk->sk_user_data;
+	if (socks) {
+		WRITE_ONCE(sk->sk_user_data, NULL);
+		/*
+		 * Do not move this NULL assignment outside of
+		 * sk->sk_callback_lock because there is
+		 * a race with reuseport_array_free()
+		 * which does not hold the reuseport_lock.
+		 */
+		RCU_INIT_POINTER(*socks, NULL);
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int reuseport_array_alloc_check(union bpf_attr *attr)
+{
+	if (attr->value_size != sizeof(u32) &&
+	    attr->value_size != sizeof(u64))
+		return -EINVAL;
+
+	return array_map_alloc_check(attr);
+}
+
+static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	u32 index = *(u32 *)key;
+
+	if (unlikely(index >= array->map.max_entries))
+		return NULL;
+
+	return rcu_dereference(array->ptrs[index]);
+}
+
+/* Called from syscall only */
+static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	u32 index = *(u32 *)key;
+	struct sock *sk;
+	int err;
+
+	if (index >= map->max_entries)
+		return -E2BIG;
+
+	if (!rcu_access_pointer(array->ptrs[index]))
+		return -ENOENT;
+
+	spin_lock_bh(&reuseport_lock);
+
+	sk = rcu_dereference_protected(array->ptrs[index],
+				       lockdep_is_held(&reuseport_lock));
+	if (sk) {
+		write_lock_bh(&sk->sk_callback_lock);
+		WRITE_ONCE(sk->sk_user_data, NULL);
+		RCU_INIT_POINTER(array->ptrs[index], NULL);
+		write_unlock_bh(&sk->sk_callback_lock);
+		err = 0;
+	} else {
+		err = -ENOENT;
+	}
+
+	spin_unlock_bh(&reuseport_lock);
+
+	return err;
+}
+
+static void reuseport_array_free(struct bpf_map *map)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	struct sock *sk;
+	u32 i;
+
+	synchronize_rcu();
+
+	/*
+	 * ops->map_*_elem() will not be able to access this
+	 * array now. Hence, this function only races with
+	 * bpf_sk_reuseport_detach() which was triggerred by
+	 * close() or disconnect().
+	 *
+	 * This function and bpf_sk_reuseport_detach() are
+	 * both removing sk from "array".  Who removes it
+	 * first does not matter.
+	 *
+	 * The only concern here is bpf_sk_reuseport_detach()
+	 * may access "array" which is being freed here.
+	 * bpf_sk_reuseport_detach() access this "array"
+	 * through sk->sk_user_data _and_ with sk->sk_callback_lock
+	 * held which is enough because this "array" is not freed
+	 * until all sk->sk_user_data has stopped referencing this "array".
+	 *
+	 * Hence, due to the above, taking "reuseport_lock" is not
+	 * needed here.
+	 */
+
+	/*
+	 * Since reuseport_lock is not taken, sk is accessed under
+	 * rcu_read_lock()
+	 */
+	rcu_read_lock();
+	for (i = 0; i < map->max_entries; i++) {
+		sk = rcu_dereference(array->ptrs[i]);
+		if (sk) {
+			write_lock_bh(&sk->sk_callback_lock);
+			/*
+			 * No need for WRITE_ONCE(). At this point,
+			 * no one is reading it without taking the
+			 * sk->sk_callback_lock.
+			 */
+			sk->sk_user_data = NULL;
+			write_unlock_bh(&sk->sk_callback_lock);
+			RCU_INIT_POINTER(array->ptrs[i], NULL);
+		}
+	}
+	rcu_read_unlock();
+
+	/*
+	 * Once reaching here, all sk->sk_user_data is not
+	 * referenceing this "array".  "array" can be freed now.
+	 */
+	bpf_map_area_free(array);
+}
+
+static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
+{
+	int err, numa_node = bpf_map_attr_numa_node(attr);
+	struct reuseport_array *array;
+	u64 cost, array_size;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	array_size = sizeof(*array);
+	array_size += (u64)attr->max_entries * sizeof(struct sock *);
+
+	/* make sure there is no u32 overflow later in round_up() */
+	cost = array_size;
+	if (cost >= U32_MAX - PAGE_SIZE)
+		return ERR_PTR(-ENOMEM);
+	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	err = bpf_map_precharge_memlock(cost);
+	if (err)
+		return ERR_PTR(err);
+
+	/* allocate all map elements and zero-initialize them */
+	array = bpf_map_area_alloc(array_size, numa_node);
+	if (!array)
+		return ERR_PTR(-ENOMEM);
+
+	/* copy mandatory map attributes */
+	bpf_map_init_from_attr(&array->map, attr);
+	array->map.pages = cost;
+
+	return &array->map;
+}
+
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+				       void *value)
+{
+	struct sock *sk;
+	int err;
+
+	if (map->value_size != sizeof(u64))
+		return -ENOSPC;
+
+	rcu_read_lock();
+	sk = reuseport_array_lookup_elem(map, key);
+	if (sk) {
+		*(u64 *)value = sock_gen_cookie(sk);
+		err = 0;
+	} else {
+		err = -ENOENT;
+	}
+	rcu_read_unlock();
+
+	return err;
+}
+
+static int
+reuseport_array_update_check(const struct reuseport_array *array,
+			     const struct sock *nsk,
+			     const struct sock *osk,
+			     const struct sock_reuseport *nsk_reuse,
+			     u32 map_flags)
+{
+	if (osk && map_flags == BPF_NOEXIST)
+		return -EEXIST;
+
+	if (!osk && map_flags == BPF_EXIST)
+		return -ENOENT;
+
+	if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP)
+		return -ENOTSUPP;
+
+	if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6)
+		return -ENOTSUPP;
+
+	if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM)
+		return -ENOTSUPP;
+
+	/*
+	 * sk must be hashed (i.e. listening in the TCP case or binded
+	 * in the UDP case) and
+	 * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL).
+	 *
+	 * Also, sk will be used in bpf helper that is protected by
+	 * rcu_read_lock().
+	 */
+	if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse)
+		return -EINVAL;
+
+	/* READ_ONCE because the sk->sk_callback_lock may not be held here */
+	if (READ_ONCE(nsk->sk_user_data))
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * Called from syscall only.
+ * The "nsk" in the fd refcnt.
+ * The "osk" and "reuse" are protected by reuseport_lock.
+ */
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	struct sock *free_osk = NULL, *osk, *nsk;
+	struct sock_reuseport *reuse;
+	u32 index = *(u32 *)key;
+	struct socket *socket;
+	int err, fd;
+
+	if (map_flags > BPF_EXIST)
+		return -EINVAL;
+
+	if (index >= map->max_entries)
+		return -E2BIG;
+
+	if (map->value_size == sizeof(u64)) {
+		u64 fd64 = *(u64 *)value;
+
+		if (fd64 > S32_MAX)
+			return -EINVAL;
+		fd = fd64;
+	} else {
+		fd = *(int *)value;
+	}
+
+	socket = sockfd_lookup(fd, &err);
+	if (!socket)
+		return err;
+
+	nsk = socket->sk;
+	if (!nsk) {
+		err = -EINVAL;
+		goto put_file;
+	}
+
+	/* Quick checks before taking reuseport_lock */
+	err = reuseport_array_update_check(array, nsk,
+					   rcu_access_pointer(array->ptrs[index]),
+					   rcu_access_pointer(nsk->sk_reuseport_cb),
+					   map_flags);
+	if (err)
+		goto put_file;
+
+	spin_lock_bh(&reuseport_lock);
+	/*
+	 * Some of the checks only need reuseport_lock
+	 * but it is done under sk_callback_lock also
+	 * for simplicity reason.
+	 */
+	write_lock_bh(&nsk->sk_callback_lock);
+
+	osk = rcu_dereference_protected(array->ptrs[index],
+					lockdep_is_held(&reuseport_lock));
+	reuse = rcu_dereference_protected(nsk->sk_reuseport_cb,
+					  lockdep_is_held(&reuseport_lock));
+	err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags);
+	if (err)
+		goto put_file_unlock;
+
+	/* Ensure reuse->reuseport_id is set */
+	err = reuseport_get_id(reuse);
+	if (err < 0)
+		goto put_file_unlock;
+
+	WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]);
+	rcu_assign_pointer(array->ptrs[index], nsk);
+	free_osk = osk;
+	err = 0;
+
+put_file_unlock:
+	write_unlock_bh(&nsk->sk_callback_lock);
+
+	if (free_osk) {
+		write_lock_bh(&free_osk->sk_callback_lock);
+		WRITE_ONCE(free_osk->sk_user_data, NULL);
+		write_unlock_bh(&free_osk->sk_callback_lock);
+	}
+
+	spin_unlock_bh(&reuseport_lock);
+put_file:
+	fput(socket->file);
+	return err;
+}
+
+/* Called from syscall */
+static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
+					void *next_key)
+{
+	struct reuseport_array *array = reuseport_array(map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = (u32 *)next_key;
+
+	if (index >= array->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == array->map.max_entries - 1)
+		return -ENOENT;
+
+	*next = index + 1;
+	return 0;
+}
+
+const struct bpf_map_ops reuseport_array_ops = {
+	.map_alloc_check = reuseport_array_alloc_check,
+	.map_alloc = reuseport_array_alloc,
+	.map_free = reuseport_array_free,
+	.map_lookup_elem = reuseport_array_lookup_elem,
+	.map_get_next_key = reuseport_array_get_next_key,
+	.map_delete_elem = reuseport_array_delete_elem,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5af4e9e2722d..57f4d076141b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -684,6 +684,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_fd_array_map_lookup_elem(map, key, value);
 	} else if (IS_FD_HASH(map)) {
 		err = bpf_fd_htab_map_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
 	} else {
 		rcu_read_lock();
 		ptr = map->ops->map_lookup_elem(map, key);
@@ -790,6 +792,10 @@ static int map_update_elem(union bpf_attr *attr)
 		err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
 						  attr->flags);
 		rcu_read_unlock();
+	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+		/* rcu_read_lock() is not needed */
+		err = bpf_fd_reuseport_array_update_elem(map, key, value,
+							 attr->flags);
 	} else {
 		rcu_read_lock();
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index cf2e4d305af9..8235f2439816 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -186,6 +186,14 @@ void reuseport_detach_sock(struct sock *sk)
 	spin_lock_bh(&reuseport_lock);
 	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
 					  lockdep_is_held(&reuseport_lock));
+
+	/* At least one of the sk in this reuseport group is added to
+	 * a bpf map.  Notify the bpf side.  The bpf map logic will
+	 * remove the sk if it is indeed added to a bpf map.
+	 */
+	if (reuse->reuseport_id)
+		bpf_sk_reuseport_detach(sk);
+
 	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
 
 	for (i = 0; i < reuse->num_socks; i++) {
-- 
cgit v1.2.3


From 2dbb9b9e6df67d444fbe425c7f6014858d337adf Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 8 Aug 2018 01:01:25 -0700
Subject: bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT

This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select
a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY.  Like other
non SK_FILTER/CGROUP_SKB program, it requires CAP_SYS_ADMIN.

BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern"
to store the bpf context instead of using the skb->cb[48].

At the SO_REUSEPORT sk lookup time, it is in the middle of transiting
from a lower layer (ipv4/ipv6) to a upper layer (udp/tcp).  At this
point,  it is not always clear where the bpf context can be appended
in the skb->cb[48] to avoid saving-and-restoring cb[].  Even putting
aside the difference between ipv4-vs-ipv6 and udp-vs-tcp.  It is not
clear if the lower layer is only ipv4 and ipv6 in the future and
will it not touch the cb[] again before transiting to the upper
layer.

For example, in udp_gro_receive(), it uses the 48 byte NAPI_GRO_CB
instead of IP[6]CB and it may still modify the cb[] after calling
the udp[46]_lib_lookup_skb().  Because of the above reason, if
sk->cb is used for the bpf ctx, saving-and-restoring is needed
and likely the whole 48 bytes cb[] has to be saved and restored.

Instead of saving, setting and restoring the cb[], this patch opts
to create a new "struct sk_reuseport_kern" and setting the needed
values in there.

The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)"
will serve all ipv4/ipv6 + udp/tcp combinations.  There is no protocol
specific usage at this point and it is also inline with the current
sock_reuseport.c implementation (i.e. no protocol specific requirement).

In "struct sk_reuseport_md", this patch exposes data/data_end/len
with semantic similar to other existing usages.  Together
with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()",
the bpf prog can peek anywhere in the skb.  The "bind_inany" tells
the bpf prog that the reuseport group is bind-ed to a local
INANY address which cannot be learned from skb.

The new "bind_inany" is added to "struct sock_reuseport" which will be
used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order
to avoid repeating the "bind INANY" test on
"sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run.  It can
only be properly initialized when a "sk->sk_reuseport" enabled sk is
adding to a hashtable (i.e. during "reuseport_alloc()" and
"reuseport_add_sock()").

The new "sk_select_reuseport()" is the main helper that the
bpf prog will use to select a SO_REUSEPORT sk.  It is the only function
that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY.  As mentioned in
the earlier patch, the validity of a selected sk is checked in
run time in "sk_select_reuseport()".  Doing the check in
verification time is difficult and inflexible (consider the map-in-map
use case).  The runtime check is to compare the selected sk's reuseport_id
with the reuseport_id that we want.  This helper will return -EXXX if the
selected sk cannot serve the incoming request (e.g. reuseport_id
not match).  The bpf prog can decide if it wants to do SK_DROP as its
discretion.

When the bpf prog returns SK_PASS, the kernel will check if a
valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL").
If it does , it will use the selected sk.  If not, the kernel
will select one from "reuse->socks[]" (as before this patch).

The SK_DROP and SK_PASS handling logic will be in the next patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_types.h       |   3 +
 include/linux/filter.h          |  15 +++
 include/net/addrconf.h          |   1 +
 include/net/sock_reuseport.h    |   6 +-
 include/uapi/linux/bpf.h        |  36 +++++-
 kernel/bpf/verifier.c           |   9 ++
 net/core/filter.c               | 269 +++++++++++++++++++++++++++++++++++++++-
 net/core/sock_reuseport.c       |  20 ++-
 net/ipv4/inet_connection_sock.c |   9 ++
 net/ipv4/inet_hashtables.c      |   5 +-
 net/ipv4/udp.c                  |   5 +-
 11 files changed, 365 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 14fd6c02d258..cd26c090e7c0 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #ifdef CONFIG_BPF_LIRC_MODE2
 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #endif
+#ifdef CONFIG_INET
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2b072dab32c0..70e9d57677fe 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -32,6 +32,7 @@ struct seccomp_data;
 struct bpf_prog_aux;
 struct xdp_rxq_info;
 struct xdp_buff;
+struct sock_reuseport;
 
 /* ArgX, context and stack frame pointer register positions. Note,
  * Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -833,6 +834,20 @@ void bpf_warn_invalid_xdp_action(u32 act);
 struct sock *do_sk_redirect_map(struct sk_buff *skb);
 struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
 
+#ifdef CONFIG_INET
+struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+				  struct bpf_prog *prog, struct sk_buff *skb,
+				  u32 hash);
+#else
+static inline struct sock *
+bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+		     struct bpf_prog *prog, struct sk_buff *skb,
+		     u32 hash)
+{
+	return NULL;
+}
+#endif
+
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
 extern int bpf_jit_harden;
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 5f43f7a70fe6..6def0351bcc3 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -108,6 +108,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
 		    u32 banned_flags);
 bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 			  bool match_wildcard);
+bool inet_rcv_saddr_any(const struct sock *sk);
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
 
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index e1a7681856f7..73b569556be6 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -21,12 +21,14 @@ struct sock_reuseport {
 	unsigned int		synq_overflow_ts;
 	/* ID stays the same even after the size of socks[] grows. */
 	unsigned int		reuseport_id;
+	bool			bind_inany;
 	struct bpf_prog __rcu	*prog;		/* optional BPF sock selector */
 	struct sock		*socks[0];	/* array of sock pointers */
 };
 
-extern int reuseport_alloc(struct sock *sk);
-extern int reuseport_add_sock(struct sock *sk, struct sock *sk2);
+extern int reuseport_alloc(struct sock *sk, bool bind_inany);
+extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
+			      bool bind_inany);
 extern void reuseport_detach_sock(struct sock *sk);
 extern struct sock *reuseport_select_sock(struct sock *sk,
 					  u32 hash,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 40f584bc7da0..3102a2a23c31 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -151,6 +151,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 	BPF_PROG_TYPE_LIRC_MODE2,
+	BPF_PROG_TYPE_SK_REUSEPORT,
 };
 
 enum bpf_attach_type {
@@ -2114,6 +2115,14 @@ union bpf_attr {
  *		the shared data.
  *	Return
  *		Pointer to the local storage area.
+ *
+ * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		Select a SO_REUSEPORT sk from a	BPF_MAP_TYPE_REUSEPORT_ARRAY map
+ *		It checks the selected sk is matching the incoming
+ *		request in the skb.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2197,7 +2206,8 @@ union bpf_attr {
 	FN(rc_keydown),			\
 	FN(skb_cgroup_id),		\
 	FN(get_current_cgroup_id),	\
-	FN(get_local_storage),
+	FN(get_local_storage),		\
+	FN(sk_select_reuseport),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2414,6 +2424,30 @@ struct sk_msg_md {
 	__u32 local_port;	/* stored in host byte order */
 };
 
+struct sk_reuseport_md {
+	/*
+	 * Start of directly accessible data. It begins from
+	 * the tcp/udp header.
+	 */
+	void *data;
+	void *data_end;		/* End of directly accessible data */
+	/*
+	 * Total length of packet (starting from the tcp/udp header).
+	 * Note that the directly accessible bytes (data_end - data)
+	 * could be less than this "len".  Those bytes could be
+	 * indirectly read by a helper "bpf_skb_load_bytes()".
+	 */
+	__u32 len;
+	/*
+	 * Eth protocol in the mac header (network byte order). e.g.
+	 * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
+	 */
+	__u32 eth_protocol;
+	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
+	__u32 bind_inany;	/* Is sock bound to an INANY address? */
+	__u32 hash;		/* A hash of the packet 4 tuples */
+};
+
 #define BPF_TAG_SIZE	8
 
 struct bpf_prog_info {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 587468a9c37d..ca90679a7fe5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	case BPF_PROG_TYPE_LWT_IN:
 	case BPF_PROG_TYPE_LWT_OUT:
 	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+	case BPF_PROG_TYPE_SK_REUSEPORT:
 		/* dst_input() and dst_output() can't write for now */
 		if (t == BPF_WRITE)
 			return false;
@@ -2166,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_msg_redirect_hash)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
+		if (func_id != BPF_FUNC_sk_select_reuseport)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2217,6 +2222,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
 			goto error;
 		break;
+	case BPF_FUNC_sk_select_reuseport:
+		if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
+			goto error;
+		break;
 	default:
 		break;
 	}
diff --git a/net/core/filter.c b/net/core/filter.c
index 2de7dd9f2a57..142595b4e0d1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1462,7 +1462,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
 		return -ENOMEM;
 
 	if (sk_unhashed(sk) && sk->sk_reuseport) {
-		err = reuseport_alloc(sk);
+		err = reuseport_alloc(sk, false);
 		if (err)
 			return err;
 	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
@@ -7013,3 +7013,270 @@ out:
 	release_sock(sk);
 	return ret;
 }
+
+#ifdef CONFIG_INET
+struct sk_reuseport_kern {
+	struct sk_buff *skb;
+	struct sock *sk;
+	struct sock *selected_sk;
+	void *data_end;
+	u32 hash;
+	u32 reuseport_id;
+	bool bind_inany;
+};
+
+static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
+				    struct sock_reuseport *reuse,
+				    struct sock *sk, struct sk_buff *skb,
+				    u32 hash)
+{
+	reuse_kern->skb = skb;
+	reuse_kern->sk = sk;
+	reuse_kern->selected_sk = NULL;
+	reuse_kern->data_end = skb->data + skb_headlen(skb);
+	reuse_kern->hash = hash;
+	reuse_kern->reuseport_id = reuse->reuseport_id;
+	reuse_kern->bind_inany = reuse->bind_inany;
+}
+
+struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+				  struct bpf_prog *prog, struct sk_buff *skb,
+				  u32 hash)
+{
+	struct sk_reuseport_kern reuse_kern;
+	enum sk_action action;
+
+	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
+	action = BPF_PROG_RUN(prog, &reuse_kern);
+
+	if (action == SK_PASS)
+		return reuse_kern.selected_sk;
+	else
+		return ERR_PTR(-ECONNREFUSED);
+}
+
+BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
+	   struct bpf_map *, map, void *, key, u32, flags)
+{
+	struct sock_reuseport *reuse;
+	struct sock *selected_sk;
+
+	selected_sk = map->ops->map_lookup_elem(map, key);
+	if (!selected_sk)
+		return -ENOENT;
+
+	reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
+	if (!reuse)
+		/* selected_sk is unhashed (e.g. by close()) after the
+		 * above map_lookup_elem().  Treat selected_sk has already
+		 * been removed from the map.
+		 */
+		return -ENOENT;
+
+	if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
+		struct sock *sk;
+
+		if (unlikely(!reuse_kern->reuseport_id))
+			/* There is a small race between adding the
+			 * sk to the map and setting the
+			 * reuse_kern->reuseport_id.
+			 * Treat it as the sk has not been added to
+			 * the bpf map yet.
+			 */
+			return -ENOENT;
+
+		sk = reuse_kern->sk;
+		if (sk->sk_protocol != selected_sk->sk_protocol)
+			return -EPROTOTYPE;
+		else if (sk->sk_family != selected_sk->sk_family)
+			return -EAFNOSUPPORT;
+
+		/* Catch all. Likely bound to a different sockaddr. */
+		return -EBADFD;
+	}
+
+	reuse_kern->selected_sk = selected_sk;
+
+	return 0;
+}
+
+static const struct bpf_func_proto sk_select_reuseport_proto = {
+	.func           = sk_select_reuseport,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_PTR_TO_MAP_KEY,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_4(sk_reuseport_load_bytes,
+	   const struct sk_reuseport_kern *, reuse_kern, u32, offset,
+	   void *, to, u32, len)
+{
+	return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len);
+}
+
+static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
+	.func		= sk_reuseport_load_bytes,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(sk_reuseport_load_bytes_relative,
+	   const struct sk_reuseport_kern *, reuse_kern, u32, offset,
+	   void *, to, u32, len, u32, start_header)
+{
+	return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to,
+					       len, start_header);
+}
+
+static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
+	.func		= sk_reuseport_load_bytes_relative,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+sk_reuseport_func_proto(enum bpf_func_id func_id,
+			const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_sk_select_reuseport:
+		return &sk_select_reuseport_proto;
+	case BPF_FUNC_skb_load_bytes:
+		return &sk_reuseport_load_bytes_proto;
+	case BPF_FUNC_skb_load_bytes_relative:
+		return &sk_reuseport_load_bytes_relative_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+static bool
+sk_reuseport_is_valid_access(int off, int size,
+			     enum bpf_access_type type,
+			     const struct bpf_prog *prog,
+			     struct bpf_insn_access_aux *info)
+{
+	const u32 size_default = sizeof(__u32);
+
+	if (off < 0 || off >= sizeof(struct sk_reuseport_md) ||
+	    off % size || type != BPF_READ)
+		return false;
+
+	switch (off) {
+	case offsetof(struct sk_reuseport_md, data):
+		info->reg_type = PTR_TO_PACKET;
+		return size == sizeof(__u64);
+
+	case offsetof(struct sk_reuseport_md, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		return size == sizeof(__u64);
+
+	case offsetof(struct sk_reuseport_md, hash):
+		return size == size_default;
+
+	/* Fields that allow narrowing */
+	case offsetof(struct sk_reuseport_md, eth_protocol):
+		if (size < FIELD_SIZEOF(struct sk_buff, protocol))
+			return false;
+	case offsetof(struct sk_reuseport_md, ip_protocol):
+	case offsetof(struct sk_reuseport_md, bind_inany):
+	case offsetof(struct sk_reuseport_md, len):
+		bpf_ctx_record_field_size(info, size_default);
+		return bpf_ctx_narrow_access_ok(off, size, size_default);
+
+	default:
+		return false;
+	}
+}
+
+#define SK_REUSEPORT_LOAD_FIELD(F) ({					\
+	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+			      si->dst_reg, si->src_reg,			\
+			      bpf_target_off(struct sk_reuseport_kern, F, \
+					     FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+					     target_size));		\
+	})
+
+#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD)				\
+	SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,		\
+				    struct sk_buff,			\
+				    skb,				\
+				    SKB_FIELD)
+
+#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \
+	SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern,	\
+					     struct sock,		\
+					     sk,			\
+					     SK_FIELD, BPF_SIZE, EXTRA_OFF)
+
+static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
+					   const struct bpf_insn *si,
+					   struct bpf_insn *insn_buf,
+					   struct bpf_prog *prog,
+					   u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct sk_reuseport_md, data):
+		SK_REUSEPORT_LOAD_SKB_FIELD(data);
+		break;
+
+	case offsetof(struct sk_reuseport_md, len):
+		SK_REUSEPORT_LOAD_SKB_FIELD(len);
+		break;
+
+	case offsetof(struct sk_reuseport_md, eth_protocol):
+		SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
+		break;
+
+	case offsetof(struct sk_reuseport_md, ip_protocol):
+		BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
+		SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset,
+						    BPF_W, 0);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
+					SK_FL_PROTO_SHIFT);
+		/* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian
+		 * aware.  No further narrowing or masking is needed.
+		 */
+		*target_size = 1;
+		break;
+
+	case offsetof(struct sk_reuseport_md, data_end):
+		SK_REUSEPORT_LOAD_FIELD(data_end);
+		break;
+
+	case offsetof(struct sk_reuseport_md, hash):
+		SK_REUSEPORT_LOAD_FIELD(hash);
+		break;
+
+	case offsetof(struct sk_reuseport_md, bind_inany):
+		SK_REUSEPORT_LOAD_FIELD(bind_inany);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
+	.get_func_proto		= sk_reuseport_func_proto,
+	.is_valid_access	= sk_reuseport_is_valid_access,
+	.convert_ctx_access	= sk_reuseport_convert_ctx_access,
+};
+
+const struct bpf_prog_ops sk_reuseport_prog_ops = {
+};
+#endif /* CONFIG_INET */
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 8235f2439816..d260167f5f77 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -51,7 +51,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
 	return reuse;
 }
 
-int reuseport_alloc(struct sock *sk)
+int reuseport_alloc(struct sock *sk, bool bind_inany)
 {
 	struct sock_reuseport *reuse;
 
@@ -63,9 +63,17 @@ int reuseport_alloc(struct sock *sk)
 	/* Allocation attempts can occur concurrently via the setsockopt path
 	 * and the bind/hash path.  Nothing to do when we lose the race.
 	 */
-	if (rcu_dereference_protected(sk->sk_reuseport_cb,
-				      lockdep_is_held(&reuseport_lock)))
+	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+					  lockdep_is_held(&reuseport_lock));
+	if (reuse) {
+		/* Only set reuse->bind_inany if the bind_inany is true.
+		 * Otherwise, it will overwrite the reuse->bind_inany
+		 * which was set by the bind/hash path.
+		 */
+		if (bind_inany)
+			reuse->bind_inany = bind_inany;
 		goto out;
+	}
 
 	reuse = __reuseport_alloc(INIT_SOCKS);
 	if (!reuse) {
@@ -75,6 +83,7 @@ int reuseport_alloc(struct sock *sk)
 
 	reuse->socks[0] = sk;
 	reuse->num_socks = 1;
+	reuse->bind_inany = bind_inany;
 	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
 
 out:
@@ -101,6 +110,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
 	more_reuse->num_socks = reuse->num_socks;
 	more_reuse->prog = reuse->prog;
 	more_reuse->reuseport_id = reuse->reuseport_id;
+	more_reuse->bind_inany = reuse->bind_inany;
 
 	memcpy(more_reuse->socks, reuse->socks,
 	       reuse->num_socks * sizeof(struct sock *));
@@ -136,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head)
  *  @sk2: Socket belonging to the existing reuseport group.
  *  May return ENOMEM and not add socket to group under memory pressure.
  */
-int reuseport_add_sock(struct sock *sk, struct sock *sk2)
+int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
 {
 	struct sock_reuseport *old_reuse, *reuse;
 
 	if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
-		int err = reuseport_alloc(sk2);
+		int err = reuseport_alloc(sk2, bind_inany);
 
 		if (err)
 			return err;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 33a88e045efd..dfd5009f96ef 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 }
 EXPORT_SYMBOL(inet_rcv_saddr_equal);
 
+bool inet_rcv_saddr_any(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+#endif
+	return !sk->sk_rcv_saddr;
+}
+
 void inet_get_local_port_range(struct net *net, int *low, int *high)
 {
 	unsigned int seq;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 3647167c8fa3..370e24463fb7 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -567,10 +567,11 @@ static int inet_reuseport_add_sock(struct sock *sk,
 		    inet_csk(sk2)->icsk_bind_hash == tb &&
 		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
 		    inet_rcv_saddr_equal(sk, sk2, false))
-			return reuseport_add_sock(sk, sk2);
+			return reuseport_add_sock(sk, sk2,
+						  inet_rcv_saddr_any(sk));
 	}
 
-	return reuseport_alloc(sk);
+	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
 }
 
 int __inet_hash(struct sock *sk, struct sock *osk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 060e841dde40..038dd7909051 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
 		    (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
 		    inet_rcv_saddr_equal(sk, sk2, false)) {
-			return reuseport_add_sock(sk, sk2);
+			return reuseport_add_sock(sk, sk2,
+						  inet_rcv_saddr_any(sk));
 		}
 	}
 
-	return reuseport_alloc(sk);
+	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
 }
 
 /**
-- 
cgit v1.2.3


From 8217ca653ec601246832d562207bc24bdf652d2f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 8 Aug 2018 01:01:26 -0700
Subject: bpf: Enable BPF_PROG_TYPE_SK_REUSEPORT bpf prog in reuseport
 selection

This patch allows a BPF_PROG_TYPE_SK_REUSEPORT bpf prog to select a
SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY introduced in
the earlier patch.  "bpf_run_sk_reuseport()" will return -ECONNREFUSED
when the BPF_PROG_TYPE_SK_REUSEPORT prog returns SK_DROP.
The callers, in inet[6]_hashtable.c and ipv[46]/udp.c, are modified to
handle this case and return NULL immediately instead of continuing the
sk search from its hashtable.

It re-uses the existing SO_ATTACH_REUSEPORT_EBPF setsockopt to attach
BPF_PROG_TYPE_SK_REUSEPORT.  The "sk_reuseport_attach_bpf()" will check
if the attaching bpf prog is in the new SK_REUSEPORT or the existing
SOCKET_FILTER type and then check different things accordingly.

One level of "__reuseport_attach_prog()" call is removed.  The
"sk_unhashed() && ..." and "sk->sk_reuseport_cb" tests are pushed
back to "reuseport_attach_prog()" in sock_reuseport.c.  sock_reuseport.c
seems to have more knowledge on those test requirements than filter.c.
In "reuseport_attach_prog()", after new_prog is attached to reuse->prog,
the old_prog (if any) is also directly freed instead of returning the
old_prog to the caller and asking the caller to free.

The sysctl_optmem_max check is moved back to the
"sk_reuseport_attach_filter()" and "sk_reuseport_attach_bpf()".
As of other bpf prog types, the new BPF_PROG_TYPE_SK_REUSEPORT is only
bounded by the usual "bpf_prog_charge_memlock()" during load time
instead of bounded by both bpf_prog_charge_memlock and sysctl_optmem_max.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h       |  1 +
 include/net/sock_reuseport.h |  3 +-
 net/core/filter.c            | 87 ++++++++++++++++++++++++++------------------
 net/core/sock_reuseport.c    | 36 +++++++++++++-----
 net/ipv4/inet_hashtables.c   | 14 ++++---
 net/ipv4/udp.c               |  4 ++
 net/ipv6/inet6_hashtables.c  | 14 ++++---
 net/ipv6/udp.c               |  4 ++
 8 files changed, 106 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 70e9d57677fe..5d565c50bcb2 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -753,6 +753,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 int sk_attach_bpf(u32 ufd, struct sock *sk);
 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
+void sk_reuseport_prog_free(struct bpf_prog *prog);
 int sk_detach_filter(struct sock *sk);
 int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
 		  unsigned int len);
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 73b569556be6..8a5f70c7cdf2 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -34,8 +34,7 @@ extern struct sock *reuseport_select_sock(struct sock *sk,
 					  u32 hash,
 					  struct sk_buff *skb,
 					  int hdr_len);
-extern struct bpf_prog *reuseport_attach_prog(struct sock *sk,
-					      struct bpf_prog *prog);
+extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
 int reuseport_get_id(struct sock_reuseport *reuse);
 
 #endif  /* _SOCK_REUSEPORT_H */
diff --git a/net/core/filter.c b/net/core/filter.c
index 142595b4e0d1..22906b31d43f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1453,30 +1453,6 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
 	return 0;
 }
 
-static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
-{
-	struct bpf_prog *old_prog;
-	int err;
-
-	if (bpf_prog_size(prog->len) > sysctl_optmem_max)
-		return -ENOMEM;
-
-	if (sk_unhashed(sk) && sk->sk_reuseport) {
-		err = reuseport_alloc(sk, false);
-		if (err)
-			return err;
-	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
-		/* The socket wasn't bound with SO_REUSEPORT */
-		return -EINVAL;
-	}
-
-	old_prog = reuseport_attach_prog(sk, prog);
-	if (old_prog)
-		bpf_prog_destroy(old_prog);
-
-	return 0;
-}
-
 static
 struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
 {
@@ -1550,13 +1526,15 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	err = __reuseport_attach_prog(prog, sk);
-	if (err < 0) {
+	if (bpf_prog_size(prog->len) > sysctl_optmem_max)
+		err = -ENOMEM;
+	else
+		err = reuseport_attach_prog(sk, prog);
+
+	if (err)
 		__bpf_prog_release(prog);
-		return err;
-	}
 
-	return 0;
+	return err;
 }
 
 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
@@ -1586,19 +1564,58 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
 
 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
 {
-	struct bpf_prog *prog = __get_bpf(ufd, sk);
+	struct bpf_prog *prog;
 	int err;
 
+	if (sock_flag(sk, SOCK_FILTER_LOCKED))
+		return -EPERM;
+
+	prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
+	if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL)
+		prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	err = __reuseport_attach_prog(prog, sk);
-	if (err < 0) {
-		bpf_prog_put(prog);
-		return err;
+	if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
+		/* Like other non BPF_PROG_TYPE_SOCKET_FILTER
+		 * bpf prog (e.g. sockmap).  It depends on the
+		 * limitation imposed by bpf_prog_load().
+		 * Hence, sysctl_optmem_max is not checked.
+		 */
+		if ((sk->sk_type != SOCK_STREAM &&
+		     sk->sk_type != SOCK_DGRAM) ||
+		    (sk->sk_protocol != IPPROTO_UDP &&
+		     sk->sk_protocol != IPPROTO_TCP) ||
+		    (sk->sk_family != AF_INET &&
+		     sk->sk_family != AF_INET6)) {
+			err = -ENOTSUPP;
+			goto err_prog_put;
+		}
+	} else {
+		/* BPF_PROG_TYPE_SOCKET_FILTER */
+		if (bpf_prog_size(prog->len) > sysctl_optmem_max) {
+			err = -ENOMEM;
+			goto err_prog_put;
+		}
 	}
 
-	return 0;
+	err = reuseport_attach_prog(sk, prog);
+err_prog_put:
+	if (err)
+		bpf_prog_put(prog);
+
+	return err;
+}
+
+void sk_reuseport_prog_free(struct bpf_prog *prog)
+{
+	if (!prog)
+		return;
+
+	if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+		bpf_prog_put(prog);
+	else
+		bpf_prog_destroy(prog);
 }
 
 struct bpf_scratchpad {
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index d260167f5f77..ba5cba56f574 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -9,6 +9,7 @@
 #include <net/sock_reuseport.h>
 #include <linux/bpf.h>
 #include <linux/idr.h>
+#include <linux/filter.h>
 #include <linux/rcupdate.h>
 
 #define INIT_SOCKS 128
@@ -133,8 +134,7 @@ static void reuseport_free_rcu(struct rcu_head *head)
 	struct sock_reuseport *reuse;
 
 	reuse = container_of(head, struct sock_reuseport, rcu);
-	if (reuse->prog)
-		bpf_prog_destroy(reuse->prog);
+	sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
 	if (reuse->reuseport_id)
 		ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
 	kfree(reuse);
@@ -219,9 +219,9 @@ void reuseport_detach_sock(struct sock *sk)
 }
 EXPORT_SYMBOL(reuseport_detach_sock);
 
-static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
-			    struct bpf_prog *prog, struct sk_buff *skb,
-			    int hdr_len)
+static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
+				   struct bpf_prog *prog, struct sk_buff *skb,
+				   int hdr_len)
 {
 	struct sk_buff *nskb = NULL;
 	u32 index;
@@ -282,9 +282,15 @@ struct sock *reuseport_select_sock(struct sock *sk,
 		/* paired with smp_wmb() in reuseport_add_sock() */
 		smp_rmb();
 
-		if (prog && skb)
-			sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
+		if (!prog || !skb)
+			goto select_by_hash;
+
+		if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
+		else
+			sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
 
+select_by_hash:
 		/* no bpf or invalid bpf result: fall back to hash usage */
 		if (!sk2)
 			sk2 = reuse->socks[reciprocal_scale(hash, socks)];
@@ -296,12 +302,21 @@ out:
 }
 EXPORT_SYMBOL(reuseport_select_sock);
 
-struct bpf_prog *
-reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
+int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
 {
 	struct sock_reuseport *reuse;
 	struct bpf_prog *old_prog;
 
+	if (sk_unhashed(sk) && sk->sk_reuseport) {
+		int err = reuseport_alloc(sk, false);
+
+		if (err)
+			return err;
+	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
+		/* The socket wasn't bound with SO_REUSEPORT */
+		return -EINVAL;
+	}
+
 	spin_lock_bh(&reuseport_lock);
 	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
 					  lockdep_is_held(&reuseport_lock));
@@ -310,6 +325,7 @@ reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
 	rcu_assign_pointer(reuse->prog, prog);
 	spin_unlock_bh(&reuseport_lock);
 
-	return old_prog;
+	sk_reuseport_prog_free(old_prog);
+	return 0;
 }
 EXPORT_SYMBOL(reuseport_attach_prog);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 370e24463fb7..f5c9ef2586de 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -328,7 +328,7 @@ struct sock *__inet_lookup_listener(struct net *net,
 				    saddr, sport, daddr, hnum,
 				    dif, sdif);
 	if (result)
-		return result;
+		goto done;
 
 	/* Lookup lhash2 with INADDR_ANY */
 
@@ -337,9 +337,10 @@ struct sock *__inet_lookup_listener(struct net *net,
 	if (ilb2->count > ilb->count)
 		goto port_lookup;
 
-	return inet_lhash2_lookup(net, ilb2, skb, doff,
-				  saddr, sport, daddr, hnum,
-				  dif, sdif);
+	result = inet_lhash2_lookup(net, ilb2, skb, doff,
+				    saddr, sport, daddr, hnum,
+				    dif, sdif);
+	goto done;
 
 port_lookup:
 	sk_for_each_rcu(sk, &ilb->head) {
@@ -352,12 +353,15 @@ port_lookup:
 				result = reuseport_select_sock(sk, phash,
 							       skb, doff);
 				if (result)
-					return result;
+					goto done;
 			}
 			result = sk;
 			hiscore = score;
 		}
 	}
+done:
+	if (unlikely(IS_ERR(result)))
+		return NULL;
 	return result;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 038dd7909051..f4e35b2ff8b8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -499,6 +499,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 						  daddr, hnum, dif, sdif,
 						  exact_dif, hslot2, skb);
 		}
+		if (unlikely(IS_ERR(result)))
+			return NULL;
 		return result;
 	}
 begin:
@@ -513,6 +515,8 @@ begin:
 						   saddr, sport);
 				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
+				if (unlikely(IS_ERR(result)))
+					return NULL;
 				if (result)
 					return result;
 			}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 595ad408dba0..3d7c7460a0c5 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -191,7 +191,7 @@ struct sock *inet6_lookup_listener(struct net *net,
 				     saddr, sport, daddr, hnum,
 				     dif, sdif);
 	if (result)
-		return result;
+		goto done;
 
 	/* Lookup lhash2 with in6addr_any */
 
@@ -200,9 +200,10 @@ struct sock *inet6_lookup_listener(struct net *net,
 	if (ilb2->count > ilb->count)
 		goto port_lookup;
 
-	return inet6_lhash2_lookup(net, ilb2, skb, doff,
-				   saddr, sport, daddr, hnum,
-				   dif, sdif);
+	result = inet6_lhash2_lookup(net, ilb2, skb, doff,
+				     saddr, sport, daddr, hnum,
+				     dif, sdif);
+	goto done;
 
 port_lookup:
 	sk_for_each(sk, &ilb->head) {
@@ -214,12 +215,15 @@ port_lookup:
 				result = reuseport_select_sock(sk, phash,
 							       skb, doff);
 				if (result)
-					return result;
+					goto done;
 			}
 			result = sk;
 			hiscore = score;
 		}
 	}
+done:
+	if (unlikely(IS_ERR(result)))
+		return NULL;
 	return result;
 }
 EXPORT_SYMBOL_GPL(inet6_lookup_listener);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f6b96956a8ed..83f4c77c79d8 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -235,6 +235,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
 						  exact_dif, hslot2,
 						  skb);
 		}
+		if (unlikely(IS_ERR(result)))
+			return NULL;
 		return result;
 	}
 begin:
@@ -249,6 +251,8 @@ begin:
 						    saddr, sport);
 				result = reuseport_select_sock(sk, hash, skb,
 							sizeof(struct udphdr));
+				if (unlikely(IS_ERR(result)))
+					return NULL;
 				if (result)
 					return result;
 			}
-- 
cgit v1.2.3


From 19e226e8cc5da02f17ed119f9137036c0f0f5d80 Mon Sep 17 00:00:00 2001
From: Caleb Raitto <caraitto@google.com>
Date: Thu, 9 Aug 2018 18:18:28 -0700
Subject: virtio: Make vp_set_vq_affinity() take a mask.

Make vp_set_vq_affinity() take a cpumask instead of taking a single CPU.

If there are fewer queues than cores, queue affinity should be able to
map to multiple cores.

Link: https://patchwork.ozlabs.org/patch/948149/
Suggested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Caleb Raitto <caraitto@google.com>
Acked-by: Gonglei <arei.gonglei@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/virtio/virtio_crypto_core.c | 4 ++--
 drivers/net/virtio_net.c                   | 8 ++++----
 drivers/virtio/virtio_pci_common.c         | 7 +++----
 drivers/virtio/virtio_pci_common.h         | 2 +-
 include/linux/virtio_config.h              | 7 ++++---
 5 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/crypto/virtio/virtio_crypto_core.c b/drivers/crypto/virtio/virtio_crypto_core.c
index 83326986c113..7c7198553699 100644
--- a/drivers/crypto/virtio/virtio_crypto_core.c
+++ b/drivers/crypto/virtio/virtio_crypto_core.c
@@ -146,7 +146,7 @@ static void virtcrypto_clean_affinity(struct virtio_crypto *vi, long hcpu)
 
 	if (vi->affinity_hint_set) {
 		for (i = 0; i < vi->max_data_queues; i++)
-			virtqueue_set_affinity(vi->data_vq[i].vq, -1);
+			virtqueue_set_affinity(vi->data_vq[i].vq, NULL);
 
 		vi->affinity_hint_set = false;
 	}
@@ -173,7 +173,7 @@ static void virtcrypto_set_affinity(struct virtio_crypto *vcrypto)
 	 *
 	 */
 	for_each_online_cpu(cpu) {
-		virtqueue_set_affinity(vcrypto->data_vq[i].vq, cpu);
+		virtqueue_set_affinity(vcrypto->data_vq[i].vq, cpumask_of(cpu));
 		if (++i >= vcrypto->max_data_queues)
 			break;
 	}
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 39a7f4452587..43fabc0eb4d2 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1878,8 +1878,8 @@ static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
 
 	if (vi->affinity_hint_set) {
 		for (i = 0; i < vi->max_queue_pairs; i++) {
-			virtqueue_set_affinity(vi->rq[i].vq, -1);
-			virtqueue_set_affinity(vi->sq[i].vq, -1);
+			virtqueue_set_affinity(vi->rq[i].vq, NULL);
+			virtqueue_set_affinity(vi->sq[i].vq, NULL);
 		}
 
 		vi->affinity_hint_set = false;
@@ -1905,8 +1905,8 @@ static void virtnet_set_affinity(struct virtnet_info *vi)
 	for_each_online_cpu(cpu) {
 		const unsigned long *mask = cpumask_bits(cpumask_of(cpu));
 
-		virtqueue_set_affinity(vi->rq[i].vq, cpu);
-		virtqueue_set_affinity(vi->sq[i].vq, cpu);
+		virtqueue_set_affinity(vi->rq[i].vq, cpumask_of(cpu));
+		virtqueue_set_affinity(vi->sq[i].vq, cpumask_of(cpu));
 		__netif_set_xps_queue(vi->dev, mask, i, false);
 		i++;
 	}
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 705aebd74e56..465a6f5142cc 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -421,7 +421,7 @@ const char *vp_bus_name(struct virtio_device *vdev)
  * - OR over all affinities for shared MSI
  * - ignore the affinity request if we're using INTX
  */
-int vp_set_vq_affinity(struct virtqueue *vq, int cpu)
+int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask)
 {
 	struct virtio_device *vdev = vq->vdev;
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -435,11 +435,10 @@ int vp_set_vq_affinity(struct virtqueue *vq, int cpu)
 	if (vp_dev->msix_enabled) {
 		mask = vp_dev->msix_affinity_masks[info->msix_vector];
 		irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector);
-		if (cpu == -1)
+		if (!cpu_mask)
 			irq_set_affinity_hint(irq, NULL);
 		else {
-			cpumask_clear(mask);
-			cpumask_set_cpu(cpu, mask);
+			cpumask_copy(mask, cpu_mask);
 			irq_set_affinity_hint(irq, mask);
 		}
 	}
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 135ee3cf7175..02271002c2f3 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -141,7 +141,7 @@ const char *vp_bus_name(struct virtio_device *vdev);
  * - OR over all affinities for shared MSI
  * - ignore the affinity request if we're using INTX
  */
-int vp_set_vq_affinity(struct virtqueue *vq, int cpu);
+int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask);
 
 const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index);
 
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 5559a2d31c46..32baf8e26735 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -79,7 +79,8 @@ struct virtio_config_ops {
 	u64 (*get_features)(struct virtio_device *vdev);
 	int (*finalize_features)(struct virtio_device *vdev);
 	const char *(*bus_name)(struct virtio_device *vdev);
-	int (*set_vq_affinity)(struct virtqueue *vq, int cpu);
+	int (*set_vq_affinity)(struct virtqueue *vq,
+			       const struct cpumask *cpu_mask);
 	const struct cpumask *(*get_vq_affinity)(struct virtio_device *vdev,
 			int index);
 };
@@ -236,11 +237,11 @@ const char *virtio_bus_name(struct virtio_device *vdev)
  *
  */
 static inline
-int virtqueue_set_affinity(struct virtqueue *vq, int cpu)
+int virtqueue_set_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask)
 {
 	struct virtio_device *vdev = vq->vdev;
 	if (vdev->config->set_vq_affinity)
-		return vdev->config->set_vq_affinity(vq, cpu);
+		return vdev->config->set_vq_affinity(vq, cpu_mask);
 	return 0;
 }
 
-- 
cgit v1.2.3


From b86d865cb1cae1e61527ea0b8977078bbf694328 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Fri, 10 Aug 2018 13:28:07 -0700
Subject: blkcg: Make blkg_root_lookup() work for queues in bypass mode

For legacy queues the only call of blkg_root_lookup() happens after
bypass mode has been enabled. Since blkg_lookup() returns NULL for
queues in bypass mode, modify the blkg_root_lookup() such that it
no longer depends on bypass mode. Rename the function into
blk_queue_root_blkg() as suggested by Tejun.

Suggested-by: Tejun Heo <tj@kernel.org>
Fixes: 6bad9b210a22 ("blkcg: Introduce blkg_root_lookup()")
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-sysfs.c          |  2 +-
 include/linux/blk-cgroup.h | 15 +++++----------
 2 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 380bc284ced1..bb109bb0a055 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -813,7 +813,7 @@ static void __blk_release_queue(struct work_struct *work)
 		blk_exit_queue(q);
 	}
 
-	WARN(blkg_root_lookup(q),
+	WARN(blk_queue_root_blkg(q),
 	     "request queue %p is being released but it has not yet been removed from the blkcg controller\n",
 	     q);
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 1361cfc9b878..34aec30e06c7 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -342,20 +342,14 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
 }
 
 /**
- * blkg_lookup - look up blkg for the specified request queue
+ * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair
  * @q: request_queue of interest
  *
  * Lookup blkg for @q at the root level. See also blkg_lookup().
  */
-static inline struct blkcg_gq *blkg_root_lookup(struct request_queue *q)
+static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
 {
-	struct blkcg_gq *blkg;
-
-	rcu_read_lock();
-	blkg = blkg_lookup(&blkcg_root, q);
-	rcu_read_unlock();
-
-	return blkg;
+	return q->root_blkg;
 }
 
 /**
@@ -881,7 +875,8 @@ static inline bool blk_cgroup_congested(void) { return false; }
 static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }
 
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
-static inline struct blkcg_gq *blkg_root_lookup(struct request_queue *q) { return NULL; }
+static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
+{ return NULL; }
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
 static inline void blkcg_exit_queue(struct request_queue *q) { }
-- 
cgit v1.2.3


From e8d2bec0457962e8f348a9a3627b398f7fe5c5fc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 12 Aug 2018 01:59:17 +0200
Subject: bpf: decouple btf from seq bpf fs dump and enable more maps

Commit a26ca7c982cb ("bpf: btf: Add pretty print support to
the basic arraymap") and 699c86d6ec21 ("bpf: btf: add pretty
print for hash/lru_hash maps") enabled support for BTF and
dumping via BPF fs for array and hash/lru map. However, both
can be decoupled from each other such that regular BPF maps
can be supported for attaching BTF key/value information,
while not all maps necessarily need to dump via map_seq_show_elem()
callback.

The basic sanity check which is a prerequisite for all maps
is that key/value size has to match in any case, and some maps
can have extra checks via map_check_btf() callback, e.g.
probing certain types or indicating no support in general. With
that we can also enable retrieving BTF info for per-cpu map
types and lpm.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h        | 13 +++++++++----
 kernel/bpf/arraymap.c      | 26 ++++++++++++--------------
 kernel/bpf/cpumap.c        |  1 +
 kernel/bpf/devmap.c        |  1 +
 kernel/bpf/hashtab.c       | 20 +-------------------
 kernel/bpf/inode.c         |  3 ++-
 kernel/bpf/local_storage.c |  1 +
 kernel/bpf/lpm_trie.c      | 12 ++++++++++++
 kernel/bpf/sockmap.c       |  2 ++
 kernel/bpf/stackmap.c      |  1 +
 kernel/bpf/syscall.c       | 36 ++++++++++++++++++++++++++++++++----
 kernel/bpf/xskmap.c        |  3 +--
 12 files changed, 75 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index db11662faea6..523481a3471b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -23,7 +23,7 @@ struct bpf_prog;
 struct bpf_map;
 struct sock;
 struct seq_file;
-struct btf;
+struct btf_type;
 
 /* map is generic key/value storage optionally accesible by eBPF programs */
 struct bpf_map_ops {
@@ -48,8 +48,9 @@ struct bpf_map_ops {
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
 	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
 				  struct seq_file *m);
-	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
-			     u32 key_type_id, u32 value_type_id);
+	int (*map_check_btf)(const struct bpf_map *map,
+			     const struct btf_type *key_type,
+			     const struct btf_type *value_type);
 };
 
 struct bpf_map {
@@ -118,9 +119,13 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
 
 static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
 {
-	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
+	return map->btf && map->ops->map_seq_show_elem;
 }
 
+int map_check_no_btf(const struct bpf_map *map,
+		     const struct btf_type *key_type,
+		     const struct btf_type *value_type);
+
 extern const struct bpf_map_ops bpf_map_offload_ops;
 
 /* function argument constraints */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index f6ca3e712831..0c17aab3ce5f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -358,27 +358,20 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
-static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
-			       u32 btf_key_id, u32 btf_value_id)
+static int array_map_check_btf(const struct bpf_map *map,
+			       const struct btf_type *key_type,
+			       const struct btf_type *value_type)
 {
-	const struct btf_type *key_type, *value_type;
-	u32 key_size, value_size;
 	u32 int_data;
 
-	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
-	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
 		return -EINVAL;
 
 	int_data = *(u32 *)(key_type + 1);
-	/* bpf array can only take a u32 key.  This check makes
-	 * sure that the btf matches the attr used during map_create.
+	/* bpf array can only take a u32 key. This check makes sure
+	 * that the btf matches the attr used during map_create.
 	 */
-	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
-	    BTF_INT_OFFSET(int_data))
-		return -EINVAL;
-
-	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
-	if (!value_type || value_size != map->value_size)
+	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
 		return -EINVAL;
 
 	return 0;
@@ -405,6 +398,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_lookup_elem = percpu_array_map_lookup_elem,
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
+	.map_check_btf = array_map_check_btf,
 };
 
 static int fd_array_map_alloc_check(union bpf_attr *attr)
@@ -546,6 +540,7 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
 	.map_release_uref = bpf_fd_array_map_clear,
+	.map_check_btf = map_check_no_btf,
 };
 
 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -634,6 +629,7 @@ const struct bpf_map_ops perf_event_array_map_ops = {
 	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
 	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
 	.map_release = perf_event_fd_array_release,
+	.map_check_btf = map_check_no_btf,
 };
 
 #ifdef CONFIG_CGROUPS
@@ -665,6 +661,7 @@ const struct bpf_map_ops cgroup_array_map_ops = {
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
 	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
+	.map_check_btf = map_check_no_btf,
 };
 #endif
 
@@ -749,4 +746,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 	.map_gen_lookup = array_of_map_gen_lookup,
+	.map_check_btf = map_check_no_btf,
 };
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index e0918d180f08..3b494941a44a 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -555,6 +555,7 @@ const struct bpf_map_ops cpu_map_ops = {
 	.map_update_elem	= cpu_map_update_elem,
 	.map_lookup_elem	= cpu_map_lookup_elem,
 	.map_get_next_key	= cpu_map_get_next_key,
+	.map_check_btf		= map_check_no_btf,
 };
 
 static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index d361fc1e3bf3..a7c6620d8389 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -484,6 +484,7 @@ const struct bpf_map_ops dev_map_ops = {
 	.map_lookup_elem = dev_map_lookup_elem,
 	.map_update_elem = dev_map_update_elem,
 	.map_delete_elem = dev_map_delete_elem,
+	.map_check_btf = map_check_no_btf,
 };
 
 static int dev_map_notification(struct notifier_block *notifier,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d6110042e0d9..04b8eda94e7d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1185,23 +1185,6 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
-static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf,
-			      u32 btf_key_id, u32 btf_value_id)
-{
-	const struct btf_type *key_type, *value_type;
-	u32 key_size, value_size;
-
-	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
-	if (!key_type || key_size != map->key_size)
-		return -EINVAL;
-
-	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
-	if (!value_type || value_size != map->value_size)
-		return -EINVAL;
-
-	return 0;
-}
-
 const struct bpf_map_ops htab_map_ops = {
 	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
@@ -1212,7 +1195,6 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_delete_elem = htab_map_delete_elem,
 	.map_gen_lookup = htab_map_gen_lookup,
 	.map_seq_show_elem = htab_map_seq_show_elem,
-	.map_check_btf = htab_map_check_btf,
 };
 
 const struct bpf_map_ops htab_lru_map_ops = {
@@ -1225,7 +1207,6 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_delete_elem = htab_lru_map_delete_elem,
 	.map_gen_lookup = htab_lru_map_gen_lookup,
 	.map_seq_show_elem = htab_map_seq_show_elem,
-	.map_check_btf = htab_map_check_btf,
 };
 
 /* Called from eBPF program */
@@ -1452,4 +1433,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 	.map_gen_lookup = htab_of_map_gen_lookup,
+	.map_check_btf = map_check_no_btf,
 };
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index fc5b103512e7..2ada5e21dfa6 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -334,7 +334,8 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
 	struct bpf_map *map = arg;
 
 	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
-			     map->btf ? &bpffs_map_fops : &bpffs_obj_fops);
+			     bpf_map_support_seq_show(map) ?
+			     &bpffs_map_fops : &bpffs_obj_fops);
 }
 
 static struct dentry *
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index fc4e37f68f2a..22ad967d1e5f 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -246,6 +246,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
 	.map_lookup_elem = cgroup_storage_lookup_elem,
 	.map_update_elem = cgroup_storage_update_elem,
 	.map_delete_elem = cgroup_storage_delete_elem,
+	.map_check_btf = map_check_no_btf,
 };
 
 int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1603492c9cc7..9058317ba9de 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -10,11 +10,13 @@
  */
 
 #include <linux/bpf.h>
+#include <linux/btf.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <net/ipv6.h>
+#include <uapi/linux/btf.h>
 
 /* Intermediate node */
 #define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -686,6 +688,15 @@ free_stack:
 	return err;
 }
 
+static int trie_check_btf(const struct bpf_map *map,
+			  const struct btf_type *key_type,
+			  const struct btf_type *value_type)
+{
+	/* Keys must have struct bpf_lpm_trie_key embedded. */
+	return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ?
+	       -EINVAL : 0;
+}
+
 const struct bpf_map_ops trie_map_ops = {
 	.map_alloc = trie_alloc,
 	.map_free = trie_free,
@@ -693,4 +704,5 @@ const struct bpf_map_ops trie_map_ops = {
 	.map_lookup_elem = trie_lookup_elem,
 	.map_update_elem = trie_update_elem,
 	.map_delete_elem = trie_delete_elem,
+	.map_check_btf = trie_check_btf,
 };
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 0b38be5a955c..e376ffffebce 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -2495,6 +2495,7 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_update_elem = sock_map_update_elem,
 	.map_delete_elem = sock_map_delete_elem,
 	.map_release_uref = sock_map_release,
+	.map_check_btf = map_check_no_btf,
 };
 
 const struct bpf_map_ops sock_hash_ops = {
@@ -2505,6 +2506,7 @@ const struct bpf_map_ops sock_hash_ops = {
 	.map_update_elem = sock_hash_update_elem,
 	.map_delete_elem = sock_hash_delete_elem,
 	.map_release_uref = sock_map_release,
+	.map_check_btf = map_check_no_btf,
 };
 
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b675a3f3d141..8061a439ef18 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -607,6 +607,7 @@ const struct bpf_map_ops stack_map_ops = {
 	.map_lookup_elem = stack_map_lookup_elem,
 	.map_update_elem = stack_map_update_elem,
 	.map_delete_elem = stack_map_delete_elem,
+	.map_check_btf = map_check_no_btf,
 };
 
 static int __init stack_map_init(void)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 57f4d076141b..43727ed0d94a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -103,6 +103,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr,
 const struct bpf_map_ops bpf_map_offload_ops = {
 	.map_alloc = bpf_map_offload_map_alloc,
 	.map_free = bpf_map_offload_map_free,
+	.map_check_btf = map_check_no_btf,
 };
 
 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -455,6 +456,34 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
+int map_check_no_btf(const struct bpf_map *map,
+		     const struct btf_type *key_type,
+		     const struct btf_type *value_type)
+{
+	return -ENOTSUPP;
+}
+
+static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+			 u32 btf_key_id, u32 btf_value_id)
+{
+	const struct btf_type *key_type, *value_type;
+	u32 key_size, value_size;
+	int ret = 0;
+
+	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
+	if (!key_type || key_size != map->key_size)
+		return -EINVAL;
+
+	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
+	if (!value_type || value_size != map->value_size)
+		return -EINVAL;
+
+	if (map->ops->map_check_btf)
+		ret = map->ops->map_check_btf(map, key_type, value_type);
+
+	return ret;
+}
+
 #define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
@@ -489,8 +518,7 @@ static int map_create(union bpf_attr *attr)
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
-	if (bpf_map_support_seq_show(map) &&
-	    (attr->btf_key_type_id || attr->btf_value_type_id)) {
+	if (attr->btf_key_type_id || attr->btf_value_type_id) {
 		struct btf *btf;
 
 		if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
@@ -504,8 +532,8 @@ static int map_create(union bpf_attr *attr)
 			goto free_map_nouncharge;
 		}
 
-		err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id,
-					      attr->btf_value_type_id);
+		err = map_check_btf(map, btf, attr->btf_key_type_id,
+				    attr->btf_value_type_id);
 		if (err) {
 			btf_put(btf);
 			goto free_map_nouncharge;
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index b3c557476a8d..4ddf61e158f6 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -227,6 +227,5 @@ const struct bpf_map_ops xsk_map_ops = {
 	.map_lookup_elem = xsk_map_lookup_elem,
 	.map_update_elem = xsk_map_update_elem,
 	.map_delete_elem = xsk_map_delete_elem,
+	.map_check_btf = map_check_no_btf,
 };
-
-
-- 
cgit v1.2.3


From 7723628101aaeb1d723786747529b4ea65c5b5c5 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Sun, 12 Aug 2018 10:49:27 -0700
Subject: bpf: Introduce bpf_skb_ancestor_cgroup_id helper

== Problem description ==

It's useful to be able to identify cgroup associated with skb in TC so
that a policy can be applied to this skb, and existing bpf_skb_cgroup_id
helper can help with this.

Though in real life cgroup hierarchy and hierarchy to apply a policy to
don't map 1:1.

It's often the case that there is a container and corresponding cgroup,
but there are many more sub-cgroups inside container, e.g. because it's
delegated to containerized application to control resources for its
subsystems, or to separate application inside container from infra that
belongs to containerization system (e.g. sshd).

At the same time it may be useful to apply a policy to container as a
whole.

If multiple containers like this are run on a host (what is often the
case) and many of them have sub-cgroups, it may not be possible to apply
per-container policy in TC with existing helpers such as
bpf_skb_under_cgroup or bpf_skb_cgroup_id:

* bpf_skb_cgroup_id will return id of immediate cgroup associated with
  skb, i.e. if it's a sub-cgroup inside container, it can't be used to
  identify container's cgroup;

* bpf_skb_under_cgroup can work only with one cgroup and doesn't scale,
  i.e. if there are N containers on a host and a policy has to be
  applied to M of them (0 <= M <= N), it'd require M calls to
  bpf_skb_under_cgroup, and, if M changes, it'd require to rebuild &
  load new BPF program.

== Solution ==

The patch introduces new helper bpf_skb_ancestor_cgroup_id that can be
used to get id of cgroup v2 that is an ancestor of cgroup associated
with skb at specified level of cgroup hierarchy.

That way admin can place all containers on one level of cgroup hierarchy
(what is a good practice in general and already used in many
configurations) and identify specific cgroup on this level no matter
what sub-cgroup skb is associated with.

E.g. if there is a cgroup hierarchy:
  root/
  root/container1/
  root/container1/app11/
  root/container1/app11/sub-app-a/
  root/container1/app12/
  root/container2/
  root/container2/app21/
  root/container2/app22/
  root/container2/app22/sub-app-b/

, then having skb associated with root/container1/app11/sub-app-a/ it's
possible to get ancestor at level 1, what is container1 and apply policy
for this container, or apply another policy if it's container2.

Policies can be kept e.g. in a hash map where key is a container cgroup
id and value is an action.

Levels where container cgroups are created are usually known in advance
whether cgroup hierarchy inside container may be hard to predict
especially in case when its creation is delegated to containerized
application.

== Implementation details ==

The helper gets ancestor by walking parents up to specified level.

Another option would be to get different kind of "id" from
cgroup->ancestor_ids[level] and use it with idr_find() to get struct
cgroup for ancestor. But that would require radix lookup what doesn't
seem to be better (at least it's not obviously better).

Format of return value of the new helper is same as that of
bpf_skb_cgroup_id.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/cgroup.h   | 30 ++++++++++++++++++++++++++++++
 include/uapi/linux/bpf.h | 21 ++++++++++++++++++++-
 net/core/filter.c        | 28 ++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c9fdf6f57913..32c553556bbd 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -553,6 +553,36 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
 	return cgrp->ancestor_ids[ancestor->level] == ancestor->id;
 }
 
+/**
+ * cgroup_ancestor - find ancestor of cgroup
+ * @cgrp: cgroup to find ancestor of
+ * @ancestor_level: level of ancestor to find starting from root
+ *
+ * Find ancestor of cgroup at specified level starting from root if it exists
+ * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at
+ * @ancestor_level.
+ *
+ * This function is safe to call as long as @cgrp is accessible.
+ */
+static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
+					     int ancestor_level)
+{
+	struct cgroup *ptr;
+
+	if (cgrp->level < ancestor_level)
+		return NULL;
+
+	for (ptr = cgrp;
+	     ptr && ptr->level > ancestor_level;
+	     ptr = cgroup_parent(ptr))
+		;
+
+	if (ptr && ptr->level == ancestor_level)
+		return ptr;
+
+	return NULL;
+}
+
 /**
  * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
  * @task: the task to be tested
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3102a2a23c31..66917a4eba27 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2093,6 +2093,24 @@ union bpf_attr {
  * 	Return
  * 		The id is returned or 0 in case the id could not be retrieved.
  *
+ * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level)
+ *	Description
+ *		Return id of cgroup v2 that is ancestor of cgroup associated
+ *		with the *skb* at the *ancestor_level*.  The root cgroup is at
+ *		*ancestor_level* zero and each step down the hierarchy
+ *		increments the level. If *ancestor_level* == level of cgroup
+ *		associated with *skb*, then return value will be same as that
+ *		of **bpf_skb_cgroup_id**\ ().
+ *
+ *		The helper is useful to implement policies based on cgroups
+ *		that are upper in hierarchy than immediate cgroup associated
+ *		with *skb*.
+ *
+ *		The format of returned id and helper limitations are same as in
+ *		**bpf_skb_cgroup_id**\ ().
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
+ *
  * u64 bpf_get_current_cgroup_id(void)
  * 	Return
  * 		A 64-bit integer containing the current cgroup id based
@@ -2207,7 +2225,8 @@ union bpf_attr {
 	FN(skb_cgroup_id),		\
 	FN(get_current_cgroup_id),	\
 	FN(get_local_storage),		\
-	FN(sk_select_reuseport),
+	FN(sk_select_reuseport),	\
+	FN(skb_ancestor_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 22906b31d43f..15b9d2df92ca 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3778,6 +3778,32 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
 	.ret_type       = RET_INTEGER,
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
+
+BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
+	   ancestor_level)
+{
+	struct sock *sk = skb_to_full_sk(skb);
+	struct cgroup *ancestor;
+	struct cgroup *cgrp;
+
+	if (!sk || !sk_fullsock(sk))
+		return 0;
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	ancestor = cgroup_ancestor(cgrp, ancestor_level);
+	if (!ancestor)
+		return 0;
+
+	return ancestor->kn->id.id;
+}
+
+static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
+	.func           = bpf_skb_ancestor_cgroup_id,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_ANYTHING,
+};
 #endif
 
 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
@@ -4966,6 +4992,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #ifdef CONFIG_SOCK_CGROUP_DATA
 	case BPF_FUNC_skb_cgroup_id:
 		return &bpf_skb_cgroup_id_proto;
+	case BPF_FUNC_skb_ancestor_cgroup_id:
+		return &bpf_skb_ancestor_cgroup_id_proto;
 #endif
 	default:
 		return bpf_base_func_proto(func_id);
-- 
cgit v1.2.3


From 9af18e56d43ca4864ce65c3542c513827c2697de Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Sun, 12 Aug 2018 09:14:03 -0400
Subject: cpumask: make cpumask_next_wrap available without smp

The kbuild robot shows build failure on machines without CONFIG_SMP:

  drivers/net/virtio_net.c:1916:10: error:
    implicit declaration of function 'cpumask_next_wrap'

cpumask_next_wrap is exported from lib/cpumask.o, which has

    lib-$(CONFIG_SMP) += cpumask.o

same as other functions, also define it as static inline in the
NR_CPUS==1 branch in include/linux/cpumask.h.

If wrap is true and next == start, return nr_cpumask_bits, or 1.
Else wrap across the range of valid cpus, here [0].

Fixes: 2ca653d607ce ("virtio_net: Stripe queue affinities across cores.")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Tested-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cpumask.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 57f20a0a7794..147bdec42215 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -159,6 +159,13 @@ static inline unsigned int cpumask_next_and(int n,
 	return n+1;
 }
 
+static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask,
+					     int start, bool wrap)
+{
+	/* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */
+	return (wrap && n == 0);
+}
+
 /* cpu must be a valid cpu, ie 0, so there's no other choice. */
 static inline unsigned int cpumask_any_but(const struct cpumask *mask,
 					   unsigned int cpu)
-- 
cgit v1.2.3


From cf916ffbe0c628bb072ea829f300cff1874162ce Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.com>
Date: Sun, 29 Apr 2018 09:17:34 -0500
Subject: net/mlx5: Improve argument name for add flow API

The last argument to mlx5_add_flow_rules passes the number of
destinations in the struct pointed to by the dest arg. Change the name
to better reflect this fact.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 8 ++++----
 include/linux/mlx5/fs.h                           | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index a21df24b695e..261cb6aacf12 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1876,7 +1876,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 		    struct mlx5_flow_spec *spec,
 		    struct mlx5_flow_act *flow_act,
 		    struct mlx5_flow_destination *dest,
-		    int dest_num)
+		    int num_dest)
 {
 	struct mlx5_flow_root_namespace *root = find_root(&ft->node);
 	struct mlx5_flow_destination gen_dest = {};
@@ -1889,7 +1889,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 	if (flow_act->action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
 		if (!fwd_next_prio_supported(ft))
 			return ERR_PTR(-EOPNOTSUPP);
-		if (dest_num)
+		if (num_dest)
 			return ERR_PTR(-EINVAL);
 		mutex_lock(&root->chain_lock);
 		next_ft = find_next_chained_ft(prio);
@@ -1897,7 +1897,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 			gen_dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
 			gen_dest.ft = next_ft;
 			dest = &gen_dest;
-			dest_num = 1;
+			num_dest = 1;
 			flow_act->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 		} else {
 			mutex_unlock(&root->chain_lock);
@@ -1905,7 +1905,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 		}
 	}
 
-	handle = _mlx5_add_flow_rules(ft, spec, flow_act, dest, dest_num);
+	handle = _mlx5_add_flow_rules(ft, spec, flow_act, dest, num_dest);
 
 	if (sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
 		if (!IS_ERR_OR_NULL(handle) &&
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index c40f2fc68655..71fb503b2b52 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -177,7 +177,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 		    struct mlx5_flow_spec *spec,
 		    struct mlx5_flow_act *flow_act,
 		    struct mlx5_flow_destination *dest,
-		    int dest_num);
+		    int num_dest);
 void mlx5_del_flow_rules(struct mlx5_flow_handle *fr);
 
 int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler,
-- 
cgit v1.2.3


From b72ae8cac0caff86fe414fceb940655c2d1371c9 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Sun, 29 Jul 2018 16:16:56 +0300
Subject: PCI: Add PCI_DEVICE_DATA() macro to fully describe device ID entry

There are a lot of examples in the kernel where PCI_VDEVICE() is used and
still looks not so convenient due to additional driver_data field attached.

Introduce PCI_DEVICE_DATA() macro to fully describe device ID entry in
shortest possible form. For example,

  before:

    { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_MRFLD),
      (kernel_ulong_t) &dwc3_pci_mrfld_properties, },

  after:

    { PCI_DEVICE_DATA(INTEL, MRFLD, &dwc3_pci_mrfld_properties) },

Drivers can be converted later on in independent way.

While here, remove the unused macro with the same name from Ralink wireless
driver.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Kalle Valo <kvalo@codeaurora.org>	# for rt2x00
---
 drivers/net/wireless/ralink/rt2x00/rt2x00pci.h |  6 ------
 include/linux/pci.h                            | 15 +++++++++++++++
 2 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00pci.h b/drivers/net/wireless/ralink/rt2x00/rt2x00pci.h
index bc0ca5f58f38..283e2e607bba 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00pci.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00pci.h
@@ -27,12 +27,6 @@
 #include <linux/io.h>
 #include <linux/pci.h>
 
-/*
- * This variable should be used with the
- * pci_driver structure initialization.
- */
-#define PCI_DEVICE_DATA(__ops)	.driver_data = (kernel_ulong_t)(__ops)
-
 /*
  * PCI driver handlers.
  */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 340029b2fb38..bf3665c534c2 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -819,6 +819,21 @@ struct pci_driver {
 	.vendor = PCI_VENDOR_ID_##vend, .device = (dev), \
 	.subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, 0, 0
 
+/**
+ * PCI_DEVICE_DATA - macro used to describe a specific PCI device in very short form
+ * @vend: the vendor name (without PCI_VENDOR_ID_ prefix)
+ * @dev: the device name (without PCI_DEVICE_ID_<vend>_ prefix)
+ * @data: the driver data to be filled
+ *
+ * This macro is used to create a struct pci_device_id that matches a
+ * specific PCI device.  The subvendor, and subdevice fields will be set
+ * to PCI_ANY_ID.
+ */
+#define PCI_DEVICE_DATA(vend, dev, data) \
+	.vendor = PCI_VENDOR_ID_##vend, .device = PCI_DEVICE_ID_##vend##_##dev, \
+	.subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, 0, 0, \
+	.driver_data = (kernel_ulong_t)(data)
+
 enum {
 	PCI_REASSIGN_ALL_RSRC	= 0x00000001,	/* Ignore firmware setup */
 	PCI_REASSIGN_ALL_BUS	= 0x00000002,	/* Reassign all bus numbers */
-- 
cgit v1.2.3


From 2538fb89b8f4ee9e1c1759cfb3c7989d9ef1f6e7 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 14 Aug 2018 16:48:50 -0700
Subject: PCI: Limit config space size for Netronome NFP5000

Like the NFP4000 and NFP6000, the NFP5000 as an erratum where reading/
writing to PCI config space addresses above 0x600 can cause the NFP to
generate PCIe completion timeouts.

Limit the NFP5000's PF's config space size to 0x600 bytes as is already
done for the NFP4000 and NFP6000.

The NFP5000's VF is 0x6003 (PCI_DEVICE_ID_NETRONOME_NFP6000_VF), the same
device ID as the NFP6000's VF.  Thus, its config space is already limited
by the existing use of quirk_nfp6000().

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Tony Egan <tony.egan@netronome.com>
---
 drivers/pci/quirks.c    | 1 +
 include/linux/pci_ids.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 502275c092ae..7743cd56b89a 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -460,6 +460,7 @@ static void quirk_nfp6000(struct pci_dev *dev)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NETRONOME,	PCI_DEVICE_ID_NETRONOME_NFP4000,	quirk_nfp6000);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NETRONOME,	PCI_DEVICE_ID_NETRONOME_NFP6000,	quirk_nfp6000);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NETRONOME,	PCI_DEVICE_ID_NETRONOME_NFP5000,	quirk_nfp6000);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NETRONOME,	PCI_DEVICE_ID_NETRONOME_NFP6000_VF,	quirk_nfp6000);
 
 /*  On IBM Crocodile ipr SAS adapters, expand BAR to system page size */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 29502238e510..381fc009551d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2541,6 +2541,7 @@
 #define PCI_DEVICE_ID_NETRONOME_NFP3200	0x3200
 #define PCI_DEVICE_ID_NETRONOME_NFP3240	0x3240
 #define PCI_DEVICE_ID_NETRONOME_NFP4000	0x4000
+#define PCI_DEVICE_ID_NETRONOME_NFP5000	0x5000
 #define PCI_DEVICE_ID_NETRONOME_NFP6000	0x6000
 #define PCI_DEVICE_ID_NETRONOME_NFP6000_VF	0x6003
 
-- 
cgit v1.2.3


From 817aef260037f33ee0f44c17fe341323d3aebd6d Mon Sep 17 00:00:00 2001
From: Yannik Sembritzki <yannik@sembritzki.me>
Date: Thu, 16 Aug 2018 14:05:10 +0100
Subject: Replace magic for trusting the secondary keyring with #define

Replace the use of a magic number that indicates that verify_*_signature()
should use the secondary keyring with a symbol.

Signed-off-by: Yannik Sembritzki <yannik@sembritzki.me>
Signed-off-by: David Howells <dhowells@redhat.com>
Cc: keyrings@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 certs/system_keyring.c                  | 3 ++-
 crypto/asymmetric_keys/pkcs7_key_type.c | 2 +-
 include/linux/verification.h            | 6 ++++++
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index 6251d1b27f0c..81728717523d 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -15,6 +15,7 @@
 #include <linux/cred.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/verification.h>
 #include <keys/asymmetric-type.h>
 #include <keys/system_keyring.h>
 #include <crypto/pkcs7.h>
@@ -230,7 +231,7 @@ int verify_pkcs7_signature(const void *data, size_t len,
 
 	if (!trusted_keys) {
 		trusted_keys = builtin_trusted_keys;
-	} else if (trusted_keys == (void *)1UL) {
+	} else if (trusted_keys == VERIFY_USE_SECONDARY_KEYRING) {
 #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING
 		trusted_keys = secondary_trusted_keys;
 #else
diff --git a/crypto/asymmetric_keys/pkcs7_key_type.c b/crypto/asymmetric_keys/pkcs7_key_type.c
index e284d9cb9237..5b2f6a2b5585 100644
--- a/crypto/asymmetric_keys/pkcs7_key_type.c
+++ b/crypto/asymmetric_keys/pkcs7_key_type.c
@@ -63,7 +63,7 @@ static int pkcs7_preparse(struct key_preparsed_payload *prep)
 
 	return verify_pkcs7_signature(NULL, 0,
 				      prep->data, prep->datalen,
-				      (void *)1UL, usage,
+				      VERIFY_USE_SECONDARY_KEYRING, usage,
 				      pkcs7_view_content, prep);
 }
 
diff --git a/include/linux/verification.h b/include/linux/verification.h
index a10549a6c7cd..cfa4730d607a 100644
--- a/include/linux/verification.h
+++ b/include/linux/verification.h
@@ -12,6 +12,12 @@
 #ifndef _LINUX_VERIFICATION_H
 #define _LINUX_VERIFICATION_H
 
+/*
+ * Indicate that both builtin trusted keys and secondary trusted keys
+ * should be used.
+ */
+#define VERIFY_USE_SECONDARY_KEYRING ((struct key *)1UL)
+
 /*
  * The use to which an asymmetric key is being put.
  */
-- 
cgit v1.2.3