From b5a8d233a588b3acf2a7a3a8da30f8f68f376626 Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Mon, 4 Jan 2021 17:14:50 +0100
Subject: bus: mhi: core: Add device hardware reset support

The MHI specification allows to perform a hard reset of the device
when writing to the SOC_RESET register. It can be used to completely
restart the device (e.g. in case of unrecoverable MHI error).

This is up to the MHI controller driver to determine when this hard
reset should be used, and in case of MHI errors, should be used as
a reset of last resort (after standard MHI stack reset).

This function is a stateless function, the MHI layer do nothing except
triggering the reset by writing into the right register(s), this is up
to the caller to ensure right mhi_controller state (e.g. unregister the
controller if necessary).

Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 562862ff819c..54afcae1709a 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -347,6 +347,7 @@ struct mhi_controller_config {
  * @unmap_single: CB function to destroy TRE buffer
  * @read_reg: Read a MHI register via the physical link (required)
  * @write_reg: Write a MHI register via the physical link (required)
+ * @reset: Controller specific reset function (optional)
  * @buffer_len: Bounce buffer length
  * @index: Index of the MHI controller instance
  * @bounce_buf: Use of bounce buffer
@@ -437,6 +438,7 @@ struct mhi_controller {
 			u32 *out);
 	void (*write_reg)(struct mhi_controller *mhi_cntrl, void __iomem *addr,
 			  u32 val);
+	void (*reset)(struct mhi_controller *mhi_cntrl);
 
 	size_t buffer_len;
 	int index;
@@ -672,6 +674,13 @@ enum mhi_ee_type mhi_get_exec_env(struct mhi_controller *mhi_cntrl);
  */
 enum mhi_state mhi_get_mhi_state(struct mhi_controller *mhi_cntrl);
 
+/**
+ * mhi_soc_reset - Trigger a device reset. This can be used as a last resort
+ *		   to reset and recover a device.
+ * @mhi_cntrl: MHI controller
+ */
+void mhi_soc_reset(struct mhi_controller *mhi_cntrl);
+
 /**
  * mhi_device_get - Disable device low power mode
  * @mhi_dev: Device associated with the channel
-- 
cgit v1.2.3


From 14e43bf435612639cab01541fce7cc41bf7e370b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 22 Sep 2020 09:44:18 -0700
Subject: vfs: don't unnecessarily clone write access for writable fds

There's no need for mnt_want_write_file() to increment mnt_writers when
the file is already open for writing, provided that
mnt_drop_write_file() is changed to conditionally decrement it.

We seem to have ended up in the current situation because
mnt_want_write_file() used to be paired with mnt_drop_write(), due to
mnt_drop_write_file() not having been added yet.  So originally
mnt_want_write_file() had to always increment mnt_writers.

But later mnt_drop_write_file() was added, and all callers of
mnt_want_write_file() were paired with it.  This makes the compatibility
between mnt_want_write_file() and mnt_drop_write() no longer necessary.

Therefore, make __mnt_want_write_file() and __mnt_drop_write_file() skip
incrementing mnt_writers on files already open for writing.  This
removes the only caller of mnt_clone_write(), so remove that too.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst |  7 +++++
 fs/namespace.c                        | 53 +++++++++++++----------------------
 include/linux/mount.h                 |  1 -
 3 files changed, 27 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 867036aa90b8..6a6d3e673b48 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -865,3 +865,10 @@ no matter what.  Everything is handled by the caller.
 
 clone_private_mount() returns a longterm mount now, so the proper destructor of
 its result is kern_unmount() or kern_unmount_array().
+
+---
+
+**mandatory**
+
+mnt_want_write_file() can now only be paired with mnt_drop_write_file(),
+whereas previously it could be paired with mnt_drop_write() as well.
diff --git a/fs/namespace.c b/fs/namespace.c
index d2db7dfe232b..9f2d94e0f3e0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -359,51 +359,37 @@ int mnt_want_write(struct vfsmount *m)
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
-/**
- * mnt_clone_write - get write access to a mount
- * @mnt: the mount on which to take a write
- *
- * This is effectively like mnt_want_write, except
- * it must only be used to take an extra write reference
- * on a mountpoint that we already know has a write reference
- * on it. This allows some optimisation.
- *
- * After finished, mnt_drop_write must be called as usual to
- * drop the reference.
- */
-int mnt_clone_write(struct vfsmount *mnt)
-{
-	/* superblock may be r/o */
-	if (__mnt_is_readonly(mnt))
-		return -EROFS;
-	preempt_disable();
-	mnt_inc_writers(real_mount(mnt));
-	preempt_enable();
-	return 0;
-}
-EXPORT_SYMBOL_GPL(mnt_clone_write);
-
 /**
  * __mnt_want_write_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
- * This is like __mnt_want_write, but it takes a file and can
- * do some optimisations if the file is open for write already
+ * This is like __mnt_want_write, but if the file is already open for writing it
+ * skips incrementing mnt_writers (since the open file already has a reference)
+ * and instead only does the check for emergency r/o remounts.  This must be
+ * paired with __mnt_drop_write_file.
  */
 int __mnt_want_write_file(struct file *file)
 {
-	if (!(file->f_mode & FMODE_WRITER))
-		return __mnt_want_write(file->f_path.mnt);
-	else
-		return mnt_clone_write(file->f_path.mnt);
+	if (file->f_mode & FMODE_WRITER) {
+		/*
+		 * Superblock may have become readonly while there are still
+		 * writable fd's, e.g. due to a fs error with errors=remount-ro
+		 */
+		if (__mnt_is_readonly(file->f_path.mnt))
+			return -EROFS;
+		return 0;
+	}
+	return __mnt_want_write(file->f_path.mnt);
 }
 
 /**
  * mnt_want_write_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
- * This is like mnt_want_write, but it takes a file and can
- * do some optimisations if the file is open for write already
+ * This is like mnt_want_write, but if the file is already open for writing it
+ * skips incrementing mnt_writers (since the open file already has a reference)
+ * and instead only does the freeze protection and the check for emergency r/o
+ * remounts.  This must be paired with mnt_drop_write_file.
  */
 int mnt_want_write_file(struct file *file)
 {
@@ -449,7 +435,8 @@ EXPORT_SYMBOL_GPL(mnt_drop_write);
 
 void __mnt_drop_write_file(struct file *file)
 {
-	__mnt_drop_write(file->f_path.mnt);
+	if (!(file->f_mode & FMODE_WRITER))
+		__mnt_drop_write(file->f_path.mnt);
 }
 
 void mnt_drop_write_file(struct file *file)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index aaf343b38671..b43191fe6af7 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -79,7 +79,6 @@ struct path;
 
 extern int mnt_want_write(struct vfsmount *mnt);
 extern int mnt_want_write_file(struct file *file);
-extern int mnt_clone_write(struct vfsmount *mnt);
 extern void mnt_drop_write(struct vfsmount *mnt);
 extern void mnt_drop_write_file(struct file *file);
 extern void mntput(struct vfsmount *mnt);
-- 
cgit v1.2.3


From 4ec908d2104026c67e4eb0fee722ed6893622763 Mon Sep 17 00:00:00 2001
From: Jun Nie <jun.nie@linaro.org>
Date: Fri, 4 Dec 2020 15:53:44 +0800
Subject: dt-bindings: interconnect: Add Qualcomm MSM8939 DT bindings

The Qualcomm MSM8939 platform has several bus fabrics that could be
controlled and tuned dynamically according to the bandwidth demand.

Signed-off-by: Jun Nie <jun.nie@linaro.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20201204075345.5161-5-jun.nie@linaro.org
Signed-off-by: Georgi Djakov <georgi.djakov@linaro.org>
---
 .../devicetree/bindings/interconnect/qcom,rpm.yaml |   4 +
 include/dt-bindings/interconnect/qcom,msm8939.h    | 105 +++++++++++++++++++++
 2 files changed, 109 insertions(+)
 create mode 100644 include/dt-bindings/interconnect/qcom,msm8939.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/interconnect/qcom,rpm.yaml b/Documentation/devicetree/bindings/interconnect/qcom,rpm.yaml
index d720b68b3ead..983d71fb5399 100644
--- a/Documentation/devicetree/bindings/interconnect/qcom,rpm.yaml
+++ b/Documentation/devicetree/bindings/interconnect/qcom,rpm.yaml
@@ -23,6 +23,10 @@ properties:
       - qcom,msm8916-bimc
       - qcom,msm8916-pcnoc
       - qcom,msm8916-snoc
+      - qcom,msm8939-bimc
+      - qcom,msm8939-pcnoc
+      - qcom,msm8939-snoc
+      - qcom,msm8939-snoc-mm
       - qcom,qcs404-bimc
       - qcom,qcs404-pcnoc
       - qcom,qcs404-snoc
diff --git a/include/dt-bindings/interconnect/qcom,msm8939.h b/include/dt-bindings/interconnect/qcom,msm8939.h
new file mode 100644
index 000000000000..c22369a4b9f5
--- /dev/null
+++ b/include/dt-bindings/interconnect/qcom,msm8939.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Qualcomm interconnect IDs
+ *
+ * Copyright (c) 2020, Linaro Ltd.
+ * Author: Jun Nie <jun.nie@linaro.org>
+ */
+
+#ifndef __DT_BINDINGS_INTERCONNECT_QCOM_MSM8939_H
+#define __DT_BINDINGS_INTERCONNECT_QCOM_MSM8939_H
+
+#define BIMC_SNOC_SLV			0
+#define MASTER_QDSS_BAM			1
+#define MASTER_QDSS_ETR			2
+#define MASTER_SNOC_CFG			3
+#define PCNOC_SNOC_SLV			4
+#define SLAVE_APSS			5
+#define SLAVE_CATS_128			6
+#define SLAVE_OCMEM_64			7
+#define SLAVE_IMEM			8
+#define SLAVE_QDSS_STM			9
+#define SLAVE_SRVC_SNOC			10
+#define SNOC_BIMC_0_MAS			11
+#define SNOC_BIMC_1_MAS			12
+#define SNOC_BIMC_2_MAS			13
+#define SNOC_INT_0			14
+#define SNOC_INT_1			15
+#define SNOC_INT_BIMC			16
+#define SNOC_PCNOC_MAS			17
+#define SNOC_QDSS_INT			18
+
+#define MASTER_VIDEO_P0			0
+#define MASTER_JPEG			1
+#define MASTER_VFE			2
+#define MASTER_MDP_PORT0		3
+#define MASTER_MDP_PORT1		4
+#define MASTER_CPP			5
+#define SNOC_MM_INT_0			6
+#define SNOC_MM_INT_1			7
+#define SNOC_MM_INT_2			8
+
+#define BIMC_SNOC_MAS			0
+#define MASTER_AMPSS_M0			1
+#define MASTER_GRAPHICS_3D		2
+#define MASTER_TCU0			3
+#define SLAVE_AMPSS_L2			4
+#define SLAVE_EBI_CH0			5
+#define SNOC_BIMC_0_SLV			6
+#define SNOC_BIMC_1_SLV			7
+#define SNOC_BIMC_2_SLV			8
+
+#define MASTER_BLSP_1			0
+#define MASTER_DEHR			1
+#define MASTER_LPASS			2
+#define MASTER_CRYPTO_CORE0		3
+#define MASTER_SDCC_1			4
+#define MASTER_SDCC_2			5
+#define MASTER_SPDM			6
+#define MASTER_USB_HS1			7
+#define MASTER_USB_HS2			8
+#define PCNOC_INT_0			9
+#define PCNOC_INT_1			10
+#define PCNOC_MAS_0			11
+#define PCNOC_MAS_1			12
+#define PCNOC_SLV_0			13
+#define PCNOC_SLV_1			14
+#define PCNOC_SLV_2			15
+#define PCNOC_SLV_3			16
+#define PCNOC_SLV_4			17
+#define PCNOC_SLV_8			18
+#define PCNOC_SLV_9			19
+#define PCNOC_SNOC_MAS			20
+#define SLAVE_BIMC_CFG			21
+#define SLAVE_BLSP_1			22
+#define SLAVE_BOOT_ROM			23
+#define SLAVE_CAMERA_CFG		24
+#define SLAVE_CLK_CTL			25
+#define SLAVE_CRYPTO_0_CFG			26
+#define SLAVE_DEHR_CFG			27
+#define SLAVE_DISPLAY_CFG			28
+#define SLAVE_GRAPHICS_3D_CFG			29
+#define SLAVE_IMEM_CFG			30
+#define SLAVE_LPASS			31
+#define SLAVE_MPM			32
+#define SLAVE_MSG_RAM			33
+#define SLAVE_MSS			34
+#define SLAVE_PDM			35
+#define SLAVE_PMIC_ARB			36
+#define SLAVE_PCNOC_CFG			37
+#define SLAVE_PRNG			38
+#define SLAVE_QDSS_CFG			39
+#define SLAVE_RBCPR_CFG			40
+#define SLAVE_SDCC_1			41
+#define SLAVE_SDCC_2			42
+#define SLAVE_SECURITY			43
+#define SLAVE_SNOC_CFG			44
+#define SLAVE_SPDM			45
+#define SLAVE_TCSR			46
+#define SLAVE_TLMM			47
+#define SLAVE_USB_HS1			48
+#define SLAVE_USB_HS2			49
+#define SLAVE_VENUS_CFG			50
+#define SNOC_PCNOC_SLV			51
+
+#endif
-- 
cgit v1.2.3


From 8527efc59d45d7135460daac36054f2f213ac08a Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Tue, 5 Jan 2021 19:59:05 -0800
Subject: rpmsg: glink: Guard qcom_glink_ssr_notify() with correct config

The qcom_glink_ssr_notify() function doesn't relate to the SMEM specific
GLINK config, but the common RPMSG_QCOM_GLINK config. Update the guard
to properly reflect this.

Reviewed-by: Alex Elder <elder@linaro.org>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20210106035905.4153692-1-bjorn.andersson@linaro.org
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/linux/rpmsg/qcom_glink.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/rpmsg/qcom_glink.h b/include/linux/rpmsg/qcom_glink.h
index daded9fddf36..22fc3a69b683 100644
--- a/include/linux/rpmsg/qcom_glink.h
+++ b/include/linux/rpmsg/qcom_glink.h
@@ -7,12 +7,17 @@
 
 struct qcom_glink;
 
+#if IS_ENABLED(CONFIG_RPMSG_QCOM_GLINK)
+void qcom_glink_ssr_notify(const char *ssr_name);
+#else
+static inline void qcom_glink_ssr_notify(const char *ssr_name) {}
+#endif
+
 #if IS_ENABLED(CONFIG_RPMSG_QCOM_GLINK_SMEM)
 
 struct qcom_glink *qcom_glink_smem_register(struct device *parent,
 					    struct device_node *node);
 void qcom_glink_smem_unregister(struct qcom_glink *glink);
-void qcom_glink_ssr_notify(const char *ssr_name);
 
 #else
 
@@ -24,7 +29,6 @@ qcom_glink_smem_register(struct device *parent,
 }
 
 static inline void qcom_glink_smem_unregister(struct qcom_glink *glink) {}
-static inline void qcom_glink_ssr_notify(const char *ssr_name) {}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 9326eecd9365a5b82220a4011592f7c0209566fc Mon Sep 17 00:00:00 2001
From: Xu Yilun <yilun.xu@intel.com>
Date: Wed, 6 Jan 2021 20:37:10 -0800
Subject: fpga: dfl: move dfl_device_id to mod_devicetable.h

In order to support MODULE_DEVICE_TABLE() for dfl device driver, this
patch moves struct dfl_device_id to mod_devicetable.h

Some brief description for DFL (Device Feature List) is added to make
the DFL known to the whole kernel.

Reviewed-by: Tom Rix <trix@redhat.com>
Acked-by: Wu Hao <hao.wu@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Signed-off-by: Matthew Gerlach <matthew.gerlach@linux.intel.com>
Signed-off-by: Russ Weight <russell.h.weight@intel.com>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
Link: https://lore.kernel.org/r/20210107043714.991646-5-mdf@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/fpga/dfl.h              | 13 +------------
 include/linux/mod_devicetable.h | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index ac373b1fcff9..549c7900dcfd 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -22,6 +22,7 @@
 #include <linux/interrupt.h>
 #include <linux/iopoll.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/mod_devicetable.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/uuid.h>
@@ -525,18 +526,6 @@ enum dfl_id_type {
 	DFL_ID_MAX,
 };
 
-/**
- * struct dfl_device_id -  dfl device identifier
- * @type: DFL FIU type of the device. See enum dfl_id_type.
- * @feature_id: feature identifier local to its DFL FIU type.
- * @driver_data: driver specific data.
- */
-struct dfl_device_id {
-	u16 type;
-	u16 feature_id;
-	unsigned long driver_data;
-};
-
 /**
  * struct dfl_device - represent an dfl device on dfl bus
  *
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index c425290b21e2..b8dae34eca10 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -846,4 +846,28 @@ struct auxiliary_device_id {
 	kernel_ulong_t driver_data;
 };
 
+/*
+ * DFL (Device Feature List)
+ *
+ * DFL defines a linked list of feature headers within the device MMIO space to
+ * provide an extensible way of adding features. Software can walk through these
+ * predefined data structures to enumerate features. It is now used in the FPGA.
+ * See Documentation/fpga/dfl.rst for more information.
+ *
+ * The dfl bus type is introduced to match the individual feature devices (dfl
+ * devices) for specific dfl drivers.
+ */
+
+/**
+ * struct dfl_device_id -  dfl device identifier
+ * @type: DFL FIU type of the device. See enum dfl_id_type.
+ * @feature_id: feature identifier local to its DFL FIU type.
+ * @driver_data: driver specific data.
+ */
+struct dfl_device_id {
+	__u16 type;
+	__u16 feature_id;
+	kernel_ulong_t driver_data;
+};
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
-- 
cgit v1.2.3


From ecc1641aca658ed4140751748f84985ffb6cce28 Mon Sep 17 00:00:00 2001
From: Xu Yilun <yilun.xu@intel.com>
Date: Wed, 6 Jan 2021 20:37:12 -0800
Subject: fpga: dfl: move dfl bus related APIs to include/linux/dfl.h

Now the dfl drivers could be made as independent modules and put in
different folders according to their functionalities. In order for
scattered dfl device drivers to include dfl bus APIs, move the
dfl bus APIs to a new header file in the public folder.

[mdf@kernel.org: Fixed up header guards to match filename]

Reviewed-by: Tom Rix <trix@redhat.com>
Acked-by: Wu Hao <hao.wu@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
Link: https://lore.kernel.org/r/20210107043714.991646-7-mdf@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS         |  1 +
 drivers/fpga/dfl.c  |  1 +
 drivers/fpga/dfl.h  | 72 --------------------------------------------
 include/linux/dfl.h | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 88 insertions(+), 72 deletions(-)
 create mode 100644 include/linux/dfl.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 546aa66428c9..a044b58eb9ac 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6957,6 +6957,7 @@ S:	Maintained
 F:	Documentation/ABI/testing/sysfs-bus-dfl
 F:	Documentation/fpga/dfl.rst
 F:	drivers/fpga/dfl*
+F:	include/linux/dfl.h
 F:	include/uapi/linux/fpga-dfl.h
 
 FPGA MANAGER FRAMEWORK
diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c
index 5a6ba3b2fa05..511b20ff35a3 100644
--- a/drivers/fpga/dfl.c
+++ b/drivers/fpga/dfl.c
@@ -10,6 +10,7 @@
  *   Wu Hao <hao.wu@intel.com>
  *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
  */
+#include <linux/dfl.h>
 #include <linux/fpga-dfl.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index 549c7900dcfd..2b82c96ba56c 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -517,76 +517,4 @@ long dfl_feature_ioctl_set_irq(struct platform_device *pdev,
 			       struct dfl_feature *feature,
 			       unsigned long arg);
 
-/**
- * enum dfl_id_type - define the DFL FIU types
- */
-enum dfl_id_type {
-	FME_ID = 0,
-	PORT_ID = 1,
-	DFL_ID_MAX,
-};
-
-/**
- * struct dfl_device - represent an dfl device on dfl bus
- *
- * @dev: generic device interface.
- * @id: id of the dfl device.
- * @type: type of DFL FIU of the device. See enum dfl_id_type.
- * @feature_id: feature identifier local to its DFL FIU type.
- * @mmio_res: mmio resource of this dfl device.
- * @irqs: list of Linux IRQ numbers of this dfl device.
- * @num_irqs: number of IRQs supported by this dfl device.
- * @cdev: pointer to DFL FPGA container device this dfl device belongs to.
- * @id_entry: matched id entry in dfl driver's id table.
- */
-struct dfl_device {
-	struct device dev;
-	int id;
-	u16 type;
-	u16 feature_id;
-	struct resource mmio_res;
-	int *irqs;
-	unsigned int num_irqs;
-	struct dfl_fpga_cdev *cdev;
-	const struct dfl_device_id *id_entry;
-};
-
-/**
- * struct dfl_driver - represent an dfl device driver
- *
- * @drv: driver model structure.
- * @id_table: pointer to table of device IDs the driver is interested in.
- *	      { } member terminated.
- * @probe: mandatory callback for device binding.
- * @remove: callback for device unbinding.
- */
-struct dfl_driver {
-	struct device_driver drv;
-	const struct dfl_device_id *id_table;
-
-	int (*probe)(struct dfl_device *dfl_dev);
-	void (*remove)(struct dfl_device *dfl_dev);
-};
-
-#define to_dfl_dev(d) container_of(d, struct dfl_device, dev)
-#define to_dfl_drv(d) container_of(d, struct dfl_driver, drv)
-
-/*
- * use a macro to avoid include chaining to get THIS_MODULE.
- */
-#define dfl_driver_register(drv) \
-	__dfl_driver_register(drv, THIS_MODULE)
-int __dfl_driver_register(struct dfl_driver *dfl_drv, struct module *owner);
-void dfl_driver_unregister(struct dfl_driver *dfl_drv);
-
-/*
- * module_dfl_driver() - Helper macro for drivers that don't do
- * anything special in module init/exit.  This eliminates a lot of
- * boilerplate.  Each module may only use this macro once, and
- * calling it replaces module_init() and module_exit().
- */
-#define module_dfl_driver(__dfl_driver) \
-	module_driver(__dfl_driver, dfl_driver_register, \
-		      dfl_driver_unregister)
-
 #endif /* __FPGA_DFL_H */
diff --git a/include/linux/dfl.h b/include/linux/dfl.h
new file mode 100644
index 000000000000..6cc10982351a
--- /dev/null
+++ b/include/linux/dfl.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Header file for DFL driver and device API
+ *
+ * Copyright (C) 2020 Intel Corporation, Inc.
+ */
+
+#ifndef __LINUX_DFL_H
+#define __LINUX_DFL_H
+
+#include <linux/device.h>
+#include <linux/mod_devicetable.h>
+
+/**
+ * enum dfl_id_type - define the DFL FIU types
+ */
+enum dfl_id_type {
+	FME_ID = 0,
+	PORT_ID = 1,
+	DFL_ID_MAX,
+};
+
+/**
+ * struct dfl_device - represent an dfl device on dfl bus
+ *
+ * @dev: generic device interface.
+ * @id: id of the dfl device.
+ * @type: type of DFL FIU of the device. See enum dfl_id_type.
+ * @feature_id: feature identifier local to its DFL FIU type.
+ * @mmio_res: mmio resource of this dfl device.
+ * @irqs: list of Linux IRQ numbers of this dfl device.
+ * @num_irqs: number of IRQs supported by this dfl device.
+ * @cdev: pointer to DFL FPGA container device this dfl device belongs to.
+ * @id_entry: matched id entry in dfl driver's id table.
+ */
+struct dfl_device {
+	struct device dev;
+	int id;
+	u16 type;
+	u16 feature_id;
+	struct resource mmio_res;
+	int *irqs;
+	unsigned int num_irqs;
+	struct dfl_fpga_cdev *cdev;
+	const struct dfl_device_id *id_entry;
+};
+
+/**
+ * struct dfl_driver - represent an dfl device driver
+ *
+ * @drv: driver model structure.
+ * @id_table: pointer to table of device IDs the driver is interested in.
+ *	      { } member terminated.
+ * @probe: mandatory callback for device binding.
+ * @remove: callback for device unbinding.
+ */
+struct dfl_driver {
+	struct device_driver drv;
+	const struct dfl_device_id *id_table;
+
+	int (*probe)(struct dfl_device *dfl_dev);
+	void (*remove)(struct dfl_device *dfl_dev);
+};
+
+#define to_dfl_dev(d) container_of(d, struct dfl_device, dev)
+#define to_dfl_drv(d) container_of(d, struct dfl_driver, drv)
+
+/*
+ * use a macro to avoid include chaining to get THIS_MODULE.
+ */
+#define dfl_driver_register(drv) \
+	__dfl_driver_register(drv, THIS_MODULE)
+int __dfl_driver_register(struct dfl_driver *dfl_drv, struct module *owner);
+void dfl_driver_unregister(struct dfl_driver *dfl_drv);
+
+/*
+ * module_dfl_driver() - Helper macro for drivers that don't do
+ * anything special in module init/exit.  This eliminates a lot of
+ * boilerplate.  Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit().
+ */
+#define module_dfl_driver(__dfl_driver) \
+	module_driver(__dfl_driver, dfl_driver_register, \
+		      dfl_driver_unregister)
+
+#endif /* __LINUX_DFL_H */
-- 
cgit v1.2.3


From 4b9bbb29baf6c948d24eaeff892815b8a403b64c Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Thu, 17 Dec 2020 19:17:00 -0800
Subject: driver core: Add device link support for INFERRED flag

This flag can never be added to a device link that already exists and
doesn't have the flag set. It can only be added when a device link is
created for the first time or it can be maintained if the device link
already has the it set.

This flag will be used for marking device links created ONLY by
inferring dependencies from data and NOT from explicit action by device
drivers/frameworks.  This will be useful in the future when we need to
deal with cycles in dependencies inferred from firmware.

Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20201218031703.3053753-3-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 15 +++++++++++----
 include/linux/device.h |  2 ++
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index fe8601197b84..5827dbff7f21 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -229,7 +229,8 @@ int device_is_dependent(struct device *dev, void *target)
 		return ret;
 
 	list_for_each_entry(link, &dev->links.consumers, s_node) {
-		if (link->flags == (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED))
+		if ((link->flags & ~DL_FLAG_INFERRED) ==
+		    (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED))
 			continue;
 
 		if (link->consumer == target)
@@ -302,7 +303,8 @@ static int device_reorder_to_tail(struct device *dev, void *not_used)
 
 	device_for_each_child(dev, NULL, device_reorder_to_tail);
 	list_for_each_entry(link, &dev->links.consumers, s_node) {
-		if (link->flags == (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED))
+		if ((link->flags & ~DL_FLAG_INFERRED) ==
+		    (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED))
 			continue;
 		device_reorder_to_tail(link->consumer, NULL);
 	}
@@ -546,7 +548,8 @@ postcore_initcall(devlink_class_init);
 #define DL_MANAGED_LINK_FLAGS (DL_FLAG_AUTOREMOVE_CONSUMER | \
 			       DL_FLAG_AUTOREMOVE_SUPPLIER | \
 			       DL_FLAG_AUTOPROBE_CONSUMER  | \
-			       DL_FLAG_SYNC_STATE_ONLY)
+			       DL_FLAG_SYNC_STATE_ONLY | \
+			       DL_FLAG_INFERRED)
 
 #define DL_ADD_VALID_FLAGS (DL_MANAGED_LINK_FLAGS | DL_FLAG_STATELESS | \
 			    DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE)
@@ -615,7 +618,7 @@ struct device_link *device_link_add(struct device *consumer,
 	if (!consumer || !supplier || flags & ~DL_ADD_VALID_FLAGS ||
 	    (flags & DL_FLAG_STATELESS && flags & DL_MANAGED_LINK_FLAGS) ||
 	    (flags & DL_FLAG_SYNC_STATE_ONLY &&
-	     flags != DL_FLAG_SYNC_STATE_ONLY) ||
+	     (flags & ~DL_FLAG_INFERRED) != DL_FLAG_SYNC_STATE_ONLY) ||
 	    (flags & DL_FLAG_AUTOPROBE_CONSUMER &&
 	     flags & (DL_FLAG_AUTOREMOVE_CONSUMER |
 		      DL_FLAG_AUTOREMOVE_SUPPLIER)))
@@ -671,6 +674,10 @@ struct device_link *device_link_add(struct device *consumer,
 		if (link->consumer != consumer)
 			continue;
 
+		if (link->flags & DL_FLAG_INFERRED &&
+		    !(flags & DL_FLAG_INFERRED))
+			link->flags &= ~DL_FLAG_INFERRED;
+
 		if (flags & DL_FLAG_PM_RUNTIME) {
 			if (!(link->flags & DL_FLAG_PM_RUNTIME)) {
 				pm_runtime_new_link(consumer);
diff --git a/include/linux/device.h b/include/linux/device.h
index 89bb8b84173e..cb5eb2e58c25 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -323,6 +323,7 @@ enum device_link_state {
  * AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds.
  * MANAGED: The core tracks presence of supplier/consumer drivers (internal).
  * SYNC_STATE_ONLY: Link only affects sync_state() behavior.
+ * INFERRED: Inferred from data (eg: firmware) and not from driver actions.
  */
 #define DL_FLAG_STATELESS		BIT(0)
 #define DL_FLAG_AUTOREMOVE_CONSUMER	BIT(1)
@@ -332,6 +333,7 @@ enum device_link_state {
 #define DL_FLAG_AUTOPROBE_CONSUMER	BIT(5)
 #define DL_FLAG_MANAGED			BIT(6)
 #define DL_FLAG_SYNC_STATE_ONLY		BIT(7)
+#define DL_FLAG_INFERRED		BIT(8)
 
 /**
  * enum dl_dev_state - Device driver presence tracking information.
-- 
cgit v1.2.3


From 33cb6d1ed311af2c1dfd107fa334cfb51113ef35 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Mon, 28 Dec 2020 21:30:21 +0100
Subject: dmaengine: at_hdmac: remove platform data header

linux/platform_data/dma-atmel.h is only used by the at_hdmac driver. Move
the CFG bits definitions back in at_hdmac_regs.h and the remaining
definitions in the driver.

Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20201228203022.2674133-1-alexandre.belloni@bootlin.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 MAINTAINERS                             |  1 -
 drivers/dma/at_hdmac.c                  | 19 ++++++++++
 drivers/dma/at_hdmac_regs.h             | 28 +++++++++++++--
 include/linux/platform_data/dma-atmel.h | 61 ---------------------------------
 4 files changed, 44 insertions(+), 65 deletions(-)
 delete mode 100644 include/linux/platform_data/dma-atmel.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 546aa66428c9..0d62310a31f8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11604,7 +11604,6 @@ F:	drivers/dma/at_hdmac.c
 F:	drivers/dma/at_hdmac_regs.h
 F:	drivers/dma/at_xdmac.c
 F:	include/dt-bindings/dma/at91.h
-F:	include/linux/platform_data/dma-atmel.h
 
 MICROCHIP AT91 SERIAL DRIVER
 M:	Richard Genoud <richard.genoud@gmail.com>
diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index 7eaee5b705b1..30ae36124b1d 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -54,6 +54,25 @@ module_param(init_nr_desc_per_channel, uint, 0644);
 MODULE_PARM_DESC(init_nr_desc_per_channel,
 		 "initial descriptors per channel (default: 64)");
 
+/**
+ * struct at_dma_platform_data - Controller configuration parameters
+ * @nr_channels: Number of channels supported by hardware (max 8)
+ * @cap_mask: dma_capability flags supported by the platform
+ */
+struct at_dma_platform_data {
+	unsigned int	nr_channels;
+	dma_cap_mask_t  cap_mask;
+};
+
+/**
+ * struct at_dma_slave - Controller-specific information about a slave
+ * @dma_dev: required DMA master device
+ * @cfg: Platform-specific initializer for the CFG register
+ */
+struct at_dma_slave {
+	struct device		*dma_dev;
+	u32			cfg;
+};
 
 /* prototypes */
 static dma_cookie_t atc_tx_submit(struct dma_async_tx_descriptor *tx);
diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h
index 80fc2fe8c77e..4d1ebc040031 100644
--- a/drivers/dma/at_hdmac_regs.h
+++ b/drivers/dma/at_hdmac_regs.h
@@ -7,8 +7,6 @@
 #ifndef AT_HDMAC_REGS_H
 #define	AT_HDMAC_REGS_H
 
-#include <linux/platform_data/dma-atmel.h>
-
 #define	AT_DMA_MAX_NR_CHANNELS	8
 
 
@@ -148,7 +146,31 @@
 #define	ATC_AUTO		(0x1 << 31)	/* Auto multiple buffer tx enable */
 
 /* Bitfields in CFG */
-/* are in at_hdmac.h */
+#define ATC_PER_MSB(h)	((0x30U & (h)) >> 4)	/* Extract most significant bits of a handshaking identifier */
+
+#define	ATC_SRC_PER(h)		(0xFU & (h))	/* Channel src rq associated with periph handshaking ifc h */
+#define	ATC_DST_PER(h)		((0xFU & (h)) <<  4)	/* Channel dst rq associated with periph handshaking ifc h */
+#define	ATC_SRC_REP		(0x1 <<  8)	/* Source Replay Mod */
+#define	ATC_SRC_H2SEL		(0x1 <<  9)	/* Source Handshaking Mod */
+#define		ATC_SRC_H2SEL_SW	(0x0 <<  9)
+#define		ATC_SRC_H2SEL_HW	(0x1 <<  9)
+#define	ATC_SRC_PER_MSB(h)	(ATC_PER_MSB(h) << 10)	/* Channel src rq (most significant bits) */
+#define	ATC_DST_REP		(0x1 << 12)	/* Destination Replay Mod */
+#define	ATC_DST_H2SEL		(0x1 << 13)	/* Destination Handshaking Mod */
+#define		ATC_DST_H2SEL_SW	(0x0 << 13)
+#define		ATC_DST_H2SEL_HW	(0x1 << 13)
+#define	ATC_DST_PER_MSB(h)	(ATC_PER_MSB(h) << 14)	/* Channel dst rq (most significant bits) */
+#define	ATC_SOD			(0x1 << 16)	/* Stop On Done */
+#define	ATC_LOCK_IF		(0x1 << 20)	/* Interface Lock */
+#define	ATC_LOCK_B		(0x1 << 21)	/* AHB Bus Lock */
+#define	ATC_LOCK_IF_L		(0x1 << 22)	/* Master Interface Arbiter Lock */
+#define		ATC_LOCK_IF_L_CHUNK	(0x0 << 22)
+#define		ATC_LOCK_IF_L_BUFFER	(0x1 << 22)
+#define	ATC_AHB_PROT_MASK	(0x7 << 24)	/* AHB Protection */
+#define	ATC_FIFOCFG_MASK	(0x3 << 28)	/* FIFO Request Configuration */
+#define		ATC_FIFOCFG_LARGESTBURST	(0x0 << 28)
+#define		ATC_FIFOCFG_HALFFIFO		(0x1 << 28)
+#define		ATC_FIFOCFG_ENOUGHSPACE		(0x2 << 28)
 
 /* Bitfields in SPIP */
 #define	ATC_SPIP_HOLE(x)	(0xFFFFU & (x))
diff --git a/include/linux/platform_data/dma-atmel.h b/include/linux/platform_data/dma-atmel.h
deleted file mode 100644
index 069637e6004f..000000000000
--- a/include/linux/platform_data/dma-atmel.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Header file for the Atmel AHB DMA Controller driver
- *
- * Copyright (C) 2008 Atmel Corporation
- */
-#ifndef AT_HDMAC_H
-#define AT_HDMAC_H
-
-#include <linux/dmaengine.h>
-
-/**
- * struct at_dma_platform_data - Controller configuration parameters
- * @nr_channels: Number of channels supported by hardware (max 8)
- * @cap_mask: dma_capability flags supported by the platform
- */
-struct at_dma_platform_data {
-	unsigned int	nr_channels;
-	dma_cap_mask_t  cap_mask;
-};
-
-/**
- * struct at_dma_slave - Controller-specific information about a slave
- * @dma_dev: required DMA master device
- * @cfg: Platform-specific initializer for the CFG register
- */
-struct at_dma_slave {
-	struct device		*dma_dev;
-	u32			cfg;
-};
-
-
-/* Platform-configurable bits in CFG */
-#define ATC_PER_MSB(h)	((0x30U & (h)) >> 4)	/* Extract most significant bits of a handshaking identifier */
-
-#define	ATC_SRC_PER(h)		(0xFU & (h))	/* Channel src rq associated with periph handshaking ifc h */
-#define	ATC_DST_PER(h)		((0xFU & (h)) <<  4)	/* Channel dst rq associated with periph handshaking ifc h */
-#define	ATC_SRC_REP		(0x1 <<  8)	/* Source Replay Mod */
-#define	ATC_SRC_H2SEL		(0x1 <<  9)	/* Source Handshaking Mod */
-#define		ATC_SRC_H2SEL_SW	(0x0 <<  9)
-#define		ATC_SRC_H2SEL_HW	(0x1 <<  9)
-#define	ATC_SRC_PER_MSB(h)	(ATC_PER_MSB(h) << 10)	/* Channel src rq (most significant bits) */
-#define	ATC_DST_REP		(0x1 << 12)	/* Destination Replay Mod */
-#define	ATC_DST_H2SEL		(0x1 << 13)	/* Destination Handshaking Mod */
-#define		ATC_DST_H2SEL_SW	(0x0 << 13)
-#define		ATC_DST_H2SEL_HW	(0x1 << 13)
-#define	ATC_DST_PER_MSB(h)	(ATC_PER_MSB(h) << 14)	/* Channel dst rq (most significant bits) */
-#define	ATC_SOD			(0x1 << 16)	/* Stop On Done */
-#define	ATC_LOCK_IF		(0x1 << 20)	/* Interface Lock */
-#define	ATC_LOCK_B		(0x1 << 21)	/* AHB Bus Lock */
-#define	ATC_LOCK_IF_L		(0x1 << 22)	/* Master Interface Arbiter Lock */
-#define		ATC_LOCK_IF_L_CHUNK	(0x0 << 22)
-#define		ATC_LOCK_IF_L_BUFFER	(0x1 << 22)
-#define	ATC_AHB_PROT_MASK	(0x7 << 24)	/* AHB Protection */
-#define	ATC_FIFOCFG_MASK	(0x3 << 28)	/* FIFO Request Configuration */
-#define		ATC_FIFOCFG_LARGESTBURST	(0x0 << 28)
-#define		ATC_FIFOCFG_HALFFIFO		(0x1 << 28)
-#define		ATC_FIFOCFG_ENOUGHSPACE		(0x2 << 28)
-
-
-#endif /* AT_HDMAC_H */
-- 
cgit v1.2.3


From 398cb92495cc1972014ffa81ff9cea1e5167b8df Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Fri, 13 Nov 2020 09:48:27 +0000
Subject: csky: Fixup perf probe failed

Current perf init will failed with:
[    1.452433] csky-pmu: probe of soc:pmu failed with error -16

This patch fix it up with adding CPUHP_AP_PERF_CSKY_ONLINE in
cpuhotplug.h.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
---
 arch/csky/kernel/perf_event.c | 2 +-
 include/linux/cpuhotplug.h    | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/csky/kernel/perf_event.c b/arch/csky/kernel/perf_event.c
index 1a29f1157449..55d5a5379483 100644
--- a/arch/csky/kernel/perf_event.c
+++ b/arch/csky/kernel/perf_event.c
@@ -1319,7 +1319,7 @@ int csky_pmu_device_probe(struct platform_device *pdev,
 		pr_notice("[perf] PMU request irq fail!\n");
 	}
 
-	ret = cpuhp_setup_state(CPUHP_AP_PERF_ONLINE, "AP_PERF_ONLINE",
+	ret = cpuhp_setup_state(CPUHP_AP_PERF_CSKY_ONLINE, "AP_PERF_ONLINE",
 				csky_pmu_starting_cpu,
 				csky_pmu_dying_cpu);
 	if (ret) {
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 0042ef362511..d3e26dc81494 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -185,6 +185,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
 	CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
 	CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE,
+	CPUHP_AP_PERF_CSKY_ONLINE,
 	CPUHP_AP_WATCHDOG_ONLINE,
 	CPUHP_AP_WORKQUEUE_ONLINE,
 	CPUHP_AP_RCUTREE_ONLINE,
-- 
cgit v1.2.3


From 660343d063f7b7151a8c679d91ebe13cf40ad866 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Wed, 13 Jan 2021 13:49:21 +0200
Subject: dmaengine: Extend the dmaengine_alignment for 128 and 256 bytes

Some DMA device can benefit with higher order of alignment than the maximum
of 64 bytes currently defined.

Define 128 and 256 bytes alignment for these devices.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@gmail.com>
Tested-by: Kishon Vijay Abraham I <kishon@ti.com>
Link: https://lore.kernel.org/r/20210113114923.9231-2-peter.ujfalusi@gmail.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dmaengine.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 68130f5f599e..004736b6a9c8 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -745,6 +745,8 @@ enum dmaengine_alignment {
 	DMAENGINE_ALIGN_16_BYTES = 4,
 	DMAENGINE_ALIGN_32_BYTES = 5,
 	DMAENGINE_ALIGN_64_BYTES = 6,
+	DMAENGINE_ALIGN_128_BYTES = 7,
+	DMAENGINE_ALIGN_256_BYTES = 8,
 };
 
 /**
-- 
cgit v1.2.3


From 9d93a9e8aab3f82b6742dd034a6a81d4025cd82e Mon Sep 17 00:00:00 2001
From: Gabriel Somlo <gsomlo@gmail.com>
Date: Tue, 12 Jan 2021 12:31:40 -0500
Subject: drivers/soc/litex: move generic accessors to litex.h

Move generic LiteX CSR (MMIO) register accessors to litex.h and
declare them as "static inline", in preparation for supporting
32-bit CSR subregisters and 64-bit CPUs.

NOTE: this is a non-functional change.

Signed-off-by: Gabriel Somlo <gsomlo@gmail.com>
Signed-off-by: Stafford Horne <shorne@gmail.com>
---
 drivers/soc/litex/litex_soc_ctrl.c | 73 -------------------------------------
 include/linux/litex.h              | 74 +++++++++++++++++++++++++++++++++++---
 2 files changed, 69 insertions(+), 78 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/litex/litex_soc_ctrl.c b/drivers/soc/litex/litex_soc_ctrl.c
index 1217cafdfd4d..65977526d68e 100644
--- a/drivers/soc/litex/litex_soc_ctrl.c
+++ b/drivers/soc/litex/litex_soc_ctrl.c
@@ -16,79 +16,6 @@
 #include <linux/errno.h>
 #include <linux/io.h>
 
-/*
- * LiteX SoC Generator, depending on the configuration, can split a single
- * logical CSR (Control&Status Register) into a series of consecutive physical
- * registers.
- *
- * For example, in the configuration with 8-bit CSR Bus, 32-bit aligned (the
- * default one for 32-bit CPUs) a 32-bit logical CSR will be generated as four
- * 32-bit physical registers, each one containing one byte of meaningful data.
- *
- * For details see: https://github.com/enjoy-digital/litex/wiki/CSR-Bus
- *
- * The purpose of `litex_set_reg`/`litex_get_reg` is to implement the logic
- * of writing to/reading from the LiteX CSR in a single place that can be
- * then reused by all LiteX drivers.
- */
-
-/**
- * litex_set_reg() - Writes the value to the LiteX CSR (Control&Status Register)
- * @reg: Address of the CSR
- * @reg_size: The width of the CSR expressed in the number of bytes
- * @val: Value to be written to the CSR
- *
- * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned),
- * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers,
- * each one containing one byte of meaningful data.
- *
- * This function splits a single possibly multi-byte write into a series of
- * single-byte writes with a proper offset.
- */
-void litex_set_reg(void __iomem *reg, unsigned long reg_size,
-		    unsigned long val)
-{
-	unsigned long shifted_data, shift, i;
-
-	for (i = 0; i < reg_size; ++i) {
-		shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT);
-		shifted_data = val >> shift;
-
-		WRITE_LITEX_SUBREGISTER(shifted_data, reg, i);
-	}
-}
-EXPORT_SYMBOL_GPL(litex_set_reg);
-
-/**
- * litex_get_reg() - Reads the value of the LiteX CSR (Control&Status Register)
- * @reg: Address of the CSR
- * @reg_size: The width of the CSR expressed in the number of bytes
- *
- * Return: Value read from the CSR
- *
- * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned),
- * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers,
- * each one containing one byte of meaningful data.
- *
- * This function generates a series of single-byte reads with a proper offset
- * and joins their results into a single multi-byte value.
- */
-unsigned long litex_get_reg(void __iomem *reg, unsigned long reg_size)
-{
-	unsigned long shifted_data, shift, i;
-	unsigned long result = 0;
-
-	for (i = 0; i < reg_size; ++i) {
-		shifted_data = READ_LITEX_SUBREGISTER(reg, i);
-
-		shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT);
-		result |= (shifted_data << shift);
-	}
-
-	return result;
-}
-EXPORT_SYMBOL_GPL(litex_get_reg);
-
 #define SCRATCH_REG_OFF         0x04
 #define SCRATCH_REG_VALUE       0x12345678
 #define SCRATCH_TEST_VALUE      0xdeadbeef
diff --git a/include/linux/litex.h b/include/linux/litex.h
index 40f5be503593..67c1a18a7425 100644
--- a/include/linux/litex.h
+++ b/include/linux/litex.h
@@ -3,9 +3,6 @@
  * Common LiteX header providing
  * helper functions for accessing CSRs.
  *
- * Implementation of the functions is provided by
- * the LiteX SoC Controller driver.
- *
  * Copyright (C) 2019-2020 Antmicro <www.antmicro.com>
  */
 
@@ -33,9 +30,76 @@
 #define READ_LITEX_SUBREGISTER(base_offset, subreg_id) \
 	le32_to_cpu((__le32 __force)readl(base_offset + (LITEX_REG_SIZE * subreg_id)))
 
-void litex_set_reg(void __iomem *reg, unsigned long reg_sz, unsigned long val);
+/*
+ * LiteX SoC Generator, depending on the configuration, can split a single
+ * logical CSR (Control&Status Register) into a series of consecutive physical
+ * registers.
+ *
+ * For example, in the configuration with 8-bit CSR Bus, 32-bit aligned (the
+ * default one for 32-bit CPUs) a 32-bit logical CSR will be generated as four
+ * 32-bit physical registers, each one containing one byte of meaningful data.
+ *
+ * For details see: https://github.com/enjoy-digital/litex/wiki/CSR-Bus
+ *
+ * The purpose of `litex_set_reg`/`litex_get_reg` is to implement the logic
+ * of writing to/reading from the LiteX CSR in a single place that can be
+ * then reused by all LiteX drivers.
+ */
+
+/**
+ * litex_set_reg() - Writes the value to the LiteX CSR (Control&Status Register)
+ * @reg: Address of the CSR
+ * @reg_size: The width of the CSR expressed in the number of bytes
+ * @val: Value to be written to the CSR
+ *
+ * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned),
+ * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers,
+ * each one containing one byte of meaningful data.
+ *
+ * This function splits a single possibly multi-byte write into a series of
+ * single-byte writes with a proper offset.
+ */
+static inline void litex_set_reg(void __iomem *reg, ulong reg_size, ulong val)
+{
+	ulong shifted_data, shift, i;
+
+	for (i = 0; i < reg_size; ++i) {
+		shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT);
+		shifted_data = val >> shift;
+
+		WRITE_LITEX_SUBREGISTER(shifted_data, reg, i);
+	}
+}
+
+/**
+ * litex_get_reg() - Reads the value of the LiteX CSR (Control&Status Register)
+ * @reg: Address of the CSR
+ * @reg_size: The width of the CSR expressed in the number of bytes
+ *
+ * Return: Value read from the CSR
+ *
+ * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned),
+ * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers,
+ * each one containing one byte of meaningful data.
+ *
+ * This function generates a series of single-byte reads with a proper offset
+ * and joins their results into a single multi-byte value.
+ */
+static inline ulong litex_get_reg(void __iomem *reg, ulong reg_size)
+{
+	ulong shifted_data, shift, i;
+	ulong result = 0;
+
+	for (i = 0; i < reg_size; ++i) {
+		shifted_data = READ_LITEX_SUBREGISTER(reg, i);
+
+		shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT);
+		result |= (shifted_data << shift);
+	}
+
+	return result;
+}
 
-unsigned long litex_get_reg(void __iomem *reg, unsigned long reg_sz);
 
 static inline void litex_write8(void __iomem *reg, u8 val)
 {
-- 
cgit v1.2.3


From b5d3061ea2e691ab1fa6465fce3c59d9d10357de Mon Sep 17 00:00:00 2001
From: Gabriel Somlo <gsomlo@gmail.com>
Date: Tue, 12 Jan 2021 12:31:41 -0500
Subject: drivers/soc/litex: separate MMIO from subregister offset calculation

Separate MMIO (read/write) access into _[read|write]_litex_subregister()
static inline functions, leaving existing "READ|WRITE" macros to handle
calculation of the subregister offset only.

NOTE: this is a non-functional change.

Signed-off-by: Gabriel Somlo <gsomlo@gmail.com>
Signed-off-by: Stafford Horne <shorne@gmail.com>
---
 include/linux/litex.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/litex.h b/include/linux/litex.h
index 67c1a18a7425..918bab45243c 100644
--- a/include/linux/litex.h
+++ b/include/linux/litex.h
@@ -24,11 +24,23 @@
 #define LITEX_SUBREG_SIZE	0x1
 #define LITEX_SUBREG_SIZE_BIT	 (LITEX_SUBREG_SIZE * 8)
 
+static inline void _write_litex_subregister(u32 val, void __iomem *addr)
+{
+	writel((u32 __force)cpu_to_le32(val), addr);
+}
+
+static inline u32 _read_litex_subregister(void __iomem *addr)
+{
+	return le32_to_cpu((__le32 __force)readl(addr));
+}
+
 #define WRITE_LITEX_SUBREGISTER(val, base_offset, subreg_id) \
-	writel((u32 __force)cpu_to_le32(val), base_offset + (LITEX_REG_SIZE * subreg_id))
+	_write_litex_subregister(val, (base_offset) + \
+					LITEX_REG_SIZE * (subreg_id))
 
 #define READ_LITEX_SUBREGISTER(base_offset, subreg_id) \
-	le32_to_cpu((__le32 __force)readl(base_offset + (LITEX_REG_SIZE * subreg_id)))
+	_read_litex_subregister((base_offset) + \
+					LITEX_REG_SIZE * (subreg_id))
 
 /*
  * LiteX SoC Generator, depending on the configuration, can split a single
-- 
cgit v1.2.3


From ffa4ebc48971abffed722b75887ac1d8c9256b41 Mon Sep 17 00:00:00 2001
From: Gabriel Somlo <gsomlo@gmail.com>
Date: Tue, 12 Jan 2021 12:31:42 -0500
Subject: drivers/soc/litex: s/LITEX_REG_SIZE/LITEX_SUBREG_ALIGN/g

The constant LITEX_REG_SIZE is renamed to the more descriptive
LITEX_SUBREG_ALIGN (LiteX CSR subregisters are located at 32-bit
aligned MMIO addresses).

NOTE: this is a non-functional change.

Signed-off-by: Gabriel Somlo <gsomlo@gmail.com>
Signed-off-by: Stafford Horne <shorne@gmail.com>
---
 include/linux/litex.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/litex.h b/include/linux/litex.h
index 918bab45243c..c63a7e1a337c 100644
--- a/include/linux/litex.h
+++ b/include/linux/litex.h
@@ -20,10 +20,12 @@
  * Supporting other configurations will require extending the logic in this
  * header and in the LiteX SoC controller driver.
  */
-#define LITEX_REG_SIZE	  0x4
 #define LITEX_SUBREG_SIZE	0x1
 #define LITEX_SUBREG_SIZE_BIT	 (LITEX_SUBREG_SIZE * 8)
 
+/* LiteX subregisters of any width are always aligned on a 4-byte boundary */
+#define LITEX_SUBREG_ALIGN	  0x4
+
 static inline void _write_litex_subregister(u32 val, void __iomem *addr)
 {
 	writel((u32 __force)cpu_to_le32(val), addr);
@@ -36,11 +38,11 @@ static inline u32 _read_litex_subregister(void __iomem *addr)
 
 #define WRITE_LITEX_SUBREGISTER(val, base_offset, subreg_id) \
 	_write_litex_subregister(val, (base_offset) + \
-					LITEX_REG_SIZE * (subreg_id))
+					LITEX_SUBREG_ALIGN * (subreg_id))
 
 #define READ_LITEX_SUBREGISTER(base_offset, subreg_id) \
 	_read_litex_subregister((base_offset) + \
-					LITEX_REG_SIZE * (subreg_id))
+					LITEX_SUBREG_ALIGN * (subreg_id))
 
 /*
  * LiteX SoC Generator, depending on the configuration, can split a single
-- 
cgit v1.2.3


From 51f109228308a87c7f2583360e54acfc567203da Mon Sep 17 00:00:00 2001
From: Gabriel Somlo <gsomlo@gmail.com>
Date: Tue, 12 Jan 2021 12:31:43 -0500
Subject: drivers/soc/litex: support 32-bit subregisters, 64-bit CPUs

Upstream LiteX now defaults to using 32-bit CSR subregisters
(see https://github.com/enjoy-digital/litex/commit/a2b71fde).

This patch expands on commit 22447a99c97e ("drivers/soc/litex: add
LiteX SoC Controller driver"), adding support for handling both 8-
and 32-bit LiteX CSR (MMIO) subregisters, as determined by the
LITEX_SUBREG_SIZE Kconfig option.

NOTE that while LITEX_SUBREG_SIZE could theoretically be a device
tree property, defining it as a compile-time constant allows for
much better optimization of the resulting code. This is further
supported by the low expected usefulness of deploying the same
kernel across LiteX SoCs built with different CSR-Bus data widths.

Finally, the litex_[read|write][8|16|32|64]() accessors are
redefined in terms of litex_[get|set]_reg(), which, after compiler
optimization, will result in code as efficient as hardcoded shifts,
but with the added benefit of automatically matching the appropriate
LITEX_SUBREG_SIZE.

NOTE that litex_[get|set]_reg() nominally operate on 64-bit data,
but that will also be optimized by the compiler in situations where
narrower data is used from a call site.

Signed-off-by: Gabriel Somlo <gsomlo@gmail.com>
Signed-off-by: Stafford Horne <shorne@gmail.com>
---
 drivers/soc/litex/Kconfig          |  12 ++++
 drivers/soc/litex/litex_soc_ctrl.c |   3 +-
 include/linux/litex.h              | 137 +++++++++++++++----------------------
 3 files changed, 68 insertions(+), 84 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/litex/Kconfig b/drivers/soc/litex/Kconfig
index 7c6b009b6f6c..973f8d2fe1a7 100644
--- a/drivers/soc/litex/Kconfig
+++ b/drivers/soc/litex/Kconfig
@@ -16,4 +16,16 @@ config LITEX_SOC_CONTROLLER
 	  All drivers that use functions from litex.h must depend on
 	  LITEX.
 
+config LITEX_SUBREG_SIZE
+	int "Size of a LiteX CSR subregister, in bytes"
+	depends on LITEX
+	range 1 4
+	default 4
+	help
+	LiteX MMIO registers (referred to as Configuration and Status
+	registers, or CSRs) are spread across adjacent 8- or 32-bit
+	subregisters, located at 32-bit aligned MMIO addresses. Use
+	this to select the appropriate size (1 or 4 bytes) matching
+	your particular LiteX build.
+
 endmenu
diff --git a/drivers/soc/litex/litex_soc_ctrl.c b/drivers/soc/litex/litex_soc_ctrl.c
index 65977526d68e..da17ba56b795 100644
--- a/drivers/soc/litex/litex_soc_ctrl.c
+++ b/drivers/soc/litex/litex_soc_ctrl.c
@@ -58,7 +58,8 @@ static int litex_check_csr_access(void __iomem *reg_addr)
 	/* restore original value of the SCRATCH register */
 	litex_write32(reg_addr + SCRATCH_REG_OFF, SCRATCH_REG_VALUE);
 
-	pr_info("LiteX SoC Controller driver initialized");
+	pr_info("LiteX SoC Controller driver initialized: subreg:%d, align:%d",
+		LITEX_SUBREG_SIZE, LITEX_SUBREG_ALIGN);
 
 	return 0;
 }
diff --git a/include/linux/litex.h b/include/linux/litex.h
index c63a7e1a337c..3456d527f644 100644
--- a/include/linux/litex.h
+++ b/include/linux/litex.h
@@ -10,17 +10,14 @@
 #define _LINUX_LITEX_H
 
 #include <linux/io.h>
-#include <linux/types.h>
-#include <linux/compiler_types.h>
 
-/*
- * The parameters below are true for LiteX SoCs configured for 8-bit CSR Bus,
- * 32-bit aligned.
- *
- * Supporting other configurations will require extending the logic in this
- * header and in the LiteX SoC controller driver.
- */
-#define LITEX_SUBREG_SIZE	0x1
+/* LiteX SoCs support 8- or 32-bit CSR Bus data width (i.e., subreg. size) */
+#if defined(CONFIG_LITEX_SUBREG_SIZE) && \
+	(CONFIG_LITEX_SUBREG_SIZE == 1 || CONFIG_LITEX_SUBREG_SIZE == 4)
+#define LITEX_SUBREG_SIZE      CONFIG_LITEX_SUBREG_SIZE
+#else
+#error LiteX subregister size (LITEX_SUBREG_SIZE) must be 4 or 1!
+#endif
 #define LITEX_SUBREG_SIZE_BIT	 (LITEX_SUBREG_SIZE * 8)
 
 /* LiteX subregisters of any width are always aligned on a 4-byte boundary */
@@ -36,25 +33,32 @@ static inline u32 _read_litex_subregister(void __iomem *addr)
 	return le32_to_cpu((__le32 __force)readl(addr));
 }
 
-#define WRITE_LITEX_SUBREGISTER(val, base_offset, subreg_id) \
-	_write_litex_subregister(val, (base_offset) + \
-					LITEX_SUBREG_ALIGN * (subreg_id))
-
-#define READ_LITEX_SUBREGISTER(base_offset, subreg_id) \
-	_read_litex_subregister((base_offset) + \
-					LITEX_SUBREG_ALIGN * (subreg_id))
-
 /*
  * LiteX SoC Generator, depending on the configuration, can split a single
  * logical CSR (Control&Status Register) into a series of consecutive physical
  * registers.
  *
- * For example, in the configuration with 8-bit CSR Bus, 32-bit aligned (the
- * default one for 32-bit CPUs) a 32-bit logical CSR will be generated as four
- * 32-bit physical registers, each one containing one byte of meaningful data.
+ * For example, in the configuration with 8-bit CSR Bus, a 32-bit aligned,
+ * 32-bit wide logical CSR will be laid out as four 32-bit physical
+ * subregisters, each one containing one byte of meaningful data.
  *
  * For details see: https://github.com/enjoy-digital/litex/wiki/CSR-Bus
- *
+ */
+
+/* number of LiteX subregisters needed to store a register of given reg_size */
+#define _litex_num_subregs(reg_size) \
+	(((reg_size) - 1) / LITEX_SUBREG_SIZE + 1)
+
+/*
+ * since the number of 4-byte aligned subregisters required to store a single
+ * LiteX CSR (MMIO) register varies with LITEX_SUBREG_SIZE, the offset of the
+ * next adjacent LiteX CSR register w.r.t. the offset of the current one also
+ * depends on how many subregisters the latter is spread across
+ */
+#define _next_reg_off(off, size) \
+	((off) + _litex_num_subregs(size) * LITEX_SUBREG_ALIGN)
+
+/*
  * The purpose of `litex_set_reg`/`litex_get_reg` is to implement the logic
  * of writing to/reading from the LiteX CSR in a single place that can be
  * then reused by all LiteX drivers.
@@ -66,22 +70,17 @@ static inline u32 _read_litex_subregister(void __iomem *addr)
  * @reg_size: The width of the CSR expressed in the number of bytes
  * @val: Value to be written to the CSR
  *
- * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned),
- * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers,
- * each one containing one byte of meaningful data.
- *
- * This function splits a single possibly multi-byte write into a series of
- * single-byte writes with a proper offset.
+ * This function splits a single (possibly multi-byte) LiteX CSR write into
+ * a series of subregister writes with a proper offset.
  */
-static inline void litex_set_reg(void __iomem *reg, ulong reg_size, ulong val)
+static inline void litex_set_reg(void __iomem *reg, size_t reg_size, u64 val)
 {
-	ulong shifted_data, shift, i;
-
-	for (i = 0; i < reg_size; ++i) {
-		shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT);
-		shifted_data = val >> shift;
+	u8 shift = _litex_num_subregs(reg_size) * LITEX_SUBREG_SIZE_BIT;
 
-		WRITE_LITEX_SUBREGISTER(shifted_data, reg, i);
+	while (shift > 0) {
+		shift -= LITEX_SUBREG_SIZE_BIT;
+		_write_litex_subregister(val >> shift, reg);
+		reg += LITEX_SUBREG_ALIGN;
 	}
 }
 
@@ -92,89 +91,61 @@ static inline void litex_set_reg(void __iomem *reg, ulong reg_size, ulong val)
  *
  * Return: Value read from the CSR
  *
- * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned),
- * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers,
- * each one containing one byte of meaningful data.
- *
- * This function generates a series of single-byte reads with a proper offset
- * and joins their results into a single multi-byte value.
+ * This function generates a series of subregister reads with a proper offset
+ * and joins their results into a single (possibly multi-byte) LiteX CSR value.
  */
-static inline ulong litex_get_reg(void __iomem *reg, ulong reg_size)
+static inline u64 litex_get_reg(void __iomem *reg, size_t reg_size)
 {
-	ulong shifted_data, shift, i;
-	ulong result = 0;
-
-	for (i = 0; i < reg_size; ++i) {
-		shifted_data = READ_LITEX_SUBREGISTER(reg, i);
-
-		shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT);
-		result |= (shifted_data << shift);
+	u64 r;
+	u8 i;
+
+	r = _read_litex_subregister(reg);
+	for (i = 1; i < _litex_num_subregs(reg_size); i++) {
+		r <<= LITEX_SUBREG_SIZE_BIT;
+		reg += LITEX_SUBREG_ALIGN;
+		r |= _read_litex_subregister(reg);
 	}
-
-	return result;
+	return r;
 }
 
-
 static inline void litex_write8(void __iomem *reg, u8 val)
 {
-	WRITE_LITEX_SUBREGISTER(val, reg, 0);
+	litex_set_reg(reg, sizeof(u8), val);
 }
 
 static inline void litex_write16(void __iomem *reg, u16 val)
 {
-	WRITE_LITEX_SUBREGISTER(val >> 8, reg, 0);
-	WRITE_LITEX_SUBREGISTER(val, reg, 1);
+	litex_set_reg(reg, sizeof(u16), val);
 }
 
 static inline void litex_write32(void __iomem *reg, u32 val)
 {
-	WRITE_LITEX_SUBREGISTER(val >> 24, reg, 0);
-	WRITE_LITEX_SUBREGISTER(val >> 16, reg, 1);
-	WRITE_LITEX_SUBREGISTER(val >> 8, reg, 2);
-	WRITE_LITEX_SUBREGISTER(val, reg, 3);
+	litex_set_reg(reg, sizeof(u32), val);
 }
 
 static inline void litex_write64(void __iomem *reg, u64 val)
 {
-	WRITE_LITEX_SUBREGISTER(val >> 56, reg, 0);
-	WRITE_LITEX_SUBREGISTER(val >> 48, reg, 1);
-	WRITE_LITEX_SUBREGISTER(val >> 40, reg, 2);
-	WRITE_LITEX_SUBREGISTER(val >> 32, reg, 3);
-	WRITE_LITEX_SUBREGISTER(val >> 24, reg, 4);
-	WRITE_LITEX_SUBREGISTER(val >> 16, reg, 5);
-	WRITE_LITEX_SUBREGISTER(val >> 8, reg, 6);
-	WRITE_LITEX_SUBREGISTER(val, reg, 7);
+	litex_set_reg(reg, sizeof(u64), val);
 }
 
 static inline u8 litex_read8(void __iomem *reg)
 {
-	return READ_LITEX_SUBREGISTER(reg, 0);
+	return litex_get_reg(reg, sizeof(u8));
 }
 
 static inline u16 litex_read16(void __iomem *reg)
 {
-	return (READ_LITEX_SUBREGISTER(reg, 0) << 8)
-		| (READ_LITEX_SUBREGISTER(reg, 1));
+	return litex_get_reg(reg, sizeof(u16));
 }
 
 static inline u32 litex_read32(void __iomem *reg)
 {
-	return (READ_LITEX_SUBREGISTER(reg, 0) << 24)
-		| (READ_LITEX_SUBREGISTER(reg, 1) << 16)
-		| (READ_LITEX_SUBREGISTER(reg, 2) << 8)
-		| (READ_LITEX_SUBREGISTER(reg, 3));
+	return litex_get_reg(reg, sizeof(u32));
 }
 
 static inline u64 litex_read64(void __iomem *reg)
 {
-	return ((u64)READ_LITEX_SUBREGISTER(reg, 0) << 56)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 1) << 48)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 2) << 40)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 3) << 32)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 4) << 24)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 5) << 16)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 6) << 8)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 7));
+	return litex_get_reg(reg, sizeof(u64));
 }
 
 #endif /* _LINUX_LITEX_H */
-- 
cgit v1.2.3


From 4f70d150294b3ddfbe4be7130ca53898cd5b91be Mon Sep 17 00:00:00 2001
From: Gabriel Somlo <gsomlo@gmail.com>
Date: Tue, 12 Jan 2021 12:31:44 -0500
Subject: drivers/soc/litex: make 'litex_[set|get]_reg()' methods private

The 'litex_[set|get]_reg()' methods use the 'reg_size' parameter to
specify the width of the LiteX CSR (MMIO) register being accessed.

Since 'u64' is the widest data being supported, the value of 'reg_size'
MUST be between 1 and sizeof(u64), which SHOULD be checked at runtime
if these methods are publicly available for use by other LiteX device
drivers.

At the same time, none of the existing (or foreseeable) LiteX device
drivers have a need to access registers whose size is unknown during
compilation. As such, all LiteX device drivers should use fixed-width
accessor methods such as 'litex_[write|read][8|16|32|64]()'.

This patch renames 'litex_[set|get]_reg()' to '_litex_[set|get]_reg()',
indicating that they should NOT be directly called from outside of
the 'include/linux/litex.h' header file.

Signed-off-by: Gabriel Somlo <gsomlo@gmail.com>
Signed-off-by: Stafford Horne <shorne@gmail.com>
---
 drivers/soc/litex/Kconfig |  2 +-
 include/linux/litex.h     | 35 ++++++++++++++++++++---------------
 2 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/litex/Kconfig b/drivers/soc/litex/Kconfig
index 973f8d2fe1a7..b9b3d51ea7df 100644
--- a/drivers/soc/litex/Kconfig
+++ b/drivers/soc/litex/Kconfig
@@ -11,7 +11,7 @@ config LITEX_SOC_CONTROLLER
 	select LITEX
 	help
 	  This option enables the SoC Controller Driver which verifies
-	  LiteX CSR access and provides common litex_get_reg/litex_set_reg
+	  LiteX CSR access and provides common litex_[read|write]*
 	  accessors.
 	  All drivers that use functions from litex.h must depend on
 	  LITEX.
diff --git a/include/linux/litex.h b/include/linux/litex.h
index 3456d527f644..5ea9ccf5cce4 100644
--- a/include/linux/litex.h
+++ b/include/linux/litex.h
@@ -59,21 +59,25 @@ static inline u32 _read_litex_subregister(void __iomem *addr)
 	((off) + _litex_num_subregs(size) * LITEX_SUBREG_ALIGN)
 
 /*
- * The purpose of `litex_set_reg`/`litex_get_reg` is to implement the logic
- * of writing to/reading from the LiteX CSR in a single place that can be
- * then reused by all LiteX drivers.
+ * The purpose of `_litex_[set|get]_reg()` is to implement the logic of
+ * writing to/reading from the LiteX CSR in a single place that can be then
+ * reused by all LiteX drivers via the `litex_[write|read][8|16|32|64]()`
+ * accessors for the appropriate data width.
+ * NOTE: direct use of `_litex_[set|get]_reg()` by LiteX drivers is strongly
+ * discouraged, as they perform no error checking on the requested data width!
  */
 
 /**
- * litex_set_reg() - Writes the value to the LiteX CSR (Control&Status Register)
+ * _litex_set_reg() - Writes a value to the LiteX CSR (Control&Status Register)
  * @reg: Address of the CSR
  * @reg_size: The width of the CSR expressed in the number of bytes
  * @val: Value to be written to the CSR
  *
  * This function splits a single (possibly multi-byte) LiteX CSR write into
  * a series of subregister writes with a proper offset.
+ * NOTE: caller is responsible for ensuring (0 < reg_size <= sizeof(u64)).
  */
-static inline void litex_set_reg(void __iomem *reg, size_t reg_size, u64 val)
+static inline void _litex_set_reg(void __iomem *reg, size_t reg_size, u64 val)
 {
 	u8 shift = _litex_num_subregs(reg_size) * LITEX_SUBREG_SIZE_BIT;
 
@@ -85,7 +89,7 @@ static inline void litex_set_reg(void __iomem *reg, size_t reg_size, u64 val)
 }
 
 /**
- * litex_get_reg() - Reads the value of the LiteX CSR (Control&Status Register)
+ * _litex_get_reg() - Reads a value of the LiteX CSR (Control&Status Register)
  * @reg: Address of the CSR
  * @reg_size: The width of the CSR expressed in the number of bytes
  *
@@ -93,8 +97,9 @@ static inline void litex_set_reg(void __iomem *reg, size_t reg_size, u64 val)
  *
  * This function generates a series of subregister reads with a proper offset
  * and joins their results into a single (possibly multi-byte) LiteX CSR value.
+ * NOTE: caller is responsible for ensuring (0 < reg_size <= sizeof(u64)).
  */
-static inline u64 litex_get_reg(void __iomem *reg, size_t reg_size)
+static inline u64 _litex_get_reg(void __iomem *reg, size_t reg_size)
 {
 	u64 r;
 	u8 i;
@@ -110,42 +115,42 @@ static inline u64 litex_get_reg(void __iomem *reg, size_t reg_size)
 
 static inline void litex_write8(void __iomem *reg, u8 val)
 {
-	litex_set_reg(reg, sizeof(u8), val);
+	_litex_set_reg(reg, sizeof(u8), val);
 }
 
 static inline void litex_write16(void __iomem *reg, u16 val)
 {
-	litex_set_reg(reg, sizeof(u16), val);
+	_litex_set_reg(reg, sizeof(u16), val);
 }
 
 static inline void litex_write32(void __iomem *reg, u32 val)
 {
-	litex_set_reg(reg, sizeof(u32), val);
+	_litex_set_reg(reg, sizeof(u32), val);
 }
 
 static inline void litex_write64(void __iomem *reg, u64 val)
 {
-	litex_set_reg(reg, sizeof(u64), val);
+	_litex_set_reg(reg, sizeof(u64), val);
 }
 
 static inline u8 litex_read8(void __iomem *reg)
 {
-	return litex_get_reg(reg, sizeof(u8));
+	return _litex_get_reg(reg, sizeof(u8));
 }
 
 static inline u16 litex_read16(void __iomem *reg)
 {
-	return litex_get_reg(reg, sizeof(u16));
+	return _litex_get_reg(reg, sizeof(u16));
 }
 
 static inline u32 litex_read32(void __iomem *reg)
 {
-	return litex_get_reg(reg, sizeof(u32));
+	return _litex_get_reg(reg, sizeof(u32));
 }
 
 static inline u64 litex_read64(void __iomem *reg)
 {
-	return litex_get_reg(reg, sizeof(u64));
+	return _litex_get_reg(reg, sizeof(u64));
 }
 
 #endif /* _LINUX_LITEX_H */
-- 
cgit v1.2.3


From dc5723b02e523b2c4a68667f7e28c65018f7202f Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Fri, 11 Dec 2020 10:46:19 -0800
Subject: kbuild: add support for Clang LTO

This change adds build system support for Clang's Link Time
Optimization (LTO). With -flto, instead of ELF object files, Clang
produces LLVM bitcode, which is compiled into native code at link
time, allowing the final binary to be optimized globally. For more
details, see:

  https://llvm.org/docs/LinkTimeOptimization.html

The Kconfig option CONFIG_LTO_CLANG is implemented as a choice,
which defaults to LTO being disabled. To use LTO, the architecture
must select ARCH_SUPPORTS_LTO_CLANG and support:

  - compiling with Clang,
  - compiling all assembly code with Clang's integrated assembler,
  - and linking with LLD.

While using CONFIG_LTO_CLANG_FULL results in the best runtime
performance, the compilation is not scalable in time or
memory. CONFIG_LTO_CLANG_THIN enables ThinLTO, which allows
parallel optimization and faster incremental builds. ThinLTO is
used by default if the architecture also selects
ARCH_SUPPORTS_LTO_CLANG_THIN:

  https://clang.llvm.org/docs/ThinLTO.html

To enable LTO, LLVM tools must be used to handle bitcode files, by
passing LLVM=1 and LLVM_IAS=1 options to make:

  $ make LLVM=1 LLVM_IAS=1 defconfig
  $ scripts/config -e LTO_CLANG_THIN
  $ make LLVM=1 LLVM_IAS=1

To prepare for LTO support with other compilers, common parts are
gated behind the CONFIG_LTO option, and LTO can be disabled for
specific files by filtering out CC_FLAGS_LTO.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20201211184633.3213045-3-samitolvanen@google.com
---
 Makefile                          | 19 +++++++-
 arch/Kconfig                      | 91 +++++++++++++++++++++++++++++++++++++++
 include/asm-generic/vmlinux.lds.h | 11 +++--
 scripts/Makefile.build            |  9 +++-
 scripts/Makefile.modfinal         |  9 +++-
 scripts/Makefile.modpost          | 21 ++++++++-
 scripts/link-vmlinux.sh           | 32 ++++++++++----
 7 files changed, 174 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/Makefile b/Makefile
index b434c342080b..2b9bcecd6ea1 100644
--- a/Makefile
+++ b/Makefile
@@ -893,6 +893,21 @@ KBUILD_CFLAGS	+= $(CC_FLAGS_SCS)
 export CC_FLAGS_SCS
 endif
 
+ifdef CONFIG_LTO_CLANG
+ifdef CONFIG_LTO_CLANG_THIN
+CC_FLAGS_LTO	+= -flto=thin -fsplit-lto-unit
+KBUILD_LDFLAGS	+= --thinlto-cache-dir=$(extmod-prefix).thinlto-cache
+else
+CC_FLAGS_LTO	+= -flto
+endif
+CC_FLAGS_LTO	+= -fvisibility=hidden
+endif
+
+ifdef CONFIG_LTO
+KBUILD_CFLAGS	+= $(CC_FLAGS_LTO)
+export CC_FLAGS_LTO
+endif
+
 ifdef CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_32B
 KBUILD_CFLAGS += -falign-functions=32
 endif
@@ -1479,7 +1494,7 @@ MRPROPER_FILES += include/config include/generated          \
 		  *.spec
 
 # Directories & files removed with 'make distclean'
-DISTCLEAN_FILES += tags TAGS cscope* GPATH GTAGS GRTAGS GSYMS
+DISTCLEAN_FILES += tags TAGS cscope* GPATH GTAGS GRTAGS GSYMS .thinlto-cache
 
 # clean - Delete most, but leave enough to build external modules
 #
@@ -1725,7 +1740,7 @@ PHONY += compile_commands.json
 
 clean-dirs := $(KBUILD_EXTMOD)
 clean: rm-files := $(KBUILD_EXTMOD)/Module.symvers $(KBUILD_EXTMOD)/modules.nsdeps \
-	$(KBUILD_EXTMOD)/compile_commands.json
+	$(KBUILD_EXTMOD)/compile_commands.json $(KBUILD_EXTMOD)/.thinlto-cache
 
 PHONY += help
 help:
diff --git a/arch/Kconfig b/arch/Kconfig
index 78c6f05b10f9..9494e3931f4b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -631,6 +631,97 @@ config SHADOW_CALL_STACK
 	  reading and writing arbitrary memory may be able to locate them
 	  and hijack control flow by modifying the stacks.
 
+config LTO
+	bool
+	help
+	  Selected if the kernel will be built using the compiler's LTO feature.
+
+config LTO_CLANG
+	bool
+	select LTO
+	help
+	  Selected if the kernel will be built using Clang's LTO feature.
+
+config ARCH_SUPPORTS_LTO_CLANG
+	bool
+	help
+	  An architecture should select this option if it supports:
+	  - compiling with Clang,
+	  - compiling inline assembly with Clang's integrated assembler,
+	  - and linking with LLD.
+
+config ARCH_SUPPORTS_LTO_CLANG_THIN
+	bool
+	help
+	  An architecture should select this option if it can support Clang's
+	  ThinLTO mode.
+
+config HAS_LTO_CLANG
+	def_bool y
+	# Clang >= 11: https://github.com/ClangBuiltLinux/linux/issues/510
+	depends on CC_IS_CLANG && CLANG_VERSION >= 110000 && LD_IS_LLD
+	depends on $(success,test $(LLVM) -eq 1)
+	depends on $(success,test $(LLVM_IAS) -eq 1)
+	depends on $(success,$(NM) --help | head -n 1 | grep -qi llvm)
+	depends on $(success,$(AR) --help | head -n 1 | grep -qi llvm)
+	depends on ARCH_SUPPORTS_LTO_CLANG
+	depends on !FTRACE_MCOUNT_USE_RECORDMCOUNT
+	depends on !KASAN
+	depends on !GCOV_KERNEL
+	depends on !MODVERSIONS
+	help
+	  The compiler and Kconfig options support building with Clang's
+	  LTO.
+
+choice
+	prompt "Link Time Optimization (LTO)"
+	default LTO_NONE
+	help
+	  This option enables Link Time Optimization (LTO), which allows the
+	  compiler to optimize binaries globally.
+
+	  If unsure, select LTO_NONE. Note that LTO is very resource-intensive
+	  so it's disabled by default.
+
+config LTO_NONE
+	bool "None"
+	help
+	  Build the kernel normally, without Link Time Optimization (LTO).
+
+config LTO_CLANG_FULL
+	bool "Clang Full LTO (EXPERIMENTAL)"
+	depends on HAS_LTO_CLANG
+	depends on !COMPILE_TEST
+	select LTO_CLANG
+	help
+          This option enables Clang's full Link Time Optimization (LTO), which
+          allows the compiler to optimize the kernel globally. If you enable
+          this option, the compiler generates LLVM bitcode instead of ELF
+          object files, and the actual compilation from bitcode happens at
+          the LTO link step, which may take several minutes depending on the
+          kernel configuration. More information can be found from LLVM's
+          documentation:
+
+	    https://llvm.org/docs/LinkTimeOptimization.html
+
+	  During link time, this option can use a large amount of RAM, and
+	  may take much longer than the ThinLTO option.
+
+config LTO_CLANG_THIN
+	bool "Clang ThinLTO (EXPERIMENTAL)"
+	depends on HAS_LTO_CLANG && ARCH_SUPPORTS_LTO_CLANG_THIN
+	select LTO_CLANG
+	help
+	  This option enables Clang's ThinLTO, which allows for parallel
+	  optimization and faster incremental compiles compared to the
+	  CONFIG_LTO_CLANG_FULL option. More information can be found
+	  from Clang's documentation:
+
+	    https://clang.llvm.org/docs/ThinLTO.html
+
+	  If unsure, say Y.
+endchoice
+
 config HAVE_ARCH_WITHIN_STACK_FRAMES
 	bool
 	help
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index b2b3d81b1535..8988a2e445d8 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -90,15 +90,18 @@
  * .data. We don't want to pull in .data..other sections, which Linux
  * has defined. Same for text and bss.
  *
+ * With LTO_CLANG, the linker also splits sections by default, so we need
+ * these macros to combine the sections during the final link.
+ *
  * RODATA_MAIN is not used because existing code already defines .rodata.x
  * sections to be brought in with rodata.
  */
-#ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
+#if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG)
 #define TEXT_MAIN .text .text.[0-9a-zA-Z_]*
-#define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data..LPBX*
+#define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data..L* .data..compoundliteral*
 #define SDATA_MAIN .sdata .sdata.[0-9a-zA-Z_]*
-#define RODATA_MAIN .rodata .rodata.[0-9a-zA-Z_]*
-#define BSS_MAIN .bss .bss.[0-9a-zA-Z_]*
+#define RODATA_MAIN .rodata .rodata.[0-9a-zA-Z_]* .rodata..L*
+#define BSS_MAIN .bss .bss.[0-9a-zA-Z_]* .bss..compoundliteral*
 #define SBSS_MAIN .sbss .sbss.[0-9a-zA-Z_]*
 #else
 #define TEXT_MAIN .text
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 22654a463ad8..65d4bea937fa 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -111,7 +111,7 @@ endif
 # ---------------------------------------------------------------------------
 
 quiet_cmd_cc_s_c = CC $(quiet_modtag)  $@
-      cmd_cc_s_c = $(CC) $(filter-out $(DEBUG_CFLAGS), $(c_flags)) -fverbose-asm -S -o $@ $<
+      cmd_cc_s_c = $(CC) $(filter-out $(DEBUG_CFLAGS) $(CC_FLAGS_LTO), $(c_flags)) -fverbose-asm -S -o $@ $<
 
 $(obj)/%.s: $(src)/%.c FORCE
 	$(call if_changed_dep,cc_s_c)
@@ -421,8 +421,15 @@ $(obj)/lib.a: $(lib-y) FORCE
 # Do not replace $(filter %.o,^) with $(real-prereqs). When a single object
 # module is turned into a multi object module, $^ will contain header file
 # dependencies recorded in the .*.cmd file.
+ifdef CONFIG_LTO_CLANG
+quiet_cmd_link_multi-m = AR [M]  $@
+cmd_link_multi-m =						\
+	rm -f $@; 						\
+	$(AR) cDPrsT $@ $(filter %.o,$^)
+else
 quiet_cmd_link_multi-m = LD [M]  $@
       cmd_link_multi-m = $(LD) $(ld_flags) -r -o $@ $(filter %.o,$^)
+endif
 
 $(multi-used-m): FORCE
 	$(call if_changed,link_multi-m)
diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal
index d49ec001825d..6de2c35b64e8 100644
--- a/scripts/Makefile.modfinal
+++ b/scripts/Makefile.modfinal
@@ -30,6 +30,12 @@ quiet_cmd_cc_o_c = CC [M]  $@
 
 ARCH_POSTLINK := $(wildcard $(srctree)/arch/$(SRCARCH)/Makefile.postlink)
 
+ifdef CONFIG_LTO_CLANG
+# With CONFIG_LTO_CLANG, reuse the object file we compiled for modpost to
+# avoid a second slow LTO link
+prelink-ext := .lto
+endif
+
 quiet_cmd_ld_ko_o = LD [M]  $@
       cmd_ld_ko_o =                                                     \
 	$(LD) -r $(KBUILD_LDFLAGS)					\
@@ -53,8 +59,9 @@ if_changed_except = $(if $(call newer_prereqs_except,$(2))$(cmd-check),      \
 	$(cmd);                                                              \
 	printf '%s\n' 'cmd_$@ := $(make-cmd)' > $(dot-target).cmd, @:)
 
+
 # Re-generate module BTFs if either module's .ko or vmlinux changed
-$(modules): %.ko: %.o %.mod.o scripts/module.lds $(if $(KBUILD_BUILTIN),vmlinux) FORCE
+$(modules): %.ko: %$(prelink-ext).o %.mod.o scripts/module.lds $(if $(KBUILD_BUILTIN),vmlinux) FORCE
 	+$(call if_changed_except,ld_ko_o,vmlinux)
 ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 	+$(if $(newer-prereqs),$(call cmd,btf_ko))
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
index f54b6ac37ac2..9ff8bfdb574d 100644
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -43,6 +43,9 @@ __modpost:
 include include/config/auto.conf
 include scripts/Kbuild.include
 
+# for ld_flags
+include scripts/Makefile.lib
+
 MODPOST = scripts/mod/modpost								\
 	$(if $(CONFIG_MODVERSIONS),-m)							\
 	$(if $(CONFIG_MODULE_SRCVERSION_ALL),-a)					\
@@ -102,12 +105,26 @@ $(input-symdump):
 	@echo >&2 'WARNING: Symbol version dump "$@" is missing.'
 	@echo >&2 '         Modules may not have dependencies or modversions.'
 
+ifdef CONFIG_LTO_CLANG
+# With CONFIG_LTO_CLANG, .o files might be LLVM bitcode, so we need to run
+# LTO to compile them into native code before running modpost
+prelink-ext := .lto
+
+quiet_cmd_cc_lto_link_modules = LTO [M] $@
+cmd_cc_lto_link_modules = $(LD) $(ld_flags) -r -o $@ --whole-archive $^
+
+%.lto.o: %.o
+	$(call if_changed,cc_lto_link_modules)
+endif
+
+modules := $(sort $(shell cat $(MODORDER)))
+
 # Read out modules.order to pass in modpost.
 # Otherwise, allmodconfig would fail with "Argument list too long".
 quiet_cmd_modpost = MODPOST $@
-      cmd_modpost = sed 's/ko$$/o/' $< | $(MODPOST) -T -
+      cmd_modpost = sed 's/\.ko$$/$(prelink-ext)\.o/' $< | $(MODPOST) -T -
 
-$(output-symdump): $(MODORDER) $(input-symdump) FORCE
+$(output-symdump): $(MODORDER) $(input-symdump) $(modules:.ko=$(prelink-ext).o) FORCE
 	$(call if_changed,modpost)
 
 targets += $(output-symdump)
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 6eded325c837..596507573a48 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -56,6 +56,14 @@ modpost_link()
 		${KBUILD_VMLINUX_LIBS}				\
 		--end-group"
 
+	if [ -n "${CONFIG_LTO_CLANG}" ]; then
+		# This might take a while, so indicate that we're doing
+		# an LTO link
+		info LTO ${1}
+	else
+		info LD ${1}
+	fi
+
 	${LD} ${KBUILD_LDFLAGS} -r -o ${1} ${objects}
 }
 
@@ -103,13 +111,22 @@ vmlinux_link()
 	fi
 
 	if [ "${SRCARCH}" != "um" ]; then
-		objects="--whole-archive			\
-			${KBUILD_VMLINUX_OBJS}			\
-			--no-whole-archive			\
-			--start-group				\
-			${KBUILD_VMLINUX_LIBS}			\
-			--end-group				\
-			${@}"
+		if [ -n "${CONFIG_LTO_CLANG}" ]; then
+			# Use vmlinux.o instead of performing the slow LTO
+			# link again.
+			objects="--whole-archive		\
+				vmlinux.o 			\
+				--no-whole-archive		\
+				${@}"
+		else
+			objects="--whole-archive		\
+				${KBUILD_VMLINUX_OBJS}		\
+				--no-whole-archive		\
+				--start-group			\
+				${KBUILD_VMLINUX_LIBS}		\
+				--end-group			\
+				${@}"
+		fi
 
 		${LD} ${KBUILD_LDFLAGS} ${LDFLAGS_vmlinux}	\
 			${strip_debug#-Wl,}			\
@@ -274,7 +291,6 @@ fi;
 ${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init need-builtin=1
 
 #link vmlinux.o
-info LD vmlinux.o
 modpost_link vmlinux.o
 objtool_link vmlinux.o
 
-- 
cgit v1.2.3


From a8cccdd954732a558d481407ab7c3106b89c34ae Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Fri, 11 Dec 2020 10:46:24 -0800
Subject: init: lto: ensure initcall ordering

With LTO, the compiler doesn't necessarily obey the link order for
initcalls, and initcall variables need globally unique names to avoid
collisions at link time.

This change exports __KBUILD_MODNAME and adds the initcall_id() macro,
which uses it together with __COUNTER__ and __LINE__ to help ensure
these variables have unique names, and moves each variable to its own
section when LTO is enabled, so the correct order can be specified using
a linker script.

The generate_initcall_ordering.pl script uses nm to find initcalls from
the object files passed to the linker, and generates a linker script
that specifies the same order for initcalls that we would have without
LTO. With LTO enabled, the script is called in link-vmlinux.sh through
jobserver-exec to limit the number of jobs spawned.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20201211184633.3213045-8-samitolvanen@google.com
---
 include/linux/init.h               |  52 ++++++-
 scripts/Makefile.lib               |   6 +-
 scripts/generate_initcall_order.pl | 270 +++++++++++++++++++++++++++++++++++++
 scripts/link-vmlinux.sh            |  15 +++
 4 files changed, 334 insertions(+), 9 deletions(-)
 create mode 100755 scripts/generate_initcall_order.pl

(limited to 'include')

diff --git a/include/linux/init.h b/include/linux/init.h
index e668832ef66a..a57bf0dfd75f 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -184,19 +184,57 @@ extern bool initcall_debug;
  * as KEEP() in the linker script.
  */
 
+/* Format: <modname>__<counter>_<line>_<fn> */
+#define __initcall_id(fn)					\
+	__PASTE(__KBUILD_MODNAME,				\
+	__PASTE(__,						\
+	__PASTE(__COUNTER__,					\
+	__PASTE(_,						\
+	__PASTE(__LINE__,					\
+	__PASTE(_, fn))))))
+
+/* Format: __<prefix>__<iid><id> */
+#define __initcall_name(prefix, __iid, id)			\
+	__PASTE(__,						\
+	__PASTE(prefix,						\
+	__PASTE(__,						\
+	__PASTE(__iid, id))))
+
+#ifdef CONFIG_LTO_CLANG
+/*
+ * With LTO, the compiler doesn't necessarily obey link order for
+ * initcalls. In order to preserve the correct order, we add each
+ * variable into its own section and generate a linker script (in
+ * scripts/link-vmlinux.sh) to specify the order of the sections.
+ */
+#define __initcall_section(__sec, __iid)			\
+	#__sec ".init.." #__iid
+#else
+#define __initcall_section(__sec, __iid)			\
+	#__sec ".init"
+#endif
+
 #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
-#define ___define_initcall(fn, id, __sec)			\
+#define ____define_initcall(fn, __name, __sec)			\
 	__ADDRESSABLE(fn)					\
-	asm(".section	\"" #__sec ".init\", \"a\"	\n"	\
-	"__initcall_" #fn #id ":			\n"	\
+	asm(".section	\"" __sec "\", \"a\"		\n"	\
+	    __stringify(__name) ":			\n"	\
 	    ".long	" #fn " - .			\n"	\
 	    ".previous					\n");
 #else
-#define ___define_initcall(fn, id, __sec) \
-	static initcall_t __initcall_##fn##id __used \
-		__attribute__((__section__(#__sec ".init"))) = fn;
+#define ____define_initcall(fn, __name, __sec)			\
+	static initcall_t __name __used 			\
+		__attribute__((__section__(__sec))) = fn;
 #endif
 
+#define __unique_initcall(fn, id, __sec, __iid)			\
+	____define_initcall(fn,					\
+		__initcall_name(initcall, __iid, id),		\
+		__initcall_section(__sec, __iid))
+
+#define ___define_initcall(fn, id, __sec)			\
+	__unique_initcall(fn, id, __sec, __initcall_id(fn))
+
 #define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id)
 
 /*
@@ -236,7 +274,7 @@ extern bool initcall_debug;
 #define __exitcall(fn)						\
 	static exitcall_t __exitcall_##fn __exit_call = fn
 
-#define console_initcall(fn)	___define_initcall(fn,, .con_initcall)
+#define console_initcall(fn)	___define_initcall(fn, con, .con_initcall)
 
 struct obs_kernel_param {
 	const char *str;
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 213677a5ed33..3ff3dbb3a830 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -117,9 +117,11 @@ target-stem = $(basename $(patsubst $(obj)/%,%,$@))
 # These flags are needed for modversions and compiling, so we define them here
 # $(modname_flags) defines KBUILD_MODNAME as the name of the module it will
 # end up in (or would, if it gets compiled in)
-name-fix = $(call stringify,$(subst $(comma),_,$(subst -,_,$1)))
+name-fix-token = $(subst $(comma),_,$(subst -,_,$1))
+name-fix = $(call stringify,$(call name-fix-token,$1))
 basename_flags = -DKBUILD_BASENAME=$(call name-fix,$(basetarget))
-modname_flags  = -DKBUILD_MODNAME=$(call name-fix,$(modname))
+modname_flags  = -DKBUILD_MODNAME=$(call name-fix,$(modname)) \
+		 -D__KBUILD_MODNAME=kmod_$(call name-fix-token,$(modname))
 modfile_flags  = -DKBUILD_MODFILE=$(call stringify,$(modfile))
 
 _c_flags       = $(filter-out $(CFLAGS_REMOVE_$(target-stem).o), \
diff --git a/scripts/generate_initcall_order.pl b/scripts/generate_initcall_order.pl
new file mode 100755
index 000000000000..1a88d3f1b913
--- /dev/null
+++ b/scripts/generate_initcall_order.pl
@@ -0,0 +1,270 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+#
+# Generates a linker script that specifies the correct initcall order.
+#
+# Copyright (C) 2019 Google LLC
+
+use strict;
+use warnings;
+use IO::Handle;
+use IO::Select;
+use POSIX ":sys_wait_h";
+
+my $nm = $ENV{'NM'} || die "$0: ERROR: NM not set?";
+my $objtree = $ENV{'objtree'} || '.';
+
+## currently active child processes
+my $jobs = {};		# child process pid -> file handle
+## results from child processes
+my $results = {};	# object index -> [ { level, secname }, ... ]
+
+## reads _NPROCESSORS_ONLN to determine the maximum number of processes to
+## start
+sub get_online_processors {
+	open(my $fh, "getconf _NPROCESSORS_ONLN 2>/dev/null |")
+		or die "$0: ERROR: failed to execute getconf: $!";
+	my $procs = <$fh>;
+	close($fh);
+
+	if (!($procs =~ /^\d+$/)) {
+		return 1;
+	}
+
+	return int($procs);
+}
+
+## writes results to the parent process
+## format: <file index> <initcall level> <base initcall section name>
+sub write_results {
+	my ($index, $initcalls) = @_;
+
+	# sort by the counter value to ensure the order of initcalls within
+	# each object file is correct
+	foreach my $counter (sort { $a <=> $b } keys(%{$initcalls})) {
+		my $level = $initcalls->{$counter}->{'level'};
+
+		# section name for the initcall function
+		my $secname = $initcalls->{$counter}->{'module'} . '__' .
+			      $counter . '_' .
+			      $initcalls->{$counter}->{'line'} . '_' .
+			      $initcalls->{$counter}->{'function'};
+
+		print "$index $level $secname\n";
+	}
+}
+
+## reads a result line from a child process and adds it to the $results array
+sub read_results{
+	my ($fh) = @_;
+
+	# each child prints out a full line w/ autoflush and exits after the
+	# last line, so even if buffered I/O blocks here, it shouldn't block
+	# very long
+	my $data = <$fh>;
+
+	if (!defined($data)) {
+		return 0;
+	}
+
+	chomp($data);
+
+	my ($index, $level, $secname) = $data =~
+		/^(\d+)\ ([^\ ]+)\ (.*)$/;
+
+	if (!defined($index) ||
+		!defined($level) ||
+		!defined($secname)) {
+		die "$0: ERROR: child process returned invalid data: $data\n";
+	}
+
+	$index = int($index);
+
+	if (!exists($results->{$index})) {
+		$results->{$index} = [];
+	}
+
+	push (@{$results->{$index}}, {
+		'level'   => $level,
+		'secname' => $secname
+	});
+
+	return 1;
+}
+
+## finds initcalls from an object file or all object files in an archive, and
+## writes results back to the parent process
+sub find_initcalls {
+	my ($index, $file) = @_;
+
+	die "$0: ERROR: file $file doesn't exist?" if (! -f $file);
+
+	open(my $fh, "\"$nm\" --defined-only \"$file\" 2>/dev/null |")
+		or die "$0: ERROR: failed to execute \"$nm\": $!";
+
+	my $initcalls = {};
+
+	while (<$fh>) {
+		chomp;
+
+		# check for the start of a new object file (if processing an
+		# archive)
+		my ($path)= $_ =~ /^(.+)\:$/;
+
+		if (defined($path)) {
+			write_results($index, $initcalls);
+			$initcalls = {};
+			next;
+		}
+
+		# look for an initcall
+		my ($module, $counter, $line, $symbol) = $_ =~
+			/[a-z]\s+__initcall__(\S*)__(\d+)_(\d+)_(.*)$/;
+
+		if (!defined($module)) {
+			$module = ''
+		}
+
+		if (!defined($counter) ||
+			!defined($line) ||
+			!defined($symbol)) {
+			next;
+		}
+
+		# parse initcall level
+		my ($function, $level) = $symbol =~
+			/^(.*)((early|rootfs|con|[0-9])s?)$/;
+
+		die "$0: ERROR: invalid initcall name $symbol in $file($path)"
+			if (!defined($function) || !defined($level));
+
+		$initcalls->{$counter} = {
+			'module'   => $module,
+			'line'     => $line,
+			'function' => $function,
+			'level'    => $level,
+		};
+	}
+
+	close($fh);
+	write_results($index, $initcalls);
+}
+
+## waits for any child process to complete, reads the results, and adds them to
+## the $results array for later processing
+sub wait_for_results {
+	my ($select) = @_;
+
+	my $pid = 0;
+	do {
+		# unblock children that may have a full write buffer
+		foreach my $fh ($select->can_read(0)) {
+			read_results($fh);
+		}
+
+		# check for children that have exited, read the remaining data
+		# from them, and clean up
+		$pid = waitpid(-1, WNOHANG);
+		if ($pid > 0) {
+			if (!exists($jobs->{$pid})) {
+				next;
+			}
+
+			my $fh = $jobs->{$pid};
+			$select->remove($fh);
+
+			while (read_results($fh)) {
+				# until eof
+			}
+
+			close($fh);
+			delete($jobs->{$pid});
+		}
+	} while ($pid > 0);
+}
+
+## forks a child to process each file passed in the command line and collects
+## the results
+sub process_files {
+	my $index = 0;
+	my $njobs = $ENV{'PARALLELISM'} || get_online_processors();
+	my $select = IO::Select->new();
+
+	while (my $file = shift(@ARGV)) {
+		# fork a child process and read it's stdout
+		my $pid = open(my $fh, '-|');
+
+		if (!defined($pid)) {
+			die "$0: ERROR: failed to fork: $!";
+		} elsif ($pid) {
+			# save the child process pid and the file handle
+			$select->add($fh);
+			$jobs->{$pid} = $fh;
+		} else {
+			# in the child process
+			STDOUT->autoflush(1);
+			find_initcalls($index, "$objtree/$file");
+			exit;
+		}
+
+		$index++;
+
+		# limit the number of children to $njobs
+		if (scalar(keys(%{$jobs})) >= $njobs) {
+			wait_for_results($select);
+		}
+	}
+
+	# wait for the remaining children to complete
+	while (scalar(keys(%{$jobs})) > 0) {
+		wait_for_results($select);
+	}
+}
+
+sub generate_initcall_lds() {
+	process_files();
+
+	my $sections = {};	# level -> [ secname, ...]
+
+	# sort results to retain link order and split to sections per
+	# initcall level
+	foreach my $index (sort { $a <=> $b } keys(%{$results})) {
+		foreach my $result (@{$results->{$index}}) {
+			my $level = $result->{'level'};
+
+			if (!exists($sections->{$level})) {
+				$sections->{$level} = [];
+			}
+
+			push(@{$sections->{$level}}, $result->{'secname'});
+		}
+	}
+
+	die "$0: ERROR: no initcalls?" if (!keys(%{$sections}));
+
+	# print out a linker script that defines the order of initcalls for
+	# each level
+	print "SECTIONS {\n";
+
+	foreach my $level (sort(keys(%{$sections}))) {
+		my $section;
+
+		if ($level eq 'con') {
+			$section = '.con_initcall.init';
+		} else {
+			$section = ".initcall${level}.init";
+		}
+
+		print "\t${section} : {\n";
+
+		foreach my $secname (@{$sections->{$level}}) {
+			print "\t\t*(${section}..${secname}) ;\n";
+		}
+
+		print "\t}\n";
+	}
+
+	print "}\n";
+}
+
+generate_initcall_lds();
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 78e55fe7210b..c5919d5a0b4f 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -43,6 +43,17 @@ info()
 	fi
 }
 
+# Generate a linker script to ensure correct ordering of initcalls.
+gen_initcalls()
+{
+	info GEN .tmp_initcalls.lds
+
+	${PYTHON} ${srctree}/scripts/jobserver-exec		\
+	${PERL} ${srctree}/scripts/generate_initcall_order.pl	\
+		${KBUILD_VMLINUX_OBJS} ${KBUILD_VMLINUX_LIBS}	\
+		> .tmp_initcalls.lds
+}
+
 # If CONFIG_LTO_CLANG is selected, collect generated symbol versions into
 # .tmp_symversions.lds
 gen_symversions()
@@ -72,6 +83,9 @@ modpost_link()
 		--end-group"
 
 	if [ -n "${CONFIG_LTO_CLANG}" ]; then
+		gen_initcalls
+		lds="-T .tmp_initcalls.lds"
+
 		if [ -n "${CONFIG_MODVERSIONS}" ]; then
 			gen_symversions
 			lds="${lds} -T .tmp_symversions.lds"
@@ -262,6 +276,7 @@ cleanup()
 {
 	rm -f .btf.*
 	rm -f .tmp_System.map
+	rm -f .tmp_initcalls.lds
 	rm -f .tmp_symversions.lds
 	rm -f .tmp_vmlinux*
 	rm -f System.map
-- 
cgit v1.2.3


From 3578ad11f3fba07e64c26d8db68cfd3dde28c59e Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Fri, 11 Dec 2020 10:46:25 -0800
Subject: init: lto: fix PREL32 relocations

With LTO, the compiler can rename static functions to avoid global
naming collisions. As initcall functions are typically static,
renaming can break references to them in inline assembly. This
change adds a global stub with a stable name for each initcall to
fix the issue when PREL32 relocations are used.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20201211184633.3213045-9-samitolvanen@google.com
---
 include/linux/init.h | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/init.h b/include/linux/init.h
index a57bf0dfd75f..a01f01c1a5c5 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -209,26 +209,49 @@ extern bool initcall_debug;
  */
 #define __initcall_section(__sec, __iid)			\
 	#__sec ".init.." #__iid
+
+/*
+ * With LTO, the compiler can rename static functions to avoid
+ * global naming collisions. We use a global stub function for
+ * initcalls to create a stable symbol name whose address can be
+ * taken in inline assembly when PREL32 relocations are used.
+ */
+#define __initcall_stub(fn, __iid, id)				\
+	__initcall_name(initstub, __iid, id)
+
+#define __define_initcall_stub(__stub, fn)			\
+	int __init __stub(void);				\
+	int __init __stub(void)					\
+	{ 							\
+		return fn();					\
+	}							\
+	__ADDRESSABLE(__stub)
 #else
 #define __initcall_section(__sec, __iid)			\
 	#__sec ".init"
+
+#define __initcall_stub(fn, __iid, id)	fn
+
+#define __define_initcall_stub(__stub, fn)			\
+	__ADDRESSABLE(fn)
 #endif
 
 #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
-#define ____define_initcall(fn, __name, __sec)			\
-	__ADDRESSABLE(fn)					\
+#define ____define_initcall(fn, __stub, __name, __sec)		\
+	__define_initcall_stub(__stub, fn)			\
 	asm(".section	\"" __sec "\", \"a\"		\n"	\
 	    __stringify(__name) ":			\n"	\
-	    ".long	" #fn " - .			\n"	\
+	    ".long	" __stringify(__stub) " - .	\n"	\
 	    ".previous					\n");
 #else
-#define ____define_initcall(fn, __name, __sec)			\
+#define ____define_initcall(fn, __unused, __name, __sec)	\
 	static initcall_t __name __used 			\
 		__attribute__((__section__(__sec))) = fn;
 #endif
 
 #define __unique_initcall(fn, id, __sec, __iid)			\
 	____define_initcall(fn,					\
+		__initcall_stub(fn, __iid, id),			\
 		__initcall_name(initcall, __iid, id),		\
 		__initcall_section(__sec, __iid))
 
-- 
cgit v1.2.3


From 09a4e4d9c52a3c5e39e4f409b2c083ab13c6afc2 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Fri, 11 Dec 2020 10:46:26 -0800
Subject: PCI: Fix PREL32 relocations for LTO

With Clang's Link Time Optimization (LTO), the compiler can rename
static functions to avoid global naming collisions. As PCI fixup
functions are typically static, renaming can break references
to them in inline assembly. This change adds a global stub to
DECLARE_PCI_FIXUP_SECTION to fix the issue when PREL32 relocations
are used.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20201211184633.3213045-10-samitolvanen@google.com
---
 include/linux/pci.h | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index b32126d26997..2276e5d00192 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1917,7 +1917,7 @@ enum pci_fixup_pass {
 };
 
 #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
-#define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+#define ___DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
 				    class_shift, hook)			\
 	__ADDRESSABLE(hook)						\
 	asm(".section "	#sec ", \"a\"				\n"	\
@@ -1926,10 +1926,33 @@ enum pci_fixup_pass {
 	    ".long "	#class ", " #class_shift "		\n"	\
 	    ".long "	#hook " - .				\n"	\
 	    ".previous						\n");
+
+/*
+ * Clang's LTO may rename static functions in C, but has no way to
+ * handle such renamings when referenced from inline asm. To work
+ * around this, create global C stubs for these cases.
+ */
+#ifdef CONFIG_LTO_CLANG
+#define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+				  class_shift, hook, stub)		\
+	void stub(struct pci_dev *dev);					\
+	void stub(struct pci_dev *dev)					\
+	{ 								\
+		hook(dev); 						\
+	}								\
+	___DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+				  class_shift, stub)
+#else
+#define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+				  class_shift, hook, stub)		\
+	___DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+				  class_shift, hook)
+#endif
+
 #define DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
 				  class_shift, hook)			\
 	__DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
-				  class_shift, hook)
+				  class_shift, hook, __UNIQUE_ID(hook))
 #else
 /* Anonymous variables would be nice... */
 #define DECLARE_PCI_FIXUP_SECTION(section, name, vendor, device, class,	\
-- 
cgit v1.2.3


From 802fee26d8afd073c630a74dbe1a996970f3fd90 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Sun, 13 Dec 2020 22:50:40 +0900
Subject: riscv: cleanup Canaan Kendryte K210 sysctl driver

Introduce the header file include/soc/canaan/k210-sysctl.h to have a
common definition of the Canaan Kendryte K210 SoC system controller
registers. Simplify the k210 system controller driver code by removing
unused register bits definition. The MAINTAINERS file is updated,
adding the entry "CANAAN/KENDRYTE K210 SOC SYSTEM CONTROLLER DRIVER"
with myself listed as maintainer for this driver.
This is a preparatory patch for introducing the K210 clock driver. No
functional changes are introduced.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 MAINTAINERS                      |  7 +++
 drivers/soc/canaan/k210-sysctl.c | 93 +++++++++++-----------------------------
 include/soc/canaan/k210-sysctl.h | 41 ++++++++++++++++++
 3 files changed, 74 insertions(+), 67 deletions(-)
 create mode 100644 include/soc/canaan/k210-sysctl.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 6eff4f720c72..4aaa7a17016b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3860,6 +3860,13 @@ W:	https://github.com/Cascoda/ca8210-linux.git
 F:	Documentation/devicetree/bindings/net/ieee802154/ca8210.txt
 F:	drivers/net/ieee802154/ca8210.c
 
+CANAAN/KENDRYTE K210 SOC SYSTEM CONTROLLER DRIVER
+M:	Damien Le Moal <damien.lemoal@wdc.com>
+L:	linux-riscv@lists.infradead.org
+S:	Maintained
+F:	drivers/soc/canaan/
+F:	include/soc/canaan/
+
 CACHEFILES: FS-CACHE BACKEND FOR CACHING ON MOUNTED FILESYSTEMS
 M:	David Howells <dhowells@redhat.com>
 L:	linux-cachefs@redhat.com (moderated for non-subscribers)
diff --git a/drivers/soc/canaan/k210-sysctl.c b/drivers/soc/canaan/k210-sysctl.c
index 4608fbca20e1..60b474c33d45 100644
--- a/drivers/soc/canaan/k210-sysctl.c
+++ b/drivers/soc/canaan/k210-sysctl.c
@@ -12,74 +12,33 @@
 #include <linux/bitfield.h>
 #include <asm/soc.h>
 
+#include <soc/canaan/k210-sysctl.h>
+
 #define K210_SYSCTL_CLK0_FREQ		26000000UL
 
 /* Registers base address */
 #define K210_SYSCTL_SYSCTL_BASE_ADDR	0x50440000ULL
 
-/* Registers */
-#define K210_SYSCTL_PLL0		0x08
-#define K210_SYSCTL_PLL1		0x0c
-/* clkr: 4bits, clkf1: 6bits, clkod: 4bits, bwadj: 4bits */
-#define   PLL_RESET		(1 << 20)
-#define   PLL_PWR		(1 << 21)
-#define   PLL_INTFB		(1 << 22)
-#define   PLL_BYPASS		(1 << 23)
-#define   PLL_TEST		(1 << 24)
-#define   PLL_OUT_EN		(1 << 25)
-#define   PLL_TEST_EN		(1 << 26)
-#define K210_SYSCTL_PLL_LOCK		0x18
-#define   PLL0_LOCK1		(1 << 0)
-#define   PLL0_LOCK2		(1 << 1)
-#define   PLL0_SLIP_CLEAR	(1 << 2)
-#define   PLL0_TEST_CLK_OUT	(1 << 3)
-#define   PLL1_LOCK1		(1 << 8)
-#define   PLL1_LOCK2		(1 << 9)
-#define   PLL1_SLIP_CLEAR	(1 << 10)
-#define   PLL1_TEST_CLK_OUT	(1 << 11)
-#define   PLL2_LOCK1		(1 << 16)
-#define   PLL2_LOCK2		(1 << 16)
-#define   PLL2_SLIP_CLEAR	(1 << 18)
-#define   PLL2_TEST_CLK_OUT	(1 << 19)
-#define K210_SYSCTL_CLKSEL0	0x20
-#define   CLKSEL_ACLK		(1 << 0)
-#define K210_SYSCTL_CLKEN_CENT		0x28
-#define   CLKEN_CPU		(1 << 0)
-#define   CLKEN_SRAM0		(1 << 1)
-#define   CLKEN_SRAM1		(1 << 2)
-#define   CLKEN_APB0		(1 << 3)
-#define   CLKEN_APB1		(1 << 4)
-#define   CLKEN_APB2		(1 << 5)
-#define K210_SYSCTL_CLKEN_PERI		0x2c
-#define   CLKEN_ROM		(1 << 0)
-#define   CLKEN_DMA		(1 << 1)
-#define   CLKEN_AI		(1 << 2)
-#define   CLKEN_DVP		(1 << 3)
-#define   CLKEN_FFT		(1 << 4)
-#define   CLKEN_GPIO		(1 << 5)
-#define   CLKEN_SPI0		(1 << 6)
-#define   CLKEN_SPI1		(1 << 7)
-#define   CLKEN_SPI2		(1 << 8)
-#define   CLKEN_SPI3		(1 << 9)
-#define   CLKEN_I2S0		(1 << 10)
-#define   CLKEN_I2S1		(1 << 11)
-#define   CLKEN_I2S2		(1 << 12)
-#define   CLKEN_I2C0		(1 << 13)
-#define   CLKEN_I2C1		(1 << 14)
-#define   CLKEN_I2C2		(1 << 15)
-#define   CLKEN_UART1		(1 << 16)
-#define   CLKEN_UART2		(1 << 17)
-#define   CLKEN_UART3		(1 << 18)
-#define   CLKEN_AES		(1 << 19)
-#define   CLKEN_FPIO		(1 << 20)
-#define   CLKEN_TIMER0		(1 << 21)
-#define   CLKEN_TIMER1		(1 << 22)
-#define   CLKEN_TIMER2		(1 << 23)
-#define   CLKEN_WDT0		(1 << 24)
-#define   CLKEN_WDT1		(1 << 25)
-#define   CLKEN_SHA		(1 << 26)
-#define   CLKEN_OTP		(1 << 27)
-#define   CLKEN_RTC		(1 << 29)
+/* Register bits */
+/* K210_SYSCTL_PLL1: clkr: 4bits, clkf1: 6bits, clkod: 4bits, bwadj: 4bits */
+#define PLL_RESET		(1 << 20)
+#define PLL_PWR			(1 << 21)
+#define PLL_BYPASS		(1 << 23)
+#define PLL_OUT_EN		(1 << 25)
+/* K210_SYSCTL_PLL_LOCK */
+#define PLL1_LOCK1		(1 << 8)
+#define PLL1_LOCK2		(1 << 9)
+#define PLL1_SLIP_CLEAR		(1 << 10)
+/* K210_SYSCTL_SEL0 */
+#define CLKSEL_ACLK		(1 << 0)
+/* K210_SYSCTL_CLKEN_CENT */
+#define CLKEN_CPU		(1 << 0)
+#define CLKEN_SRAM0		(1 << 1)
+#define CLKEN_SRAM1		(1 << 2)
+/* K210_SYSCTL_EN_PERI */
+#define CLKEN_ROM		(1 << 0)
+#define CLKEN_TIMER0		(1 << 21)
+#define CLKEN_RTC		(1 << 29)
 
 struct k210_sysctl {
 	void __iomem		*regs;
@@ -140,7 +99,7 @@ static unsigned long k210_sysctl_clk_recalc_rate(struct clk_hw *hw,
 	 * If the clock selector is not set, use the base frequency.
 	 * Otherwise, use PLL0 frequency with a frequency divisor.
 	 */
-	clksel0 = readl(s->regs + K210_SYSCTL_CLKSEL0);
+	clksel0 = readl(s->regs + K210_SYSCTL_SEL0);
 	if (!(clksel0 & CLKSEL_ACLK))
 		return K210_SYSCTL_CLK0_FREQ;
 
@@ -237,11 +196,11 @@ static void __init k210_soc_early_init(const void *fdt)
 	k210_set_bits(PLL_OUT_EN, regs + K210_SYSCTL_PLL0);
 
 	k210_set_bits(CLKEN_CPU | CLKEN_SRAM0 | CLKEN_SRAM1,
-		      regs + K210_SYSCTL_CLKEN_CENT);
+		      regs + K210_SYSCTL_EN_CENT);
 	k210_set_bits(CLKEN_ROM | CLKEN_TIMER0 | CLKEN_RTC,
-		      regs + K210_SYSCTL_CLKEN_PERI);
+		      regs + K210_SYSCTL_EN_PERI);
 
-	k210_set_bits(CLKSEL_ACLK, regs + K210_SYSCTL_CLKSEL0);
+	k210_set_bits(CLKSEL_ACLK, regs + K210_SYSCTL_SEL0);
 
 	iounmap(regs);
 }
diff --git a/include/soc/canaan/k210-sysctl.h b/include/soc/canaan/k210-sysctl.h
new file mode 100644
index 000000000000..2e037db68f35
--- /dev/null
+++ b/include/soc/canaan/k210-sysctl.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019-20 Sean Anderson <seanga2@gmail.com>
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
+#ifndef K210_SYSCTL_H
+#define K210_SYSCTL_H
+
+/*
+ * Kendryte K210 SoC system controller registers offsets.
+ * Taken from Kendryte SDK (kendryte-standalone-sdk).
+ */
+#define K210_SYSCTL_GIT_ID	0x00 /* Git short commit id */
+#define K210_SYSCTL_UART_BAUD	0x04 /* Default UARTHS baud rate */
+#define K210_SYSCTL_PLL0	0x08 /* PLL0 controller */
+#define K210_SYSCTL_PLL1	0x0C /* PLL1 controller */
+#define K210_SYSCTL_PLL2	0x10 /* PLL2 controller */
+#define K210_SYSCTL_PLL_LOCK	0x18 /* PLL lock tester */
+#define K210_SYSCTL_ROM_ERROR	0x1C /* AXI ROM detector */
+#define K210_SYSCTL_SEL0	0x20 /* Clock select controller 0 */
+#define K210_SYSCTL_SEL1	0x24 /* Clock select controller 1 */
+#define K210_SYSCTL_EN_CENT	0x28 /* Central clock enable */
+#define K210_SYSCTL_EN_PERI	0x2C /* Peripheral clock enable */
+#define K210_SYSCTL_SOFT_RESET	0x30 /* Soft reset ctrl */
+#define K210_SYSCTL_PERI_RESET	0x34 /* Peripheral reset controller */
+#define K210_SYSCTL_THR0	0x38 /* Clock threshold controller 0 */
+#define K210_SYSCTL_THR1	0x3C /* Clock threshold controller 1 */
+#define K210_SYSCTL_THR2	0x40 /* Clock threshold controller 2 */
+#define K210_SYSCTL_THR3	0x44 /* Clock threshold controller 3 */
+#define K210_SYSCTL_THR4	0x48 /* Clock threshold controller 4 */
+#define K210_SYSCTL_THR5	0x4C /* Clock threshold controller 5 */
+#define K210_SYSCTL_THR6	0x50 /* Clock threshold controller 6 */
+#define K210_SYSCTL_MISC	0x54 /* Miscellaneous controller */
+#define K210_SYSCTL_PERI	0x58 /* Peripheral controller */
+#define K210_SYSCTL_SPI_SLEEP	0x5C /* SPI sleep controller */
+#define K210_SYSCTL_RESET_STAT	0x60 /* Reset source status */
+#define K210_SYSCTL_DMA_SEL0	0x64 /* DMA handshake selector 0 */
+#define K210_SYSCTL_DMA_SEL1	0x68 /* DMA handshake selector 1 */
+#define K210_SYSCTL_POWER_SEL	0x6C /* IO Power Mode Select controller */
+
+#endif
-- 
cgit v1.2.3


From 1d7c9d093ed58b8cf4ae23986cd01272667d412a Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Sun, 13 Dec 2020 22:50:43 +0900
Subject: dt-bindings: reset: Document canaan,k210-rst bindings

Document the device tree bindings for the Canaan Kendryte K210 SoC
reset controller driver in
Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml. The header
file include/dt-bindings/reset/k210-rst.h is added to define all
possible reset lines of the SoC.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 .../devicetree/bindings/reset/canaan,k210-rst.yaml | 40 +++++++++++++++++++++
 include/dt-bindings/reset/k210-rst.h               | 42 ++++++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml
 create mode 100644 include/dt-bindings/reset/k210-rst.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml b/Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml
new file mode 100644
index 000000000000..53e4ede9c0bd
--- /dev/null
+++ b/Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/reset/canaan,k210-rst.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Canaan Kendryte K210 Reset Controller Device Tree Bindings
+
+maintainers:
+  - Damien Le Moal <damien.lemoal@wdc.com>
+
+description: |
+  Canaan Kendryte K210 reset controller driver which supports the SoC
+  system controller supplied reset registers for the various peripherals
+  of the SoC. The K210 reset controller node must be defined as a child
+  node of the K210 system controller node.
+
+  See also:
+  - dt-bindings/reset/k210-rst.h
+
+properties:
+  compatible:
+    const: canaan,k210-rst
+
+  '#reset-cells':
+    const: 1
+
+required:
+  - '#reset-cells'
+  - compatible
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/reset/k210-rst.h>
+    sysrst: reset-controller {
+      compatible = "canaan,k210-rst";
+      #reset-cells = <1>;
+    };
diff --git a/include/dt-bindings/reset/k210-rst.h b/include/dt-bindings/reset/k210-rst.h
new file mode 100644
index 000000000000..883c1aed50e8
--- /dev/null
+++ b/include/dt-bindings/reset/k210-rst.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2019 Sean Anderson <seanga2@gmail.com>
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
+#ifndef RESET_K210_SYSCTL_H
+#define RESET_K210_SYSCTL_H
+
+/*
+ * Kendryte K210 SoC system controller K210_SYSCTL_SOFT_RESET register bits.
+ * Taken from Kendryte SDK (kendryte-standalone-sdk).
+ */
+#define K210_RST_ROM	0
+#define K210_RST_DMA	1
+#define K210_RST_AI	2
+#define K210_RST_DVP	3
+#define K210_RST_FFT	4
+#define K210_RST_GPIO	5
+#define K210_RST_SPI0	6
+#define K210_RST_SPI1	7
+#define K210_RST_SPI2	8
+#define K210_RST_SPI3	9
+#define K210_RST_I2S0	10
+#define K210_RST_I2S1	11
+#define K210_RST_I2S2	12
+#define K210_RST_I2C0	13
+#define K210_RST_I2C1	14
+#define K210_RST_I2C2	15
+#define K210_RST_UART1	16
+#define K210_RST_UART2	17
+#define K210_RST_UART3	18
+#define K210_RST_AES	19
+#define K210_RST_FPIOA	20
+#define K210_RST_TIMER0	21
+#define K210_RST_TIMER1	22
+#define K210_RST_TIMER2	23
+#define K210_RST_WDT0	24
+#define K210_RST_WDT1	25
+#define K210_RST_SHA	26
+#define K210_RST_RTC	29
+
+#endif /* RESET_K210_SYSCTL_H */
-- 
cgit v1.2.3


From ed3137edb31b86702511e7ad12b4abe8686b6805 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Sun, 13 Dec 2020 22:50:44 +0900
Subject: dt-bindings: pinctrl: Document canaan,k210-fpioa bindings

Document the device tree bindings for the Canaan Kendryte K210 SoC
Fully Programmable IO Array (FPIOA) pinctrl driver in
Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml. The
new header file include/dt-bindings/pinctrl/k210-fpioa.h is added to
define all 256 possible pin functions of the SoC IO pins, as well as
macros simplifying the definition of pin functions in a device tree.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 .../bindings/pinctrl/canaan,k210-fpioa.yaml        | 171 +++++++++++++
 include/dt-bindings/pinctrl/k210-fpioa.h           | 276 +++++++++++++++++++++
 2 files changed, 447 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml
 create mode 100644 include/dt-bindings/pinctrl/k210-fpioa.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml b/Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml
new file mode 100644
index 000000000000..46fbc73ab26b
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml
@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pinctrl/canaan,k210-fpioa.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Canaan Kendryte K210 FPIOA Device Tree Bindings
+
+maintainers:
+  - Damien Le Moal <damien.lemoal@wdc.com>
+
+description:
+  The Canaan Kendryte K210 SoC Fully Programmable IO Array (FPIOA)
+  controller allows assiging any of 256 possible functions to any of
+  48 IO pins of the SoC. Pin function configuration is performed on
+  a per-pin basis.
+
+properties:
+  compatible:
+    const: canaan,k210-fpioa
+
+  reg:
+    maxItems: 1
+    description:
+      Address and length of the register set for the FPIOA controller.
+
+  clocks:
+    items:
+      - description: Controller reference clock source
+      - description: APB interface clock source
+
+  clock-names:
+    items:
+      - const: ref
+      - const: pclk
+
+  resets:
+    maxItems: 1
+
+  canaan,k210-sysctl-power:
+    $ref: /schemas/types.yaml#/definitions/phandle-array
+    description: |
+      phandle of the K210 system controller node and offset of its
+      power domain control register.
+
+patternProperties:
+  '-pinmux$':
+    type: object
+    $ref: /schemas/pinctrl/pinmux-node.yaml
+    description:
+      FPIOA client devices use sub-nodes to define the desired pin
+      configuration. Client device sub-nodes use the pinux property
+      below.
+
+    properties:
+      pinmux:
+        description:
+          List of IO pins alternate functions. The values for each IO
+          pin is a combination of an IO pin number (0 to 47) with the
+          desired function for the IO pin. Functions are defined as
+          macros in include/dt-bindings/pinctrl/k210-fpioa.h.
+          The K210_FPIOA(IO pin, function) macro is provided to
+          facilitate the combination of IO pin numbers and functions.
+
+    required:
+      - pinmux
+
+    additionalProperties: false
+
+  '-pins$':
+    type: object
+    $ref: /schemas/pinctrl/pincfg-node.yaml
+    description:
+      FPIOA client devices use sub-nodes to define the desired
+      configuration of pins. Client device sub-nodes use the
+      properties below.
+
+    properties:
+      pins:
+        description:
+          List of IO pins affected by the properties specified in this
+          subnode. IO pins are identified using the pin names "IO_xx".
+          Pin configuration nodes can also define the power domain to
+          be used for the SoC pin groups A0 (IO pins 0-5),
+          A1 (IO pins 6-11), A2 (IO pins 12-17), B0 (IO pins 18-23),
+          B1 (IO pins 24-29), B2 (IO pins 30-35), B3 (IO pins 30-35),
+          C0 (IO pins 36-41) and C1 (IO pins 42-47) using the
+          power-source property.
+        items:
+          anyOf:
+            - pattern: "^(IO_([0-9]*))|(A[0-2])|(B[3-5])|(C[6-7])$"
+            - enum: [ IO_0, IO_1, IO_2, IO_3, IO_4, IO_5, IO_6, IO_7,
+                      IO_8, IO_9, IO_10, IO_11, IO_12, IO_13, IO_14,
+                      IO_15, IO_16, IO_17, IO_18, IO_19, IO_20, IO_21,
+                      IO_22, IO_23, IO_24, IO_25, IO_26, IO_27, IO_28,
+                      IO_29, IO_30, IO_31, IO_32, IO_33, IO_34, IO_35,
+                      IO_36, IO_37, IO_38, IO_39, IO_40, IO_41, IO_42,
+                      IO_43, IO_44, IO_45, IO_46, IO_47,
+                      A0, A1, A2, B3, B4, B5, C6, C7 ]
+      bias-disable: true
+
+      bias-pull-down: true
+
+      bias-pull-up: true
+
+      drive-strength: true
+
+      drive-strength-microamp: true
+
+      input-enable: true
+
+      input-disable: true
+
+      input-schmitt-enable: true
+
+      input-schmitt-disable: true
+
+      input-polarity-invert:
+        description:
+          Enable or disable pin input polarity inversion.
+
+      output-enable: true
+
+      output-disable: true
+
+      output-high: true
+
+      output-low: true
+
+      output-polarity-invert:
+        description:
+          Enable or disable pin output polarity inversion.
+
+      slew-rate: true
+
+      power-source: true
+
+    additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - canaan,k210-sysctl-power
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/pinctrl/k210-fpioa.h>
+    #include <dt-bindings/clock/k210-clk.h>
+    #include <dt-bindings/reset/k210-rst.h>
+
+    fpioa: pinmux@502B0000 {
+      compatible = "canaan,k210-fpioa";
+      reg = <0x502B0000 0x100>;
+      clocks = <&sysclk K210_CLK_FPIOA>,
+               <&sysclk K210_CLK_APB0>;
+      clock-names = "ref", "pclk";
+      resets = <&sysrst K210_RST_FPIOA>;
+      canaan,k210-sysctl-power = <&sysctl 108>;
+      pinctrl-0 = <&jtag_pinctrl>;
+      pinctrl-names = "default";
+
+      jtag_pinctrl: jtag-pinmux {
+        pinmux = <K210_FPIOA(0, K210_PCF_JTAG_TCLK)>,
+                 <K210_FPIOA(1, K210_PCF_JTAG_TDI)>,
+                 <K210_FPIOA(2, K210_PCF_JTAG_TMS)>,
+                 <K210_FPIOA(3, K210_PCF_JTAG_TDO)>;
+      };
+    };
diff --git a/include/dt-bindings/pinctrl/k210-fpioa.h b/include/dt-bindings/pinctrl/k210-fpioa.h
new file mode 100644
index 000000000000..314285eab3a1
--- /dev/null
+++ b/include/dt-bindings/pinctrl/k210-fpioa.h
@@ -0,0 +1,276 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2020 Sean Anderson <seanga2@gmail.com>
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
+#ifndef PINCTRL_K210_FPIOA_H
+#define PINCTRL_K210_FPIOA_H
+
+/*
+ * Full list of FPIOA functions from
+ * kendryte-standalone-sdk/lib/drivers/include/fpioa.h
+ */
+#define K210_PCF_MASK		GENMASK(7, 0)
+#define K210_PCF_JTAG_TCLK	0   /* JTAG Test Clock */
+#define K210_PCF_JTAG_TDI	1   /* JTAG Test Data In */
+#define K210_PCF_JTAG_TMS	2   /* JTAG Test Mode Select */
+#define K210_PCF_JTAG_TDO	3   /* JTAG Test Data Out */
+#define K210_PCF_SPI0_D0	4   /* SPI0 Data 0 */
+#define K210_PCF_SPI0_D1	5   /* SPI0 Data 1 */
+#define K210_PCF_SPI0_D2	6   /* SPI0 Data 2 */
+#define K210_PCF_SPI0_D3	7   /* SPI0 Data 3 */
+#define K210_PCF_SPI0_D4	8   /* SPI0 Data 4 */
+#define K210_PCF_SPI0_D5	9   /* SPI0 Data 5 */
+#define K210_PCF_SPI0_D6	10  /* SPI0 Data 6 */
+#define K210_PCF_SPI0_D7	11  /* SPI0 Data 7 */
+#define K210_PCF_SPI0_SS0	12  /* SPI0 Chip Select 0 */
+#define K210_PCF_SPI0_SS1	13  /* SPI0 Chip Select 1 */
+#define K210_PCF_SPI0_SS2	14  /* SPI0 Chip Select 2 */
+#define K210_PCF_SPI0_SS3	15  /* SPI0 Chip Select 3 */
+#define K210_PCF_SPI0_ARB	16  /* SPI0 Arbitration */
+#define K210_PCF_SPI0_SCLK	17  /* SPI0 Serial Clock */
+#define K210_PCF_UARTHS_RX	18  /* UART High speed Receiver */
+#define K210_PCF_UARTHS_TX	19  /* UART High speed Transmitter */
+#define K210_PCF_RESV6		20  /* Reserved function */
+#define K210_PCF_RESV7		21  /* Reserved function */
+#define K210_PCF_CLK_SPI1	22  /* Clock SPI1 */
+#define K210_PCF_CLK_I2C1	23  /* Clock I2C1 */
+#define K210_PCF_GPIOHS0	24  /* GPIO High speed 0 */
+#define K210_PCF_GPIOHS1	25  /* GPIO High speed 1 */
+#define K210_PCF_GPIOHS2	26  /* GPIO High speed 2 */
+#define K210_PCF_GPIOHS3	27  /* GPIO High speed 3 */
+#define K210_PCF_GPIOHS4	28  /* GPIO High speed 4 */
+#define K210_PCF_GPIOHS5	29  /* GPIO High speed 5 */
+#define K210_PCF_GPIOHS6	30  /* GPIO High speed 6 */
+#define K210_PCF_GPIOHS7	31  /* GPIO High speed 7 */
+#define K210_PCF_GPIOHS8	32  /* GPIO High speed 8 */
+#define K210_PCF_GPIOHS9	33  /* GPIO High speed 9 */
+#define K210_PCF_GPIOHS10	34  /* GPIO High speed 10 */
+#define K210_PCF_GPIOHS11	35  /* GPIO High speed 11 */
+#define K210_PCF_GPIOHS12	36  /* GPIO High speed 12 */
+#define K210_PCF_GPIOHS13	37  /* GPIO High speed 13 */
+#define K210_PCF_GPIOHS14	38  /* GPIO High speed 14 */
+#define K210_PCF_GPIOHS15	39  /* GPIO High speed 15 */
+#define K210_PCF_GPIOHS16	40  /* GPIO High speed 16 */
+#define K210_PCF_GPIOHS17	41  /* GPIO High speed 17 */
+#define K210_PCF_GPIOHS18	42  /* GPIO High speed 18 */
+#define K210_PCF_GPIOHS19	43  /* GPIO High speed 19 */
+#define K210_PCF_GPIOHS20	44  /* GPIO High speed 20 */
+#define K210_PCF_GPIOHS21	45  /* GPIO High speed 21 */
+#define K210_PCF_GPIOHS22	46  /* GPIO High speed 22 */
+#define K210_PCF_GPIOHS23	47  /* GPIO High speed 23 */
+#define K210_PCF_GPIOHS24	48  /* GPIO High speed 24 */
+#define K210_PCF_GPIOHS25	49  /* GPIO High speed 25 */
+#define K210_PCF_GPIOHS26	50  /* GPIO High speed 26 */
+#define K210_PCF_GPIOHS27	51  /* GPIO High speed 27 */
+#define K210_PCF_GPIOHS28	52  /* GPIO High speed 28 */
+#define K210_PCF_GPIOHS29	53  /* GPIO High speed 29 */
+#define K210_PCF_GPIOHS30	54  /* GPIO High speed 30 */
+#define K210_PCF_GPIOHS31	55  /* GPIO High speed 31 */
+#define K210_PCF_GPIO0		56  /* GPIO pin 0 */
+#define K210_PCF_GPIO1		57  /* GPIO pin 1 */
+#define K210_PCF_GPIO2		58  /* GPIO pin 2 */
+#define K210_PCF_GPIO3		59  /* GPIO pin 3 */
+#define K210_PCF_GPIO4		60  /* GPIO pin 4 */
+#define K210_PCF_GPIO5		61  /* GPIO pin 5 */
+#define K210_PCF_GPIO6		62  /* GPIO pin 6 */
+#define K210_PCF_GPIO7		63  /* GPIO pin 7 */
+#define K210_PCF_UART1_RX	64  /* UART1 Receiver */
+#define K210_PCF_UART1_TX	65  /* UART1 Transmitter */
+#define K210_PCF_UART2_RX	66  /* UART2 Receiver */
+#define K210_PCF_UART2_TX	67  /* UART2 Transmitter */
+#define K210_PCF_UART3_RX	68  /* UART3 Receiver */
+#define K210_PCF_UART3_TX	69  /* UART3 Transmitter */
+#define K210_PCF_SPI1_D0	70  /* SPI1 Data 0 */
+#define K210_PCF_SPI1_D1	71  /* SPI1 Data 1 */
+#define K210_PCF_SPI1_D2	72  /* SPI1 Data 2 */
+#define K210_PCF_SPI1_D3	73  /* SPI1 Data 3 */
+#define K210_PCF_SPI1_D4	74  /* SPI1 Data 4 */
+#define K210_PCF_SPI1_D5	75  /* SPI1 Data 5 */
+#define K210_PCF_SPI1_D6	76  /* SPI1 Data 6 */
+#define K210_PCF_SPI1_D7	77  /* SPI1 Data 7 */
+#define K210_PCF_SPI1_SS0	78  /* SPI1 Chip Select 0 */
+#define K210_PCF_SPI1_SS1	79  /* SPI1 Chip Select 1 */
+#define K210_PCF_SPI1_SS2	80  /* SPI1 Chip Select 2 */
+#define K210_PCF_SPI1_SS3	81  /* SPI1 Chip Select 3 */
+#define K210_PCF_SPI1_ARB	82  /* SPI1 Arbitration */
+#define K210_PCF_SPI1_SCLK	83  /* SPI1 Serial Clock */
+#define K210_PCF_SPI2_D0	84  /* SPI2 Data 0 */
+#define K210_PCF_SPI2_SS	85  /* SPI2 Select */
+#define K210_PCF_SPI2_SCLK	86  /* SPI2 Serial Clock */
+#define K210_PCF_I2S0_MCLK	87  /* I2S0 Master Clock */
+#define K210_PCF_I2S0_SCLK	88  /* I2S0 Serial Clock(BCLK) */
+#define K210_PCF_I2S0_WS	89  /* I2S0 Word Select(LRCLK) */
+#define K210_PCF_I2S0_IN_D0	90  /* I2S0 Serial Data Input 0 */
+#define K210_PCF_I2S0_IN_D1	91  /* I2S0 Serial Data Input 1 */
+#define K210_PCF_I2S0_IN_D2	92  /* I2S0 Serial Data Input 2 */
+#define K210_PCF_I2S0_IN_D3	93  /* I2S0 Serial Data Input 3 */
+#define K210_PCF_I2S0_OUT_D0	94  /* I2S0 Serial Data Output 0 */
+#define K210_PCF_I2S0_OUT_D1	95  /* I2S0 Serial Data Output 1 */
+#define K210_PCF_I2S0_OUT_D2	96  /* I2S0 Serial Data Output 2 */
+#define K210_PCF_I2S0_OUT_D3	97  /* I2S0 Serial Data Output 3 */
+#define K210_PCF_I2S1_MCLK	98  /* I2S1 Master Clock */
+#define K210_PCF_I2S1_SCLK	99  /* I2S1 Serial Clock(BCLK) */
+#define K210_PCF_I2S1_WS	100 /* I2S1 Word Select(LRCLK) */
+#define K210_PCF_I2S1_IN_D0	101 /* I2S1 Serial Data Input 0 */
+#define K210_PCF_I2S1_IN_D1	102 /* I2S1 Serial Data Input 1 */
+#define K210_PCF_I2S1_IN_D2	103 /* I2S1 Serial Data Input 2 */
+#define K210_PCF_I2S1_IN_D3	104 /* I2S1 Serial Data Input 3 */
+#define K210_PCF_I2S1_OUT_D0	105 /* I2S1 Serial Data Output 0 */
+#define K210_PCF_I2S1_OUT_D1	106 /* I2S1 Serial Data Output 1 */
+#define K210_PCF_I2S1_OUT_D2	107 /* I2S1 Serial Data Output 2 */
+#define K210_PCF_I2S1_OUT_D3	108 /* I2S1 Serial Data Output 3 */
+#define K210_PCF_I2S2_MCLK	109 /* I2S2 Master Clock */
+#define K210_PCF_I2S2_SCLK	110 /* I2S2 Serial Clock(BCLK) */
+#define K210_PCF_I2S2_WS	111 /* I2S2 Word Select(LRCLK) */
+#define K210_PCF_I2S2_IN_D0	112 /* I2S2 Serial Data Input 0 */
+#define K210_PCF_I2S2_IN_D1	113 /* I2S2 Serial Data Input 1 */
+#define K210_PCF_I2S2_IN_D2	114 /* I2S2 Serial Data Input 2 */
+#define K210_PCF_I2S2_IN_D3	115 /* I2S2 Serial Data Input 3 */
+#define K210_PCF_I2S2_OUT_D0	116 /* I2S2 Serial Data Output 0 */
+#define K210_PCF_I2S2_OUT_D1	117 /* I2S2 Serial Data Output 1 */
+#define K210_PCF_I2S2_OUT_D2	118 /* I2S2 Serial Data Output 2 */
+#define K210_PCF_I2S2_OUT_D3	119 /* I2S2 Serial Data Output 3 */
+#define K210_PCF_RESV0		120 /* Reserved function */
+#define K210_PCF_RESV1		121 /* Reserved function */
+#define K210_PCF_RESV2		122 /* Reserved function */
+#define K210_PCF_RESV3		123 /* Reserved function */
+#define K210_PCF_RESV4		124 /* Reserved function */
+#define K210_PCF_RESV5		125 /* Reserved function */
+#define K210_PCF_I2C0_SCLK	126 /* I2C0 Serial Clock */
+#define K210_PCF_I2C0_SDA	127 /* I2C0 Serial Data */
+#define K210_PCF_I2C1_SCLK	128 /* I2C1 Serial Clock */
+#define K210_PCF_I2C1_SDA	129 /* I2C1 Serial Data */
+#define K210_PCF_I2C2_SCLK	130 /* I2C2 Serial Clock */
+#define K210_PCF_I2C2_SDA	131 /* I2C2 Serial Data */
+#define K210_PCF_DVP_XCLK	132 /* DVP System Clock */
+#define K210_PCF_DVP_RST	133 /* DVP System Reset */
+#define K210_PCF_DVP_PWDN	134 /* DVP Power Down Mode */
+#define K210_PCF_DVP_VSYNC	135 /* DVP Vertical Sync */
+#define K210_PCF_DVP_HSYNC	136 /* DVP Horizontal Sync */
+#define K210_PCF_DVP_PCLK	137 /* Pixel Clock */
+#define K210_PCF_DVP_D0		138 /* Data Bit 0 */
+#define K210_PCF_DVP_D1		139 /* Data Bit 1 */
+#define K210_PCF_DVP_D2		140 /* Data Bit 2 */
+#define K210_PCF_DVP_D3		141 /* Data Bit 3 */
+#define K210_PCF_DVP_D4		142 /* Data Bit 4 */
+#define K210_PCF_DVP_D5		143 /* Data Bit 5 */
+#define K210_PCF_DVP_D6		144 /* Data Bit 6 */
+#define K210_PCF_DVP_D7		145 /* Data Bit 7 */
+#define K210_PCF_SCCB_SCLK	146 /* Serial Camera Control Bus Clock */
+#define K210_PCF_SCCB_SDA	147 /* Serial Camera Control Bus Data */
+#define K210_PCF_UART1_CTS	148 /* UART1 Clear To Send */
+#define K210_PCF_UART1_DSR	149 /* UART1 Data Set Ready */
+#define K210_PCF_UART1_DCD	150 /* UART1 Data Carrier Detect */
+#define K210_PCF_UART1_RI	151 /* UART1 Ring Indicator */
+#define K210_PCF_UART1_SIR_IN	152 /* UART1 Serial Infrared Input */
+#define K210_PCF_UART1_DTR	153 /* UART1 Data Terminal Ready */
+#define K210_PCF_UART1_RTS	154 /* UART1 Request To Send */
+#define K210_PCF_UART1_OUT2	155 /* UART1 User-designated Output 2 */
+#define K210_PCF_UART1_OUT1	156 /* UART1 User-designated Output 1 */
+#define K210_PCF_UART1_SIR_OUT	157 /* UART1 Serial Infrared Output */
+#define K210_PCF_UART1_BAUD	158 /* UART1 Transmit Clock Output */
+#define K210_PCF_UART1_RE	159 /* UART1 Receiver Output Enable */
+#define K210_PCF_UART1_DE	160 /* UART1 Driver Output Enable */
+#define K210_PCF_UART1_RS485_EN	161 /* UART1 RS485 Enable */
+#define K210_PCF_UART2_CTS	162 /* UART2 Clear To Send */
+#define K210_PCF_UART2_DSR	163 /* UART2 Data Set Ready */
+#define K210_PCF_UART2_DCD	164 /* UART2 Data Carrier Detect */
+#define K210_PCF_UART2_RI	165 /* UART2 Ring Indicator */
+#define K210_PCF_UART2_SIR_IN	166 /* UART2 Serial Infrared Input */
+#define K210_PCF_UART2_DTR	167 /* UART2 Data Terminal Ready */
+#define K210_PCF_UART2_RTS	168 /* UART2 Request To Send */
+#define K210_PCF_UART2_OUT2	169 /* UART2 User-designated Output 2 */
+#define K210_PCF_UART2_OUT1	170 /* UART2 User-designated Output 1 */
+#define K210_PCF_UART2_SIR_OUT	171 /* UART2 Serial Infrared Output */
+#define K210_PCF_UART2_BAUD	172 /* UART2 Transmit Clock Output */
+#define K210_PCF_UART2_RE	173 /* UART2 Receiver Output Enable */
+#define K210_PCF_UART2_DE	174 /* UART2 Driver Output Enable */
+#define K210_PCF_UART2_RS485_EN	175 /* UART2 RS485 Enable */
+#define K210_PCF_UART3_CTS	176 /* UART3 Clear To Send */
+#define K210_PCF_UART3_DSR	177 /* UART3 Data Set Ready */
+#define K210_PCF_UART3_DCD	178 /* UART3 Data Carrier Detect */
+#define K210_PCF_UART3_RI	179 /* UART3 Ring Indicator */
+#define K210_PCF_UART3_SIR_IN	180 /* UART3 Serial Infrared Input */
+#define K210_PCF_UART3_DTR	181 /* UART3 Data Terminal Ready */
+#define K210_PCF_UART3_RTS	182 /* UART3 Request To Send */
+#define K210_PCF_UART3_OUT2	183 /* UART3 User-designated Output 2 */
+#define K210_PCF_UART3_OUT1	184 /* UART3 User-designated Output 1 */
+#define K210_PCF_UART3_SIR_OUT	185 /* UART3 Serial Infrared Output */
+#define K210_PCF_UART3_BAUD	186 /* UART3 Transmit Clock Output */
+#define K210_PCF_UART3_RE	187 /* UART3 Receiver Output Enable */
+#define K210_PCF_UART3_DE	188 /* UART3 Driver Output Enable */
+#define K210_PCF_UART3_RS485_EN	189 /* UART3 RS485 Enable */
+#define K210_PCF_TIMER0_TOGGLE1	190 /* TIMER0 Toggle Output 1 */
+#define K210_PCF_TIMER0_TOGGLE2	191 /* TIMER0 Toggle Output 2 */
+#define K210_PCF_TIMER0_TOGGLE3	192 /* TIMER0 Toggle Output 3 */
+#define K210_PCF_TIMER0_TOGGLE4	193 /* TIMER0 Toggle Output 4 */
+#define K210_PCF_TIMER1_TOGGLE1	194 /* TIMER1 Toggle Output 1 */
+#define K210_PCF_TIMER1_TOGGLE2	195 /* TIMER1 Toggle Output 2 */
+#define K210_PCF_TIMER1_TOGGLE3	196 /* TIMER1 Toggle Output 3 */
+#define K210_PCF_TIMER1_TOGGLE4	197 /* TIMER1 Toggle Output 4 */
+#define K210_PCF_TIMER2_TOGGLE1	198 /* TIMER2 Toggle Output 1 */
+#define K210_PCF_TIMER2_TOGGLE2	199 /* TIMER2 Toggle Output 2 */
+#define K210_PCF_TIMER2_TOGGLE3	200 /* TIMER2 Toggle Output 3 */
+#define K210_PCF_TIMER2_TOGGLE4	201 /* TIMER2 Toggle Output 4 */
+#define K210_PCF_CLK_SPI2	202 /* Clock SPI2 */
+#define K210_PCF_CLK_I2C2	203 /* Clock I2C2 */
+#define K210_PCF_INTERNAL0	204 /* Internal function signal 0 */
+#define K210_PCF_INTERNAL1	205 /* Internal function signal 1 */
+#define K210_PCF_INTERNAL2	206 /* Internal function signal 2 */
+#define K210_PCF_INTERNAL3	207 /* Internal function signal 3 */
+#define K210_PCF_INTERNAL4	208 /* Internal function signal 4 */
+#define K210_PCF_INTERNAL5	209 /* Internal function signal 5 */
+#define K210_PCF_INTERNAL6	210 /* Internal function signal 6 */
+#define K210_PCF_INTERNAL7	211 /* Internal function signal 7 */
+#define K210_PCF_INTERNAL8	212 /* Internal function signal 8 */
+#define K210_PCF_INTERNAL9	213 /* Internal function signal 9 */
+#define K210_PCF_INTERNAL10	214 /* Internal function signal 10 */
+#define K210_PCF_INTERNAL11	215 /* Internal function signal 11 */
+#define K210_PCF_INTERNAL12	216 /* Internal function signal 12 */
+#define K210_PCF_INTERNAL13	217 /* Internal function signal 13 */
+#define K210_PCF_INTERNAL14	218 /* Internal function signal 14 */
+#define K210_PCF_INTERNAL15	219 /* Internal function signal 15 */
+#define K210_PCF_INTERNAL16	220 /* Internal function signal 16 */
+#define K210_PCF_INTERNAL17	221 /* Internal function signal 17 */
+#define K210_PCF_CONSTANT	222 /* Constant function */
+#define K210_PCF_INTERNAL18	223 /* Internal function signal 18 */
+#define K210_PCF_DEBUG0		224 /* Debug function 0 */
+#define K210_PCF_DEBUG1		225 /* Debug function 1 */
+#define K210_PCF_DEBUG2		226 /* Debug function 2 */
+#define K210_PCF_DEBUG3		227 /* Debug function 3 */
+#define K210_PCF_DEBUG4		228 /* Debug function 4 */
+#define K210_PCF_DEBUG5		229 /* Debug function 5 */
+#define K210_PCF_DEBUG6		230 /* Debug function 6 */
+#define K210_PCF_DEBUG7		231 /* Debug function 7 */
+#define K210_PCF_DEBUG8		232 /* Debug function 8 */
+#define K210_PCF_DEBUG9		233 /* Debug function 9 */
+#define K210_PCF_DEBUG10	234 /* Debug function 10 */
+#define K210_PCF_DEBUG11	235 /* Debug function 11 */
+#define K210_PCF_DEBUG12	236 /* Debug function 12 */
+#define K210_PCF_DEBUG13	237 /* Debug function 13 */
+#define K210_PCF_DEBUG14	238 /* Debug function 14 */
+#define K210_PCF_DEBUG15	239 /* Debug function 15 */
+#define K210_PCF_DEBUG16	240 /* Debug function 16 */
+#define K210_PCF_DEBUG17	241 /* Debug function 17 */
+#define K210_PCF_DEBUG18	242 /* Debug function 18 */
+#define K210_PCF_DEBUG19	243 /* Debug function 19 */
+#define K210_PCF_DEBUG20	244 /* Debug function 20 */
+#define K210_PCF_DEBUG21	245 /* Debug function 21 */
+#define K210_PCF_DEBUG22	246 /* Debug function 22 */
+#define K210_PCF_DEBUG23	247 /* Debug function 23 */
+#define K210_PCF_DEBUG24	248 /* Debug function 24 */
+#define K210_PCF_DEBUG25	249 /* Debug function 25 */
+#define K210_PCF_DEBUG26	250 /* Debug function 26 */
+#define K210_PCF_DEBUG27	251 /* Debug function 27 */
+#define K210_PCF_DEBUG28	252 /* Debug function 28 */
+#define K210_PCF_DEBUG29	253 /* Debug function 29 */
+#define K210_PCF_DEBUG30	254 /* Debug function 30 */
+#define K210_PCF_DEBUG31	255 /* Debug function 31 */
+
+#define K210_FPIOA(pin, func)		(((pin) << 16) | (func))
+
+#define K210_PC_POWER_3V3	0
+#define K210_PC_POWER_1V8	1
+
+#endif /* PINCTRL_K210_FPIOA_H */
-- 
cgit v1.2.3


From ae3c107cd8bea82cb7cb427d9c5d305b8ce72216 Mon Sep 17 00:00:00 2001
From: Atish Patra <atish.patra@wdc.com>
Date: Wed, 18 Nov 2020 16:38:26 -0800
Subject: numa: Move numa implementation to common code

ARM64 numa implementation is generic enough that RISC-V can reuse that
implementation with very minor cosmetic changes. This will help both
ARM64 and RISC-V in terms of maintanace and feature improvement

Move the numa implementation code to common directory so that both ISAs
can reuse this. This doesn't introduce any function changes for ARM64.

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 arch/arm64/Kconfig            |   1 +
 arch/arm64/include/asm/numa.h |  48 +----
 arch/arm64/mm/Makefile        |   1 -
 arch/arm64/mm/numa.c          | 483 ------------------------------------------
 drivers/base/Kconfig          |   6 +
 drivers/base/Makefile         |   1 +
 drivers/base/arch_numa.c      | 483 ++++++++++++++++++++++++++++++++++++++++++
 include/asm-generic/numa.h    |  52 +++++
 8 files changed, 544 insertions(+), 531 deletions(-)
 delete mode 100644 arch/arm64/mm/numa.c
 create mode 100644 drivers/base/arch_numa.c
 create mode 100644 include/asm-generic/numa.h

(limited to 'include')

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 05e17351e4f3..ff9f5c05cca3 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -995,6 +995,7 @@ config HOTPLUG_CPU
 # Common NUMA Features
 config NUMA
 	bool "NUMA Memory Allocation and Scheduler Support"
+	select GENERIC_ARCH_NUMA
 	select ACPI_NUMA if ACPI
 	select OF_NUMA
 	help
diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h
index ffc1dcdf1871..8c8cf4297cc3 100644
--- a/arch/arm64/include/asm/numa.h
+++ b/arch/arm64/include/asm/numa.h
@@ -3,52 +3,6 @@
 #define __ASM_NUMA_H
 
 #include <asm/topology.h>
-
-#ifdef CONFIG_NUMA
-
-#define NR_NODE_MEMBLKS		(MAX_NUMNODES * 2)
-
-int __node_distance(int from, int to);
-#define node_distance(a, b) __node_distance(a, b)
-
-extern nodemask_t numa_nodes_parsed __initdata;
-
-extern bool numa_off;
-
-/* Mappings between node number and cpus on that node. */
-extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
-void numa_clear_node(unsigned int cpu);
-
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-const struct cpumask *cpumask_of_node(int node);
-#else
-/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
-static inline const struct cpumask *cpumask_of_node(int node)
-{
-	if (node == NUMA_NO_NODE)
-		return cpu_all_mask;
-
-	return node_to_cpumask_map[node];
-}
-#endif
-
-void __init arch_numa_init(void);
-int __init numa_add_memblk(int nodeid, u64 start, u64 end);
-void __init numa_set_distance(int from, int to, int distance);
-void __init numa_free_distance(void);
-void __init early_map_cpu_to_node(unsigned int cpu, int nid);
-void numa_store_cpu_info(unsigned int cpu);
-void numa_add_cpu(unsigned int cpu);
-void numa_remove_cpu(unsigned int cpu);
-
-#else	/* CONFIG_NUMA */
-
-static inline void numa_store_cpu_info(unsigned int cpu) { }
-static inline void numa_add_cpu(unsigned int cpu) { }
-static inline void numa_remove_cpu(unsigned int cpu) { }
-static inline void arch_numa_init(void) { }
-static inline void early_map_cpu_to_node(unsigned int cpu, int nid) { }
-
-#endif	/* CONFIG_NUMA */
+#include <asm-generic/numa.h>
 
 #endif	/* __ASM_NUMA_H */
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 5ead3c3de3b6..cd60e4fed78f 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -6,7 +6,6 @@ obj-y				:= dma-mapping.o extable.o fault.o init.o \
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_PTDUMP_CORE)	+= ptdump.o
 obj-$(CONFIG_PTDUMP_DEBUGFS)	+= ptdump_debugfs.o
-obj-$(CONFIG_NUMA)		+= numa.o
 obj-$(CONFIG_DEBUG_VIRTUAL)	+= physaddr.o
 obj-$(CONFIG_ARM64_MTE)		+= mteswap.o
 KASAN_SANITIZE_physaddr.o	+= n
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
deleted file mode 100644
index 0dae54ce7d43..000000000000
--- a/arch/arm64/mm/numa.c
+++ /dev/null
@@ -1,483 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * NUMA support, based on the x86 implementation.
- *
- * Copyright (C) 2015 Cavium Inc.
- * Author: Ganapatrao Kulkarni <gkulkarni@cavium.com>
- */
-
-#define pr_fmt(fmt) "NUMA: " fmt
-
-#include <linux/acpi.h>
-#include <linux/memblock.h>
-#include <linux/module.h>
-#include <linux/of.h>
-
-#include <asm/sections.h>
-
-struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_data);
-nodemask_t numa_nodes_parsed __initdata;
-static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
-
-static int numa_distance_cnt;
-static u8 *numa_distance;
-bool numa_off;
-
-static __init int numa_parse_early_param(char *opt)
-{
-	if (!opt)
-		return -EINVAL;
-	if (str_has_prefix(opt, "off"))
-		numa_off = true;
-
-	return 0;
-}
-early_param("numa", numa_parse_early_param);
-
-cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
-EXPORT_SYMBOL(node_to_cpumask_map);
-
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-
-/*
- * Returns a pointer to the bitmask of CPUs on Node 'node'.
- */
-const struct cpumask *cpumask_of_node(int node)
-{
-
-	if (node == NUMA_NO_NODE)
-		return cpu_all_mask;
-
-	if (WARN_ON(node < 0 || node >= nr_node_ids))
-		return cpu_none_mask;
-
-	if (WARN_ON(node_to_cpumask_map[node] == NULL))
-		return cpu_online_mask;
-
-	return node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(cpumask_of_node);
-
-#endif
-
-static void numa_update_cpu(unsigned int cpu, bool remove)
-{
-	int nid = cpu_to_node(cpu);
-
-	if (nid == NUMA_NO_NODE)
-		return;
-
-	if (remove)
-		cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
-	else
-		cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
-}
-
-void numa_add_cpu(unsigned int cpu)
-{
-	numa_update_cpu(cpu, false);
-}
-
-void numa_remove_cpu(unsigned int cpu)
-{
-	numa_update_cpu(cpu, true);
-}
-
-void numa_clear_node(unsigned int cpu)
-{
-	numa_remove_cpu(cpu);
-	set_cpu_numa_node(cpu, NUMA_NO_NODE);
-}
-
-/*
- * Allocate node_to_cpumask_map based on number of available nodes
- * Requires node_possible_map to be valid.
- *
- * Note: cpumask_of_node() is not valid until after this is done.
- * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
- */
-static void __init setup_node_to_cpumask_map(void)
-{
-	int node;
-
-	/* setup nr_node_ids if not done yet */
-	if (nr_node_ids == MAX_NUMNODES)
-		setup_nr_node_ids();
-
-	/* allocate and clear the mapping */
-	for (node = 0; node < nr_node_ids; node++) {
-		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
-		cpumask_clear(node_to_cpumask_map[node]);
-	}
-
-	/* cpumask_of_node() will now work */
-	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
-}
-
-/*
- * Set the cpu to node and mem mapping
- */
-void numa_store_cpu_info(unsigned int cpu)
-{
-	set_cpu_numa_node(cpu, cpu_to_node_map[cpu]);
-}
-
-void __init early_map_cpu_to_node(unsigned int cpu, int nid)
-{
-	/* fallback to node 0 */
-	if (nid < 0 || nid >= MAX_NUMNODES || numa_off)
-		nid = 0;
-
-	cpu_to_node_map[cpu] = nid;
-
-	/*
-	 * We should set the numa node of cpu0 as soon as possible, because it
-	 * has already been set up online before. cpu_to_node(0) will soon be
-	 * called.
-	 */
-	if (!cpu)
-		set_cpu_numa_node(cpu, nid);
-}
-
-#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(__per_cpu_offset);
-
-static int __init early_cpu_to_node(int cpu)
-{
-	return cpu_to_node_map[cpu];
-}
-
-static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
-{
-	return node_distance(early_cpu_to_node(from), early_cpu_to_node(to));
-}
-
-static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size,
-				       size_t align)
-{
-	int nid = early_cpu_to_node(cpu);
-
-	return  memblock_alloc_try_nid(size, align,
-			__pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid);
-}
-
-static void __init pcpu_fc_free(void *ptr, size_t size)
-{
-	memblock_free_early(__pa(ptr), size);
-}
-
-void __init setup_per_cpu_areas(void)
-{
-	unsigned long delta;
-	unsigned int cpu;
-	int rc;
-
-	/*
-	 * Always reserve area for module percpu variables.  That's
-	 * what the legacy allocator did.
-	 */
-	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
-				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
-				    pcpu_cpu_distance,
-				    pcpu_fc_alloc, pcpu_fc_free);
-	if (rc < 0)
-		panic("Failed to initialize percpu areas.");
-
-	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
-	for_each_possible_cpu(cpu)
-		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
-}
-#endif
-
-/**
- * numa_add_memblk() - Set node id to memblk
- * @nid: NUMA node ID of the new memblk
- * @start: Start address of the new memblk
- * @end:  End address of the new memblk
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int __init numa_add_memblk(int nid, u64 start, u64 end)
-{
-	int ret;
-
-	ret = memblock_set_node(start, (end - start), &memblock.memory, nid);
-	if (ret < 0) {
-		pr_err("memblock [0x%llx - 0x%llx] failed to add on node %d\n",
-			start, (end - 1), nid);
-		return ret;
-	}
-
-	node_set(nid, numa_nodes_parsed);
-	return ret;
-}
-
-/*
- * Initialize NODE_DATA for a node on the local memory
- */
-static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
-{
-	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
-	u64 nd_pa;
-	void *nd;
-	int tnid;
-
-	if (start_pfn >= end_pfn)
-		pr_info("Initmem setup node %d [<memory-less node>]\n", nid);
-
-	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
-	if (!nd_pa)
-		panic("Cannot allocate %zu bytes for node %d data\n",
-		      nd_size, nid);
-
-	nd = __va(nd_pa);
-
-	/* report and initialize */
-	pr_info("NODE_DATA [mem %#010Lx-%#010Lx]\n",
-		nd_pa, nd_pa + nd_size - 1);
-	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
-	if (tnid != nid)
-		pr_info("NODE_DATA(%d) on node %d\n", nid, tnid);
-
-	node_data[nid] = nd;
-	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-	NODE_DATA(nid)->node_id = nid;
-	NODE_DATA(nid)->node_start_pfn = start_pfn;
-	NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
-}
-
-/*
- * numa_free_distance
- *
- * The current table is freed.
- */
-void __init numa_free_distance(void)
-{
-	size_t size;
-
-	if (!numa_distance)
-		return;
-
-	size = numa_distance_cnt * numa_distance_cnt *
-		sizeof(numa_distance[0]);
-
-	memblock_free(__pa(numa_distance), size);
-	numa_distance_cnt = 0;
-	numa_distance = NULL;
-}
-
-/*
- * Create a new NUMA distance table.
- */
-static int __init numa_alloc_distance(void)
-{
-	size_t size;
-	u64 phys;
-	int i, j;
-
-	size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);
-	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),
-				      size, PAGE_SIZE);
-	if (WARN_ON(!phys))
-		return -ENOMEM;
-
-	memblock_reserve(phys, size);
-
-	numa_distance = __va(phys);
-	numa_distance_cnt = nr_node_ids;
-
-	/* fill with the default distances */
-	for (i = 0; i < numa_distance_cnt; i++)
-		for (j = 0; j < numa_distance_cnt; j++)
-			numa_distance[i * numa_distance_cnt + j] = i == j ?
-				LOCAL_DISTANCE : REMOTE_DISTANCE;
-
-	pr_debug("Initialized distance table, cnt=%d\n", numa_distance_cnt);
-
-	return 0;
-}
-
-/**
- * numa_set_distance() - Set inter node NUMA distance from node to node.
- * @from: the 'from' node to set distance
- * @to: the 'to'  node to set distance
- * @distance: NUMA distance
- *
- * Set the distance from node @from to @to to @distance.
- * If distance table doesn't exist, a warning is printed.
- *
- * If @from or @to is higher than the highest known node or lower than zero
- * or @distance doesn't make sense, the call is ignored.
- */
-void __init numa_set_distance(int from, int to, int distance)
-{
-	if (!numa_distance) {
-		pr_warn_once("Warning: distance table not allocated yet\n");
-		return;
-	}
-
-	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
-			from < 0 || to < 0) {
-		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
-			    from, to, distance);
-		return;
-	}
-
-	if ((u8)distance != distance ||
-	    (from == to && distance != LOCAL_DISTANCE)) {
-		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
-			     from, to, distance);
-		return;
-	}
-
-	numa_distance[from * numa_distance_cnt + to] = distance;
-}
-
-/*
- * Return NUMA distance @from to @to
- */
-int __node_distance(int from, int to)
-{
-	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
-		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
-	return numa_distance[from * numa_distance_cnt + to];
-}
-EXPORT_SYMBOL(__node_distance);
-
-static int __init numa_register_nodes(void)
-{
-	int nid;
-	struct memblock_region *mblk;
-
-	/* Check that valid nid is set to memblks */
-	for_each_mem_region(mblk) {
-		int mblk_nid = memblock_get_region_node(mblk);
-
-		if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) {
-			pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
-				mblk_nid, mblk->base,
-				mblk->base + mblk->size - 1);
-			return -EINVAL;
-		}
-	}
-
-	/* Finally register nodes. */
-	for_each_node_mask(nid, numa_nodes_parsed) {
-		unsigned long start_pfn, end_pfn;
-
-		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
-		setup_node_data(nid, start_pfn, end_pfn);
-		node_set_online(nid);
-	}
-
-	/* Setup online nodes to actual nodes*/
-	node_possible_map = numa_nodes_parsed;
-
-	return 0;
-}
-
-static int __init numa_init(int (*init_func)(void))
-{
-	int ret;
-
-	nodes_clear(numa_nodes_parsed);
-	nodes_clear(node_possible_map);
-	nodes_clear(node_online_map);
-
-	ret = numa_alloc_distance();
-	if (ret < 0)
-		return ret;
-
-	ret = init_func();
-	if (ret < 0)
-		goto out_free_distance;
-
-	if (nodes_empty(numa_nodes_parsed)) {
-		pr_info("No NUMA configuration found\n");
-		ret = -EINVAL;
-		goto out_free_distance;
-	}
-
-	ret = numa_register_nodes();
-	if (ret < 0)
-		goto out_free_distance;
-
-	setup_node_to_cpumask_map();
-
-	return 0;
-out_free_distance:
-	numa_free_distance();
-	return ret;
-}
-
-/**
- * dummy_numa_init() - Fallback dummy NUMA init
- *
- * Used if there's no underlying NUMA architecture, NUMA initialization
- * fails, or NUMA is disabled on the command line.
- *
- * Must online at least one node (node 0) and add memory blocks that cover all
- * allowed memory. It is unlikely that this function fails.
- *
- * Return: 0 on success, -errno on failure.
- */
-static int __init dummy_numa_init(void)
-{
-	phys_addr_t start = memblock_start_of_DRAM();
-	phys_addr_t end = memblock_end_of_DRAM();
-	int ret;
-
-	if (numa_off)
-		pr_info("NUMA disabled\n"); /* Forced off on command line. */
-	pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n", start, end - 1);
-
-	ret = numa_add_memblk(0, start, end);
-	if (ret) {
-		pr_err("NUMA init failed\n");
-		return ret;
-	}
-
-	numa_off = true;
-	return 0;
-}
-
-#ifdef CONFIG_ACPI_NUMA
-static int __init arch_acpi_numa_init(void)
-{
-	int ret;
-
-	ret = acpi_numa_init();
-	if (ret) {
-		pr_info("Failed to initialise from firmware\n");
-		return ret;
-	}
-
-	return srat_disabled() ? -EINVAL : 0;
-}
-#else
-static int __init arch_acpi_numa_init(void)
-{
-	return -EOPNOTSUPP;
-}
-#endif
-
-/**
- * arch_numa_init() - Initialize NUMA
- *
- * Try each configured NUMA initialization method until one succeeds. The
- * last fallback is dummy single node config encompassing whole memory.
- */
-void __init arch_numa_init(void)
-{
-	if (!numa_off) {
-		if (!acpi_disabled && !numa_init(arch_acpi_numa_init))
-			return;
-		if (acpi_disabled && !numa_init(of_numa_init))
-			return;
-	}
-
-	numa_init(dummy_numa_init);
-}
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 040be48ce046..dbd88e2be88a 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -213,4 +213,10 @@ config GENERIC_ARCH_TOPOLOGY
 	  appropriate scaling, sysfs interface for reading capacity values at
 	  runtime.
 
+config GENERIC_ARCH_NUMA
+	bool
+	help
+	  Enable support for generic NUMA implementation. Currently, RISC-V
+	  and ARM64 use it.
+
 endmenu
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 5e7bf9669a81..8b93a7f291ec 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_PINCTRL) += pinctrl.o
 obj-$(CONFIG_DEV_COREDUMP) += devcoredump.o
 obj-$(CONFIG_GENERIC_MSI_IRQ_DOMAIN) += platform-msi.o
 obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += arch_topology.o
+obj-$(CONFIG_GENERIC_ARCH_NUMA) += arch_numa.o
 
 obj-y			+= test/
 
diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
new file mode 100644
index 000000000000..0dae54ce7d43
--- /dev/null
+++ b/drivers/base/arch_numa.c
@@ -0,0 +1,483 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * NUMA support, based on the x86 implementation.
+ *
+ * Copyright (C) 2015 Cavium Inc.
+ * Author: Ganapatrao Kulkarni <gkulkarni@cavium.com>
+ */
+
+#define pr_fmt(fmt) "NUMA: " fmt
+
+#include <linux/acpi.h>
+#include <linux/memblock.h>
+#include <linux/module.h>
+#include <linux/of.h>
+
+#include <asm/sections.h>
+
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_data);
+nodemask_t numa_nodes_parsed __initdata;
+static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
+
+static int numa_distance_cnt;
+static u8 *numa_distance;
+bool numa_off;
+
+static __init int numa_parse_early_param(char *opt)
+{
+	if (!opt)
+		return -EINVAL;
+	if (str_has_prefix(opt, "off"))
+		numa_off = true;
+
+	return 0;
+}
+early_param("numa", numa_parse_early_param);
+
+cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
+EXPORT_SYMBOL(node_to_cpumask_map);
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+const struct cpumask *cpumask_of_node(int node)
+{
+
+	if (node == NUMA_NO_NODE)
+		return cpu_all_mask;
+
+	if (WARN_ON(node < 0 || node >= nr_node_ids))
+		return cpu_none_mask;
+
+	if (WARN_ON(node_to_cpumask_map[node] == NULL))
+		return cpu_online_mask;
+
+	return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(cpumask_of_node);
+
+#endif
+
+static void numa_update_cpu(unsigned int cpu, bool remove)
+{
+	int nid = cpu_to_node(cpu);
+
+	if (nid == NUMA_NO_NODE)
+		return;
+
+	if (remove)
+		cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
+	else
+		cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+}
+
+void numa_add_cpu(unsigned int cpu)
+{
+	numa_update_cpu(cpu, false);
+}
+
+void numa_remove_cpu(unsigned int cpu)
+{
+	numa_update_cpu(cpu, true);
+}
+
+void numa_clear_node(unsigned int cpu)
+{
+	numa_remove_cpu(cpu);
+	set_cpu_numa_node(cpu, NUMA_NO_NODE);
+}
+
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: cpumask_of_node() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
+ */
+static void __init setup_node_to_cpumask_map(void)
+{
+	int node;
+
+	/* setup nr_node_ids if not done yet */
+	if (nr_node_ids == MAX_NUMNODES)
+		setup_nr_node_ids();
+
+	/* allocate and clear the mapping */
+	for (node = 0; node < nr_node_ids; node++) {
+		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
+		cpumask_clear(node_to_cpumask_map[node]);
+	}
+
+	/* cpumask_of_node() will now work */
+	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
+}
+
+/*
+ * Set the cpu to node and mem mapping
+ */
+void numa_store_cpu_info(unsigned int cpu)
+{
+	set_cpu_numa_node(cpu, cpu_to_node_map[cpu]);
+}
+
+void __init early_map_cpu_to_node(unsigned int cpu, int nid)
+{
+	/* fallback to node 0 */
+	if (nid < 0 || nid >= MAX_NUMNODES || numa_off)
+		nid = 0;
+
+	cpu_to_node_map[cpu] = nid;
+
+	/*
+	 * We should set the numa node of cpu0 as soon as possible, because it
+	 * has already been set up online before. cpu_to_node(0) will soon be
+	 * called.
+	 */
+	if (!cpu)
+		set_cpu_numa_node(cpu, nid);
+}
+
+#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+
+static int __init early_cpu_to_node(int cpu)
+{
+	return cpu_to_node_map[cpu];
+}
+
+static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
+{
+	return node_distance(early_cpu_to_node(from), early_cpu_to_node(to));
+}
+
+static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size,
+				       size_t align)
+{
+	int nid = early_cpu_to_node(cpu);
+
+	return  memblock_alloc_try_nid(size, align,
+			__pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+}
+
+static void __init pcpu_fc_free(void *ptr, size_t size)
+{
+	memblock_free_early(__pa(ptr), size);
+}
+
+void __init setup_per_cpu_areas(void)
+{
+	unsigned long delta;
+	unsigned int cpu;
+	int rc;
+
+	/*
+	 * Always reserve area for module percpu variables.  That's
+	 * what the legacy allocator did.
+	 */
+	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
+				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
+				    pcpu_cpu_distance,
+				    pcpu_fc_alloc, pcpu_fc_free);
+	if (rc < 0)
+		panic("Failed to initialize percpu areas.");
+
+	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+	for_each_possible_cpu(cpu)
+		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
+}
+#endif
+
+/**
+ * numa_add_memblk() - Set node id to memblk
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end:  End address of the new memblk
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+	int ret;
+
+	ret = memblock_set_node(start, (end - start), &memblock.memory, nid);
+	if (ret < 0) {
+		pr_err("memblock [0x%llx - 0x%llx] failed to add on node %d\n",
+			start, (end - 1), nid);
+		return ret;
+	}
+
+	node_set(nid, numa_nodes_parsed);
+	return ret;
+}
+
+/*
+ * Initialize NODE_DATA for a node on the local memory
+ */
+static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
+{
+	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
+	u64 nd_pa;
+	void *nd;
+	int tnid;
+
+	if (start_pfn >= end_pfn)
+		pr_info("Initmem setup node %d [<memory-less node>]\n", nid);
+
+	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+	if (!nd_pa)
+		panic("Cannot allocate %zu bytes for node %d data\n",
+		      nd_size, nid);
+
+	nd = __va(nd_pa);
+
+	/* report and initialize */
+	pr_info("NODE_DATA [mem %#010Lx-%#010Lx]\n",
+		nd_pa, nd_pa + nd_size - 1);
+	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+	if (tnid != nid)
+		pr_info("NODE_DATA(%d) on node %d\n", nid, tnid);
+
+	node_data[nid] = nd;
+	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+	NODE_DATA(nid)->node_id = nid;
+	NODE_DATA(nid)->node_start_pfn = start_pfn;
+	NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
+}
+
+/*
+ * numa_free_distance
+ *
+ * The current table is freed.
+ */
+void __init numa_free_distance(void)
+{
+	size_t size;
+
+	if (!numa_distance)
+		return;
+
+	size = numa_distance_cnt * numa_distance_cnt *
+		sizeof(numa_distance[0]);
+
+	memblock_free(__pa(numa_distance), size);
+	numa_distance_cnt = 0;
+	numa_distance = NULL;
+}
+
+/*
+ * Create a new NUMA distance table.
+ */
+static int __init numa_alloc_distance(void)
+{
+	size_t size;
+	u64 phys;
+	int i, j;
+
+	size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);
+	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),
+				      size, PAGE_SIZE);
+	if (WARN_ON(!phys))
+		return -ENOMEM;
+
+	memblock_reserve(phys, size);
+
+	numa_distance = __va(phys);
+	numa_distance_cnt = nr_node_ids;
+
+	/* fill with the default distances */
+	for (i = 0; i < numa_distance_cnt; i++)
+		for (j = 0; j < numa_distance_cnt; j++)
+			numa_distance[i * numa_distance_cnt + j] = i == j ?
+				LOCAL_DISTANCE : REMOTE_DISTANCE;
+
+	pr_debug("Initialized distance table, cnt=%d\n", numa_distance_cnt);
+
+	return 0;
+}
+
+/**
+ * numa_set_distance() - Set inter node NUMA distance from node to node.
+ * @from: the 'from' node to set distance
+ * @to: the 'to'  node to set distance
+ * @distance: NUMA distance
+ *
+ * Set the distance from node @from to @to to @distance.
+ * If distance table doesn't exist, a warning is printed.
+ *
+ * If @from or @to is higher than the highest known node or lower than zero
+ * or @distance doesn't make sense, the call is ignored.
+ */
+void __init numa_set_distance(int from, int to, int distance)
+{
+	if (!numa_distance) {
+		pr_warn_once("Warning: distance table not allocated yet\n");
+		return;
+	}
+
+	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
+			from < 0 || to < 0) {
+		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
+			    from, to, distance);
+		return;
+	}
+
+	if ((u8)distance != distance ||
+	    (from == to && distance != LOCAL_DISTANCE)) {
+		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+			     from, to, distance);
+		return;
+	}
+
+	numa_distance[from * numa_distance_cnt + to] = distance;
+}
+
+/*
+ * Return NUMA distance @from to @to
+ */
+int __node_distance(int from, int to)
+{
+	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
+		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+	return numa_distance[from * numa_distance_cnt + to];
+}
+EXPORT_SYMBOL(__node_distance);
+
+static int __init numa_register_nodes(void)
+{
+	int nid;
+	struct memblock_region *mblk;
+
+	/* Check that valid nid is set to memblks */
+	for_each_mem_region(mblk) {
+		int mblk_nid = memblock_get_region_node(mblk);
+
+		if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) {
+			pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+				mblk_nid, mblk->base,
+				mblk->base + mblk->size - 1);
+			return -EINVAL;
+		}
+	}
+
+	/* Finally register nodes. */
+	for_each_node_mask(nid, numa_nodes_parsed) {
+		unsigned long start_pfn, end_pfn;
+
+		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+		setup_node_data(nid, start_pfn, end_pfn);
+		node_set_online(nid);
+	}
+
+	/* Setup online nodes to actual nodes*/
+	node_possible_map = numa_nodes_parsed;
+
+	return 0;
+}
+
+static int __init numa_init(int (*init_func)(void))
+{
+	int ret;
+
+	nodes_clear(numa_nodes_parsed);
+	nodes_clear(node_possible_map);
+	nodes_clear(node_online_map);
+
+	ret = numa_alloc_distance();
+	if (ret < 0)
+		return ret;
+
+	ret = init_func();
+	if (ret < 0)
+		goto out_free_distance;
+
+	if (nodes_empty(numa_nodes_parsed)) {
+		pr_info("No NUMA configuration found\n");
+		ret = -EINVAL;
+		goto out_free_distance;
+	}
+
+	ret = numa_register_nodes();
+	if (ret < 0)
+		goto out_free_distance;
+
+	setup_node_to_cpumask_map();
+
+	return 0;
+out_free_distance:
+	numa_free_distance();
+	return ret;
+}
+
+/**
+ * dummy_numa_init() - Fallback dummy NUMA init
+ *
+ * Used if there's no underlying NUMA architecture, NUMA initialization
+ * fails, or NUMA is disabled on the command line.
+ *
+ * Must online at least one node (node 0) and add memory blocks that cover all
+ * allowed memory. It is unlikely that this function fails.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+static int __init dummy_numa_init(void)
+{
+	phys_addr_t start = memblock_start_of_DRAM();
+	phys_addr_t end = memblock_end_of_DRAM();
+	int ret;
+
+	if (numa_off)
+		pr_info("NUMA disabled\n"); /* Forced off on command line. */
+	pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n", start, end - 1);
+
+	ret = numa_add_memblk(0, start, end);
+	if (ret) {
+		pr_err("NUMA init failed\n");
+		return ret;
+	}
+
+	numa_off = true;
+	return 0;
+}
+
+#ifdef CONFIG_ACPI_NUMA
+static int __init arch_acpi_numa_init(void)
+{
+	int ret;
+
+	ret = acpi_numa_init();
+	if (ret) {
+		pr_info("Failed to initialise from firmware\n");
+		return ret;
+	}
+
+	return srat_disabled() ? -EINVAL : 0;
+}
+#else
+static int __init arch_acpi_numa_init(void)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+/**
+ * arch_numa_init() - Initialize NUMA
+ *
+ * Try each configured NUMA initialization method until one succeeds. The
+ * last fallback is dummy single node config encompassing whole memory.
+ */
+void __init arch_numa_init(void)
+{
+	if (!numa_off) {
+		if (!acpi_disabled && !numa_init(arch_acpi_numa_init))
+			return;
+		if (acpi_disabled && !numa_init(of_numa_init))
+			return;
+	}
+
+	numa_init(dummy_numa_init);
+}
diff --git a/include/asm-generic/numa.h b/include/asm-generic/numa.h
new file mode 100644
index 000000000000..1a3ad6d29833
--- /dev/null
+++ b/include/asm-generic/numa.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_GENERIC_NUMA_H
+#define __ASM_GENERIC_NUMA_H
+
+#ifdef CONFIG_NUMA
+
+#define NR_NODE_MEMBLKS		(MAX_NUMNODES * 2)
+
+int __node_distance(int from, int to);
+#define node_distance(a, b) __node_distance(a, b)
+
+extern nodemask_t numa_nodes_parsed __initdata;
+
+extern bool numa_off;
+
+/* Mappings between node number and cpus on that node. */
+extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
+void numa_clear_node(unsigned int cpu);
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+const struct cpumask *cpumask_of_node(int node);
+#else
+/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
+static inline const struct cpumask *cpumask_of_node(int node)
+{
+	if (node == NUMA_NO_NODE)
+		return cpu_all_mask;
+
+	return node_to_cpumask_map[node];
+}
+#endif
+
+void __init arch_numa_init(void);
+int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+void __init numa_set_distance(int from, int to, int distance);
+void __init numa_free_distance(void);
+void __init early_map_cpu_to_node(unsigned int cpu, int nid);
+void numa_store_cpu_info(unsigned int cpu);
+void numa_add_cpu(unsigned int cpu);
+void numa_remove_cpu(unsigned int cpu);
+
+#else	/* CONFIG_NUMA */
+
+static inline void numa_store_cpu_info(unsigned int cpu) { }
+static inline void numa_add_cpu(unsigned int cpu) { }
+static inline void numa_remove_cpu(unsigned int cpu) { }
+static inline void arch_numa_init(void) { }
+static inline void early_map_cpu_to_node(unsigned int cpu, int nid) { }
+
+#endif	/* CONFIG_NUMA */
+
+#endif	/* __ASM_GENERIC_NUMA_H */
-- 
cgit v1.2.3


From b33752c300232d7f95dd9a4353947d0c9e6a0e52 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Fri, 15 Jan 2021 09:06:37 -0800
Subject: HID: i2c-hid: Reorganize so ACPI and OF are separate modules

This patch rejiggers the i2c-hid code so that the OF (Open Firmware
aka Device Tree) and ACPI support is separated out a bit.  The OF and
ACPI drivers are now separate modules that wrap the core module.

Essentially, what we're doing here:
* Make "power up" and "power down" a function that can be (optionally)
  implemented by a given user of the i2c-hid core.
* The OF and ACPI modules are drivers on their own, so they implement
  probe / remove / suspend / resume / shutdown.  The core code
  provides implementations that OF and ACPI can call into.

We'll organize this so that we now have 3 modules: the old i2c-hid
module becomes the "core" module and two new modules will depend on
it, handling probing the specific device.

As part of this work, we'll remove the i2c-hid "platform data"
concept since it's not needed.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/hid/Makefile                  |   2 +-
 drivers/hid/i2c-hid/Kconfig           |  32 ++++-
 drivers/hid/i2c-hid/Makefile          |   5 +-
 drivers/hid/i2c-hid/i2c-hid-acpi.c    | 143 +++++++++++++++++++
 drivers/hid/i2c-hid/i2c-hid-core.c    | 252 ++++++----------------------------
 drivers/hid/i2c-hid/i2c-hid-of.c      | 143 +++++++++++++++++++
 drivers/hid/i2c-hid/i2c-hid.h         |  22 +++
 include/linux/platform_data/i2c-hid.h |  41 ------
 8 files changed, 379 insertions(+), 261 deletions(-)
 create mode 100644 drivers/hid/i2c-hid/i2c-hid-acpi.c
 create mode 100644 drivers/hid/i2c-hid/i2c-hid-of.c
 delete mode 100644 include/linux/platform_data/i2c-hid.h

(limited to 'include')

diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile
index 014d21fe7dac..a0621a4a65cd 100644
--- a/drivers/hid/Makefile
+++ b/drivers/hid/Makefile
@@ -138,7 +138,7 @@ obj-$(CONFIG_USB_HID)		+= usbhid/
 obj-$(CONFIG_USB_MOUSE)		+= usbhid/
 obj-$(CONFIG_USB_KBD)		+= usbhid/
 
-obj-$(CONFIG_I2C_HID)		+= i2c-hid/
+obj-$(CONFIG_I2C_HID_CORE)	+= i2c-hid/
 
 obj-$(CONFIG_INTEL_ISH_HID)	+= intel-ish-hid/
 obj-$(INTEL_ISH_FIRMWARE_DOWNLOADER)	+= intel-ish-hid/
diff --git a/drivers/hid/i2c-hid/Kconfig b/drivers/hid/i2c-hid/Kconfig
index c4e5dfeab2bd..819b7521c182 100644
--- a/drivers/hid/i2c-hid/Kconfig
+++ b/drivers/hid/i2c-hid/Kconfig
@@ -2,18 +2,40 @@
 menu "I2C HID support"
 	depends on I2C
 
-config I2C_HID
-	tristate "HID over I2C transport layer"
+config I2C_HID_ACPI
+	tristate "HID over I2C transport layer ACPI driver"
 	default n
-	depends on I2C && INPUT
-	select HID
+	depends on I2C && INPUT && ACPI
+	help
+	  Say Y here if you use a keyboard, a touchpad, a touchscreen, or any
+	  other HID based devices which is connected to your computer via I2C.
+	  This driver supports ACPI-based systems.
+
+	  If unsure, say N.
+
+	  This support is also available as a module.  If so, the module
+	  will be called i2c-hid-acpi.  It will also build/depend on the
+	  module i2c-hid.
+
+config I2C_HID_OF
+	tristate "HID over I2C transport layer Open Firmware driver"
+	default n
+	depends on I2C && INPUT && OF
 	help
 	  Say Y here if you use a keyboard, a touchpad, a touchscreen, or any
 	  other HID based devices which is connected to your computer via I2C.
+	  This driver supports Open Firmware (Device Tree)-based systems.
 
 	  If unsure, say N.
 
 	  This support is also available as a module.  If so, the module
-	  will be called i2c-hid.
+	  will be called i2c-hid-of.  It will also build/depend on the
+	  module i2c-hid.
 
 endmenu
+
+config I2C_HID_CORE
+	tristate
+	default y if I2C_HID_ACPI=y || I2C_HID_OF=y
+	default m if I2C_HID_ACPI=m || I2C_HID_OF=m
+	select HID
diff --git a/drivers/hid/i2c-hid/Makefile b/drivers/hid/i2c-hid/Makefile
index 681b3896898e..9b4a73446841 100644
--- a/drivers/hid/i2c-hid/Makefile
+++ b/drivers/hid/i2c-hid/Makefile
@@ -3,7 +3,10 @@
 # Makefile for the I2C input drivers
 #
 
-obj-$(CONFIG_I2C_HID)				+= i2c-hid.o
+obj-$(CONFIG_I2C_HID_CORE)			+= i2c-hid.o
 
 i2c-hid-objs					=  i2c-hid-core.o
 i2c-hid-$(CONFIG_DMI)				+= i2c-hid-dmi-quirks.o
+
+obj-$(CONFIG_I2C_HID_ACPI)			+= i2c-hid-acpi.o
+obj-$(CONFIG_I2C_HID_OF)			+= i2c-hid-of.o
diff --git a/drivers/hid/i2c-hid/i2c-hid-acpi.c b/drivers/hid/i2c-hid/i2c-hid-acpi.c
new file mode 100644
index 000000000000..bb8c00e6be78
--- /dev/null
+++ b/drivers/hid/i2c-hid/i2c-hid-acpi.c
@@ -0,0 +1,143 @@
+/*
+ * HID over I2C ACPI Subclass
+ *
+ * Copyright (c) 2012 Benjamin Tissoires <benjamin.tissoires@gmail.com>
+ * Copyright (c) 2012 Ecole Nationale de l'Aviation Civile, France
+ * Copyright (c) 2012 Red Hat, Inc
+ *
+ * This code was forked out of the core code, which was partly based on
+ * "USB HID support for Linux":
+ *
+ *  Copyright (c) 1999 Andreas Gal
+ *  Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz>
+ *  Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc
+ *  Copyright (c) 2007-2008 Oliver Neukum
+ *  Copyright (c) 2006-2010 Jiri Kosina
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive for
+ * more details.
+ */
+
+#include <linux/acpi.h>
+#include <linux/device.h>
+#include <linux/i2c.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pm.h>
+
+#include "i2c-hid.h"
+
+struct i2c_hid_acpi {
+	struct i2chid_ops ops;
+	struct i2c_client *client;
+};
+
+static const struct acpi_device_id i2c_hid_acpi_blacklist[] = {
+	/*
+	 * The CHPN0001 ACPI device, which is used to describe the Chipone
+	 * ICN8505 controller, has a _CID of PNP0C50 but is not HID compatible.
+	 */
+	{"CHPN0001", 0 },
+	{ },
+};
+
+static int i2c_hid_acpi_get_descriptor(struct i2c_client *client)
+{
+	static guid_t i2c_hid_guid =
+		GUID_INIT(0x3CDFF6F7, 0x4267, 0x4555,
+			  0xAD, 0x05, 0xB3, 0x0A, 0x3D, 0x89, 0x38, 0xDE);
+	union acpi_object *obj;
+	struct acpi_device *adev;
+	acpi_handle handle;
+	u16 hid_descriptor_address;
+
+	handle = ACPI_HANDLE(&client->dev);
+	if (!handle || acpi_bus_get_device(handle, &adev)) {
+		dev_err(&client->dev, "Error could not get ACPI device\n");
+		return -ENODEV;
+	}
+
+	if (acpi_match_device_ids(adev, i2c_hid_acpi_blacklist) == 0)
+		return -ENODEV;
+
+	obj = acpi_evaluate_dsm_typed(handle, &i2c_hid_guid, 1, 1, NULL,
+				      ACPI_TYPE_INTEGER);
+	if (!obj) {
+		dev_err(&client->dev, "Error _DSM call to get HID descriptor address failed\n");
+		return -ENODEV;
+	}
+
+	hid_descriptor_address = obj->integer.value;
+	ACPI_FREE(obj);
+
+	return hid_descriptor_address;
+}
+
+static void i2c_hid_acpi_shutdown_tail(struct i2chid_ops *ops)
+{
+	struct i2c_hid_acpi *ihid_acpi =
+		container_of(ops, struct i2c_hid_acpi, ops);
+	struct device *dev = &ihid_acpi->client->dev;
+	acpi_device_set_power(ACPI_COMPANION(dev), ACPI_STATE_D3_COLD);
+}
+
+static int i2c_hid_acpi_probe(struct i2c_client *client,
+			      const struct i2c_device_id *dev_id)
+{
+	struct device *dev = &client->dev;
+	struct i2c_hid_acpi *ihid_acpi;
+	struct acpi_device *adev;
+	u16 hid_descriptor_address;
+	int ret;
+
+	ihid_acpi = devm_kzalloc(&client->dev, sizeof(*ihid_acpi), GFP_KERNEL);
+	if (!ihid_acpi)
+		return -ENOMEM;
+
+	ihid_acpi->client = client;
+	ihid_acpi->ops.shutdown_tail = i2c_hid_acpi_shutdown_tail;
+
+	ret = i2c_hid_acpi_get_descriptor(client);
+	if (ret < 0)
+		return ret;
+	hid_descriptor_address = ret;
+
+	adev = ACPI_COMPANION(dev);
+	if (adev)
+		acpi_device_fix_up_power(adev);
+
+	if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) {
+		device_set_wakeup_capable(dev, true);
+		device_set_wakeup_enable(dev, false);
+	}
+
+	return i2c_hid_core_probe(client, &ihid_acpi->ops,
+				  hid_descriptor_address);
+}
+
+static const struct acpi_device_id i2c_hid_acpi_match[] = {
+	{"ACPI0C50", 0 },
+	{"PNP0C50", 0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(acpi, i2c_hid_acpi_match);
+
+static struct i2c_driver i2c_hid_acpi_driver = {
+	.driver = {
+		.name	= "i2c_hid_acpi",
+		.pm	= &i2c_hid_core_pm,
+		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+		.acpi_match_table = ACPI_PTR(i2c_hid_acpi_match),
+	},
+
+	.probe		= i2c_hid_acpi_probe,
+	.remove		= i2c_hid_core_remove,
+	.shutdown	= i2c_hid_core_shutdown,
+};
+
+module_i2c_driver(i2c_hid_acpi_driver);
+
+MODULE_DESCRIPTION("HID over I2C ACPI driver");
+MODULE_AUTHOR("Benjamin Tissoires <benjamin.tissoires@gmail.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c
index bfe716d7ea44..b64441db29a8 100644
--- a/drivers/hid/i2c-hid/i2c-hid-core.c
+++ b/drivers/hid/i2c-hid/i2c-hid-core.c
@@ -35,11 +35,6 @@
 #include <linux/kernel.h>
 #include <linux/hid.h>
 #include <linux/mutex.h>
-#include <linux/acpi.h>
-#include <linux/of.h>
-#include <linux/regulator/consumer.h>
-
-#include <linux/platform_data/i2c-hid.h>
 
 #include "../hid-ids.h"
 #include "i2c-hid.h"
@@ -156,10 +151,10 @@ struct i2c_hid {
 
 	wait_queue_head_t	wait;		/* For waiting the interrupt */
 
-	struct i2c_hid_platform_data pdata;
-
 	bool			irq_wake_enabled;
 	struct mutex		reset_lock;
+
+	struct i2chid_ops	*ops;
 };
 
 static const struct i2c_hid_quirks {
@@ -884,144 +879,36 @@ static int i2c_hid_fetch_hid_descriptor(struct i2c_hid *ihid)
 	return 0;
 }
 
-#ifdef CONFIG_ACPI
-static const struct acpi_device_id i2c_hid_acpi_blacklist[] = {
-	/*
-	 * The CHPN0001 ACPI device, which is used to describe the Chipone
-	 * ICN8505 controller, has a _CID of PNP0C50 but is not HID compatible.
-	 */
-	{"CHPN0001", 0 },
-	{ },
-};
-
-static int i2c_hid_acpi_pdata(struct i2c_client *client,
-		struct i2c_hid_platform_data *pdata)
-{
-	static guid_t i2c_hid_guid =
-		GUID_INIT(0x3CDFF6F7, 0x4267, 0x4555,
-			  0xAD, 0x05, 0xB3, 0x0A, 0x3D, 0x89, 0x38, 0xDE);
-	union acpi_object *obj;
-	struct acpi_device *adev;
-	acpi_handle handle;
-
-	handle = ACPI_HANDLE(&client->dev);
-	if (!handle || acpi_bus_get_device(handle, &adev)) {
-		dev_err(&client->dev, "Error could not get ACPI device\n");
-		return -ENODEV;
-	}
-
-	if (acpi_match_device_ids(adev, i2c_hid_acpi_blacklist) == 0)
-		return -ENODEV;
-
-	obj = acpi_evaluate_dsm_typed(handle, &i2c_hid_guid, 1, 1, NULL,
-				      ACPI_TYPE_INTEGER);
-	if (!obj) {
-		dev_err(&client->dev, "Error _DSM call to get HID descriptor address failed\n");
-		return -ENODEV;
-	}
-
-	pdata->hid_descriptor_address = obj->integer.value;
-	ACPI_FREE(obj);
-
-	return 0;
-}
-
-static void i2c_hid_acpi_fix_up_power(struct device *dev)
-{
-	struct acpi_device *adev;
-
-	adev = ACPI_COMPANION(dev);
-	if (adev)
-		acpi_device_fix_up_power(adev);
-}
-
-static void i2c_hid_acpi_enable_wakeup(struct device *dev)
-{
-	if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) {
-		device_set_wakeup_capable(dev, true);
-		device_set_wakeup_enable(dev, false);
-	}
-}
-
-static void i2c_hid_acpi_shutdown(struct device *dev)
+static int i2c_hid_core_power_up(struct i2c_hid *ihid)
 {
-	acpi_device_set_power(ACPI_COMPANION(dev), ACPI_STATE_D3_COLD);
-}
+	if (!ihid->ops->power_up)
+		return 0;
 
-static const struct acpi_device_id i2c_hid_acpi_match[] = {
-	{"ACPI0C50", 0 },
-	{"PNP0C50", 0 },
-	{ },
-};
-MODULE_DEVICE_TABLE(acpi, i2c_hid_acpi_match);
-#else
-static inline int i2c_hid_acpi_pdata(struct i2c_client *client,
-		struct i2c_hid_platform_data *pdata)
-{
-	return -ENODEV;
+	return ihid->ops->power_up(ihid->ops);
 }
 
-static inline void i2c_hid_acpi_fix_up_power(struct device *dev) {}
-
-static inline void i2c_hid_acpi_enable_wakeup(struct device *dev) {}
-
-static inline void i2c_hid_acpi_shutdown(struct device *dev) {}
-#endif
-
-#ifdef CONFIG_OF
-static int i2c_hid_of_probe(struct i2c_client *client,
-		struct i2c_hid_platform_data *pdata)
+static void i2c_hid_core_power_down(struct i2c_hid *ihid)
 {
-	struct device *dev = &client->dev;
-	u32 val;
-	int ret;
-
-	ret = of_property_read_u32(dev->of_node, "hid-descr-addr", &val);
-	if (ret) {
-		dev_err(&client->dev, "HID register address not provided\n");
-		return -ENODEV;
-	}
-	if (val >> 16) {
-		dev_err(&client->dev, "Bad HID register address: 0x%08x\n",
-			val);
-		return -EINVAL;
-	}
-	pdata->hid_descriptor_address = val;
-
-	return 0;
-}
+	if (!ihid->ops->power_down)
+		return;
 
-static const struct of_device_id i2c_hid_of_match[] = {
-	{ .compatible = "hid-over-i2c" },
-	{},
-};
-MODULE_DEVICE_TABLE(of, i2c_hid_of_match);
-#else
-static inline int i2c_hid_of_probe(struct i2c_client *client,
-		struct i2c_hid_platform_data *pdata)
-{
-	return -ENODEV;
+	ihid->ops->power_down(ihid->ops);
 }
-#endif
 
-static void i2c_hid_fwnode_probe(struct i2c_client *client,
-				 struct i2c_hid_platform_data *pdata)
+static void i2c_hid_core_shutdown_tail(struct i2c_hid *ihid)
 {
-	u32 val;
+	if (!ihid->ops->shutdown_tail)
+		return;
 
-	if (!device_property_read_u32(&client->dev, "post-power-on-delay-ms",
-				      &val))
-		pdata->post_power_delay_ms = val;
+	ihid->ops->shutdown_tail(ihid->ops);
 }
 
-static int i2c_hid_probe(struct i2c_client *client,
-			 const struct i2c_device_id *dev_id)
+int i2c_hid_core_probe(struct i2c_client *client, struct i2chid_ops *ops,
+		       u16 hid_descriptor_address)
 {
 	int ret;
 	struct i2c_hid *ihid;
 	struct hid_device *hid;
-	__u16 hidRegister;
-	struct i2c_hid_platform_data *platform_data = client->dev.platform_data;
 
 	dbg_hid("HID probe called for i2c 0x%02x\n", client->addr);
 
@@ -1042,44 +929,17 @@ static int i2c_hid_probe(struct i2c_client *client,
 	if (!ihid)
 		return -ENOMEM;
 
-	if (client->dev.of_node) {
-		ret = i2c_hid_of_probe(client, &ihid->pdata);
-		if (ret)
-			return ret;
-	} else if (!platform_data) {
-		ret = i2c_hid_acpi_pdata(client, &ihid->pdata);
-		if (ret)
-			return ret;
-	} else {
-		ihid->pdata = *platform_data;
-	}
-
-	/* Parse platform agnostic common properties from ACPI / device tree */
-	i2c_hid_fwnode_probe(client, &ihid->pdata);
-
-	ihid->pdata.supplies[0].supply = "vdd";
-	ihid->pdata.supplies[1].supply = "vddl";
+	ihid->ops = ops;
 
-	ret = devm_regulator_bulk_get(&client->dev,
-				      ARRAY_SIZE(ihid->pdata.supplies),
-				      ihid->pdata.supplies);
+	ret = i2c_hid_core_power_up(ihid);
 	if (ret)
 		return ret;
 
-	ret = regulator_bulk_enable(ARRAY_SIZE(ihid->pdata.supplies),
-				    ihid->pdata.supplies);
-	if (ret < 0)
-		return ret;
-
-	if (ihid->pdata.post_power_delay_ms)
-		msleep(ihid->pdata.post_power_delay_ms);
-
 	i2c_set_clientdata(client, ihid);
 
 	ihid->client = client;
 
-	hidRegister = ihid->pdata.hid_descriptor_address;
-	ihid->wHIDDescRegister = cpu_to_le16(hidRegister);
+	ihid->wHIDDescRegister = cpu_to_le16(hid_descriptor_address);
 
 	init_waitqueue_head(&ihid->wait);
 	mutex_init(&ihid->reset_lock);
@@ -1089,11 +949,7 @@ static int i2c_hid_probe(struct i2c_client *client,
 	 * real computation later. */
 	ret = i2c_hid_alloc_buffers(ihid, HID_MIN_BUFFER_SIZE);
 	if (ret < 0)
-		goto err_regulator;
-
-	i2c_hid_acpi_fix_up_power(&client->dev);
-
-	i2c_hid_acpi_enable_wakeup(&client->dev);
+		goto err_powered;
 
 	device_enable_async_suspend(&client->dev);
 
@@ -1102,19 +958,19 @@ static int i2c_hid_probe(struct i2c_client *client,
 	if (ret < 0) {
 		dev_dbg(&client->dev, "nothing at this address: %d\n", ret);
 		ret = -ENXIO;
-		goto err_regulator;
+		goto err_powered;
 	}
 
 	ret = i2c_hid_fetch_hid_descriptor(ihid);
 	if (ret < 0) {
 		dev_err(&client->dev,
 			"Failed to fetch the HID Descriptor\n");
-		goto err_regulator;
+		goto err_powered;
 	}
 
 	ret = i2c_hid_init_irq(client);
 	if (ret < 0)
-		goto err_regulator;
+		goto err_powered;
 
 	hid = hid_allocate_device();
 	if (IS_ERR(hid)) {
@@ -1153,14 +1009,14 @@ err_mem_free:
 err_irq:
 	free_irq(client->irq, ihid);
 
-err_regulator:
-	regulator_bulk_disable(ARRAY_SIZE(ihid->pdata.supplies),
-			       ihid->pdata.supplies);
+err_powered:
+	i2c_hid_core_power_down(ihid);
 	i2c_hid_free_buffers(ihid);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(i2c_hid_core_probe);
 
-static int i2c_hid_remove(struct i2c_client *client)
+int i2c_hid_core_remove(struct i2c_client *client)
 {
 	struct i2c_hid *ihid = i2c_get_clientdata(client);
 	struct hid_device *hid;
@@ -1173,24 +1029,25 @@ static int i2c_hid_remove(struct i2c_client *client)
 	if (ihid->bufsize)
 		i2c_hid_free_buffers(ihid);
 
-	regulator_bulk_disable(ARRAY_SIZE(ihid->pdata.supplies),
-			       ihid->pdata.supplies);
+	i2c_hid_core_power_down(ihid);
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(i2c_hid_core_remove);
 
-static void i2c_hid_shutdown(struct i2c_client *client)
+void i2c_hid_core_shutdown(struct i2c_client *client)
 {
 	struct i2c_hid *ihid = i2c_get_clientdata(client);
 
 	i2c_hid_set_power(client, I2C_HID_PWR_SLEEP);
 	free_irq(client->irq, ihid);
 
-	i2c_hid_acpi_shutdown(&client->dev);
+	i2c_hid_core_shutdown_tail(ihid);
 }
+EXPORT_SYMBOL_GPL(i2c_hid_core_shutdown);
 
 #ifdef CONFIG_PM_SLEEP
-static int i2c_hid_suspend(struct device *dev)
+static int i2c_hid_core_suspend(struct device *dev)
 {
 	struct i2c_client *client = to_i2c_client(dev);
 	struct i2c_hid *ihid = i2c_get_clientdata(client);
@@ -1217,14 +1074,13 @@ static int i2c_hid_suspend(struct device *dev)
 			hid_warn(hid, "Failed to enable irq wake: %d\n",
 				wake_status);
 	} else {
-		regulator_bulk_disable(ARRAY_SIZE(ihid->pdata.supplies),
-				       ihid->pdata.supplies);
+		i2c_hid_core_power_down(ihid);
 	}
 
 	return 0;
 }
 
-static int i2c_hid_resume(struct device *dev)
+static int i2c_hid_core_resume(struct device *dev)
 {
 	int ret;
 	struct i2c_client *client = to_i2c_client(dev);
@@ -1233,13 +1089,7 @@ static int i2c_hid_resume(struct device *dev)
 	int wake_status;
 
 	if (!device_may_wakeup(&client->dev)) {
-		ret = regulator_bulk_enable(ARRAY_SIZE(ihid->pdata.supplies),
-					    ihid->pdata.supplies);
-		if (ret)
-			hid_warn(hid, "Failed to enable supplies: %d\n", ret);
-
-		if (ihid->pdata.post_power_delay_ms)
-			msleep(ihid->pdata.post_power_delay_ms);
+		i2c_hid_core_power_up(ihid);
 	} else if (ihid->irq_wake_enabled) {
 		wake_status = disable_irq_wake(client->irq);
 		if (!wake_status)
@@ -1276,34 +1126,10 @@ static int i2c_hid_resume(struct device *dev)
 }
 #endif
 
-static const struct dev_pm_ops i2c_hid_pm = {
-	SET_SYSTEM_SLEEP_PM_OPS(i2c_hid_suspend, i2c_hid_resume)
+const struct dev_pm_ops i2c_hid_core_pm = {
+	SET_SYSTEM_SLEEP_PM_OPS(i2c_hid_core_suspend, i2c_hid_core_resume)
 };
-
-static const struct i2c_device_id i2c_hid_id_table[] = {
-	{ "hid", 0 },
-	{ "hid-over-i2c", 0 },
-	{ },
-};
-MODULE_DEVICE_TABLE(i2c, i2c_hid_id_table);
-
-
-static struct i2c_driver i2c_hid_driver = {
-	.driver = {
-		.name	= "i2c_hid",
-		.pm	= &i2c_hid_pm,
-		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
-		.acpi_match_table = ACPI_PTR(i2c_hid_acpi_match),
-		.of_match_table = of_match_ptr(i2c_hid_of_match),
-	},
-
-	.probe		= i2c_hid_probe,
-	.remove		= i2c_hid_remove,
-	.shutdown	= i2c_hid_shutdown,
-	.id_table	= i2c_hid_id_table,
-};
-
-module_i2c_driver(i2c_hid_driver);
+EXPORT_SYMBOL_GPL(i2c_hid_core_pm);
 
 MODULE_DESCRIPTION("HID over I2C core driver");
 MODULE_AUTHOR("Benjamin Tissoires <benjamin.tissoires@gmail.com>");
diff --git a/drivers/hid/i2c-hid/i2c-hid-of.c b/drivers/hid/i2c-hid/i2c-hid-of.c
new file mode 100644
index 000000000000..4bf7cea92637
--- /dev/null
+++ b/drivers/hid/i2c-hid/i2c-hid-of.c
@@ -0,0 +1,143 @@
+/*
+ * HID over I2C Open Firmware Subclass
+ *
+ * Copyright (c) 2012 Benjamin Tissoires <benjamin.tissoires@gmail.com>
+ * Copyright (c) 2012 Ecole Nationale de l'Aviation Civile, France
+ * Copyright (c) 2012 Red Hat, Inc
+ *
+ * This code was forked out of the core code, which was partly based on
+ * "USB HID support for Linux":
+ *
+ *  Copyright (c) 1999 Andreas Gal
+ *  Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz>
+ *  Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc
+ *  Copyright (c) 2007-2008 Oliver Neukum
+ *  Copyright (c) 2006-2010 Jiri Kosina
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive for
+ * more details.
+ */
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/i2c.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/pm.h>
+#include <linux/regulator/consumer.h>
+
+#include "i2c-hid.h"
+
+struct i2c_hid_of {
+	struct i2chid_ops ops;
+
+	struct i2c_client *client;
+	struct regulator_bulk_data supplies[2];
+	int post_power_delay_ms;
+};
+
+static int i2c_hid_of_power_up(struct i2chid_ops *ops)
+{
+	struct i2c_hid_of *ihid_of = container_of(ops, struct i2c_hid_of, ops);
+	struct device *dev = &ihid_of->client->dev;
+	int ret;
+
+	ret = regulator_bulk_enable(ARRAY_SIZE(ihid_of->supplies),
+				    ihid_of->supplies);
+	if (ret) {
+		dev_warn(dev, "Failed to enable supplies: %d\n", ret);
+		return ret;
+	}
+
+	if (ihid_of->post_power_delay_ms)
+		msleep(ihid_of->post_power_delay_ms);
+
+	return 0;
+}
+
+static void i2c_hid_of_power_down(struct i2chid_ops *ops)
+{
+	struct i2c_hid_of *ihid_of = container_of(ops, struct i2c_hid_of, ops);
+
+	regulator_bulk_disable(ARRAY_SIZE(ihid_of->supplies),
+			       ihid_of->supplies);
+}
+
+static int i2c_hid_of_probe(struct i2c_client *client,
+			    const struct i2c_device_id *dev_id)
+{
+	struct device *dev = &client->dev;
+	struct i2c_hid_of *ihid_of;
+	u16 hid_descriptor_address;
+	int ret;
+	u32 val;
+
+	ihid_of = devm_kzalloc(&client->dev, sizeof(*ihid_of), GFP_KERNEL);
+	if (!ihid_of)
+		return -ENOMEM;
+
+	ihid_of->ops.power_up = i2c_hid_of_power_up;
+	ihid_of->ops.power_down = i2c_hid_of_power_down;
+
+	ret = of_property_read_u32(dev->of_node, "hid-descr-addr", &val);
+	if (ret) {
+		dev_err(&client->dev, "HID register address not provided\n");
+		return -ENODEV;
+	}
+	if (val >> 16) {
+		dev_err(&client->dev, "Bad HID register address: 0x%08x\n",
+			val);
+		return -EINVAL;
+	}
+	hid_descriptor_address = val;
+
+	if (!device_property_read_u32(&client->dev, "post-power-on-delay-ms",
+				      &val))
+		ihid_of->post_power_delay_ms = val;
+
+	ihid_of->supplies[0].supply = "vdd";
+	ihid_of->supplies[1].supply = "vddl";
+	ret = devm_regulator_bulk_get(&client->dev,
+				      ARRAY_SIZE(ihid_of->supplies),
+				      ihid_of->supplies);
+	if (ret)
+		return ret;
+
+	return i2c_hid_core_probe(client, &ihid_of->ops,
+				  hid_descriptor_address);
+}
+
+static const struct of_device_id i2c_hid_of_match[] = {
+	{ .compatible = "hid-over-i2c" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, i2c_hid_of_match);
+
+static const struct i2c_device_id i2c_hid_of_id_table[] = {
+	{ "hid", 0 },
+	{ "hid-over-i2c", 0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(i2c, i2c_hid_of_id_table);
+
+static struct i2c_driver i2c_hid_of_driver = {
+	.driver = {
+		.name	= "i2c_hid_of",
+		.pm	= &i2c_hid_core_pm,
+		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+		.of_match_table = of_match_ptr(i2c_hid_of_match),
+	},
+
+	.probe		= i2c_hid_of_probe,
+	.remove		= i2c_hid_core_remove,
+	.shutdown	= i2c_hid_core_shutdown,
+	.id_table	= i2c_hid_of_id_table,
+};
+
+module_i2c_driver(i2c_hid_of_driver);
+
+MODULE_DESCRIPTION("HID over I2C OF driver");
+MODULE_AUTHOR("Benjamin Tissoires <benjamin.tissoires@gmail.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/hid/i2c-hid/i2c-hid.h b/drivers/hid/i2c-hid/i2c-hid.h
index a8c19aef5824..05a7827d211a 100644
--- a/drivers/hid/i2c-hid/i2c-hid.h
+++ b/drivers/hid/i2c-hid/i2c-hid.h
@@ -3,6 +3,7 @@
 #ifndef I2C_HID_H
 #define I2C_HID_H
 
+#include <linux/i2c.h>
 
 #ifdef CONFIG_DMI
 struct i2c_hid_desc *i2c_hid_get_dmi_i2c_hid_desc_override(uint8_t *i2c_name);
@@ -17,4 +18,25 @@ static inline char *i2c_hid_get_dmi_hid_report_desc_override(uint8_t *i2c_name,
 { return NULL; }
 #endif
 
+/**
+ * struct i2chid_ops - Ops provided to the core.
+ *
+ * @power_up: do sequencing to power up the device.
+ * @power_down: do sequencing to power down the device.
+ * @shutdown_tail: called at the end of shutdown.
+ */
+struct i2chid_ops {
+	int (*power_up)(struct i2chid_ops *ops);
+	void (*power_down)(struct i2chid_ops *ops);
+	void (*shutdown_tail)(struct i2chid_ops *ops);
+};
+
+int i2c_hid_core_probe(struct i2c_client *client, struct i2chid_ops *ops,
+		       u16 hid_descriptor_address);
+int i2c_hid_core_remove(struct i2c_client *client);
+
+void i2c_hid_core_shutdown(struct i2c_client *client);
+
+extern const struct dev_pm_ops i2c_hid_core_pm;
+
 #endif
diff --git a/include/linux/platform_data/i2c-hid.h b/include/linux/platform_data/i2c-hid.h
deleted file mode 100644
index c628bb5e1061..000000000000
--- a/include/linux/platform_data/i2c-hid.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * HID over I2C protocol implementation
- *
- * Copyright (c) 2012 Benjamin Tissoires <benjamin.tissoires@gmail.com>
- * Copyright (c) 2012 Ecole Nationale de l'Aviation Civile, France
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file COPYING in the main directory of this archive for
- * more details.
- */
-
-#ifndef __LINUX_I2C_HID_H
-#define __LINUX_I2C_HID_H
-
-#include <linux/regulator/consumer.h>
-#include <linux/types.h>
-
-/**
- * struct i2chid_platform_data - used by hid over i2c implementation.
- * @hid_descriptor_address: i2c register where the HID descriptor is stored.
- * @supplies: regulators for powering on the device.
- * @post_power_delay_ms: delay after powering on before device is usable.
- *
- * Note that it is the responsibility of the platform driver (or the acpi 5.0
- * driver, or the flattened device tree) to setup the irq related to the gpio in
- * the struct i2c_board_info.
- * The platform driver should also setup the gpio according to the device:
- *
- * A typical example is the following:
- *	irq = gpio_to_irq(intr_gpio);
- *	hkdk4412_i2c_devs5[0].irq = irq; // store the irq in i2c_board_info
- *	gpio_request(intr_gpio, "elan-irq");
- *	s3c_gpio_setpull(intr_gpio, S3C_GPIO_PULL_UP);
- */
-struct i2c_hid_platform_data {
-	u16 hid_descriptor_address;
-	struct regulator_bulk_data supplies[2];
-	int post_power_delay_ms;
-};
-
-#endif /* __LINUX_I2C_HID_H */
-- 
cgit v1.2.3


From f6f1f8e6e3eea25f539105d48166e91f0ab46dd1 Mon Sep 17 00:00:00 2001
From: Aswath Govindraju <a-govindraju@ti.com>
Date: Tue, 5 Jan 2021 16:28:12 +0530
Subject: misc: eeprom_93xx46: Add quirk to support Microchip 93LC46B eeprom

A dummy zero bit is sent preceding the data during a read transfer by the
Microchip 93LC46B eeprom (section 2.7 of[1]). This results in right shift
of data during a read. In order to ignore this bit a quirk can be added to
send an extra zero bit after the read address.

Add a quirk to ignore the zero bit sent before data by adding a zero bit
after the read address.

[1] - https://www.mouser.com/datasheet/2/268/20001749K-277859.pdf

Signed-off-by: Aswath Govindraju <a-govindraju@ti.com>
Link: https://lore.kernel.org/r/20210105105817.17644-3-a-govindraju@ti.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/eeprom/eeprom_93xx46.c | 15 +++++++++++++++
 include/linux/eeprom_93xx46.h       |  2 ++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/drivers/misc/eeprom/eeprom_93xx46.c b/drivers/misc/eeprom/eeprom_93xx46.c
index e130a0d858e9..80114f4c80ad 100644
--- a/drivers/misc/eeprom/eeprom_93xx46.c
+++ b/drivers/misc/eeprom/eeprom_93xx46.c
@@ -35,6 +35,10 @@ static const struct eeprom_93xx46_devtype_data atmel_at93c46d_data = {
 		  EEPROM_93XX46_QUIRK_INSTRUCTION_LENGTH,
 };
 
+static const struct eeprom_93xx46_devtype_data microchip_93lc46b_data = {
+	.quirks = EEPROM_93XX46_QUIRK_EXTRA_READ_CYCLE,
+};
+
 struct eeprom_93xx46_dev {
 	struct spi_device *spi;
 	struct eeprom_93xx46_platform_data *pdata;
@@ -55,6 +59,11 @@ static inline bool has_quirk_instruction_length(struct eeprom_93xx46_dev *edev)
 	return edev->pdata->quirks & EEPROM_93XX46_QUIRK_INSTRUCTION_LENGTH;
 }
 
+static inline bool has_quirk_extra_read_cycle(struct eeprom_93xx46_dev *edev)
+{
+	return edev->pdata->quirks & EEPROM_93XX46_QUIRK_EXTRA_READ_CYCLE;
+}
+
 static int eeprom_93xx46_read(void *priv, unsigned int off,
 			      void *val, size_t count)
 {
@@ -96,6 +105,11 @@ static int eeprom_93xx46_read(void *priv, unsigned int off,
 		dev_dbg(&edev->spi->dev, "read cmd 0x%x, %d Hz\n",
 			cmd_addr, edev->spi->max_speed_hz);
 
+		if (has_quirk_extra_read_cycle(edev)) {
+			cmd_addr <<= 1;
+			bits += 1;
+		}
+
 		spi_message_init(&m);
 
 		t[0].tx_buf = (char *)&cmd_addr;
@@ -363,6 +377,7 @@ static void select_deassert(void *context)
 static const struct of_device_id eeprom_93xx46_of_table[] = {
 	{ .compatible = "eeprom-93xx46", },
 	{ .compatible = "atmel,at93c46d", .data = &atmel_at93c46d_data, },
+	{ .compatible = "microchip,93lc46b", .data = &microchip_93lc46b_data, },
 	{}
 };
 MODULE_DEVICE_TABLE(of, eeprom_93xx46_of_table);
diff --git a/include/linux/eeprom_93xx46.h b/include/linux/eeprom_93xx46.h
index eec7928ff8fe..99580c22f91a 100644
--- a/include/linux/eeprom_93xx46.h
+++ b/include/linux/eeprom_93xx46.h
@@ -16,6 +16,8 @@ struct eeprom_93xx46_platform_data {
 #define EEPROM_93XX46_QUIRK_SINGLE_WORD_READ		(1 << 0)
 /* Instructions such as EWEN are (addrlen + 2) in length. */
 #define EEPROM_93XX46_QUIRK_INSTRUCTION_LENGTH		(1 << 1)
+/* Add extra cycle after address during a read */
+#define EEPROM_93XX46_QUIRK_EXTRA_READ_CYCLE		BIT(2)
 
 	/*
 	 * optional hooks to control additional logic
-- 
cgit v1.2.3


From fcba4b2047a31a55e1cef73849363a3cf7c2736d Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Tue, 5 Jan 2021 17:44:35 +0100
Subject: mhi: unconstify mhi_event_config

Some parameters may have to be determined at runtime.
It is the case for the event ring MSI vector.

Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 54afcae1709a..48f5c9168ae0 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -279,7 +279,7 @@ struct mhi_controller_config {
 	u32 num_channels;
 	const struct mhi_channel_config *ch_cfg;
 	u32 num_events;
-	const struct mhi_event_config *event_cfg;
+	struct mhi_event_config *event_cfg;
 	bool use_bounce_buf;
 	bool m2_no_db;
 };
-- 
cgit v1.2.3


From 6ffcc18d9c0b4522c95aab71ff3ff5a56e699945 Mon Sep 17 00:00:00 2001
From: Carl Huang <cjhuang@codeaurora.org>
Date: Mon, 4 Jan 2021 18:11:28 +0800
Subject: mhi: use irq_flags if controller driver configures it

If controller driver has specified the irq_flags, mhi uses this specified
irq_flags. Otherwise, mhi uses default irq_flags.

The purpose of this change is to support one MSI vector for QCA6390.
MHI will use one same MSI vector too in this scenario.

In case of one MSI vector, IRQ_NO_BALANCING is needed when irq handler
is requested. The reason is if irq migration happens, the msi_data may
change too. However, the msi_data is already programmed to QCA6390
hardware during initialization phase. This msi_data inconsistence will
result in crash in kernel.

Another issue is in case of one MSI vector, IRQF_NO_SUSPEND will trigger
WARNINGS because QCA6390 wants to disable the IRQ during the suspend.

To avoid above two issues, QCA6390 driver specifies the irq_flags in case
of one MSI vector when mhi_register_controller is called.

Signed-off-by: Carl Huang <cjhuang@codeaurora.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/core/init.c | 9 +++++++--
 include/linux/mhi.h         | 2 ++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/mhi/core/init.c b/drivers/bus/mhi/core/init.c
index f0697f433c2f..aa575d3fb3ae 100644
--- a/drivers/bus/mhi/core/init.c
+++ b/drivers/bus/mhi/core/init.c
@@ -151,12 +151,17 @@ int mhi_init_irq_setup(struct mhi_controller *mhi_cntrl)
 {
 	struct mhi_event *mhi_event = mhi_cntrl->mhi_event;
 	struct device *dev = &mhi_cntrl->mhi_dev->dev;
+	unsigned long irq_flags = IRQF_SHARED | IRQF_NO_SUSPEND;
 	int i, ret;
 
+	/* if controller driver has set irq_flags, use it */
+	if (mhi_cntrl->irq_flags)
+		irq_flags = mhi_cntrl->irq_flags;
+
 	/* Setup BHI_INTVEC IRQ */
 	ret = request_threaded_irq(mhi_cntrl->irq[0], mhi_intvec_handler,
 				   mhi_intvec_threaded_handler,
-				   IRQF_SHARED | IRQF_NO_SUSPEND,
+				   irq_flags,
 				   "bhi", mhi_cntrl);
 	if (ret)
 		return ret;
@@ -174,7 +179,7 @@ int mhi_init_irq_setup(struct mhi_controller *mhi_cntrl)
 
 		ret = request_irq(mhi_cntrl->irq[mhi_event->irq],
 				  mhi_irq_handler,
-				  IRQF_SHARED | IRQF_NO_SUSPEND,
+				  irq_flags,
 				  "mhi", mhi_event);
 		if (ret) {
 			dev_err(dev, "Error requesting irq:%d for ev:%d\n",
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 562862ff819c..b7138127664f 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -353,6 +353,7 @@ struct mhi_controller_config {
  * @fbc_download: MHI host needs to do complete image transfer (optional)
  * @pre_init: MHI host needs to do pre-initialization before power up
  * @wake_set: Device wakeup set flag
+ * @irq_flags: irq flags passed to request_irq (optional)
  *
  * Fields marked as (required) need to be populated by the controller driver
  * before calling mhi_register_controller(). For the fields marked as (optional)
@@ -444,6 +445,7 @@ struct mhi_controller {
 	bool fbc_download;
 	bool pre_init;
 	bool wake_set;
+	unsigned long irq_flags;
 };
 
 /**
-- 
cgit v1.2.3


From c52b7c807b0a6ae26582208a0b07c2a6a796b50f Mon Sep 17 00:00:00 2001
From: "Alexander A. Klimov" <grandmaster@al2klimov.de>
Date: Thu, 16 Jul 2020 21:52:27 +0200
Subject: encrypted-keys: Replace HTTP links with HTTPS ones

Rationale:
Reduces attack surface on kernel devs opening the links for MITM
as HTTPS traffic is much harder to manipulate.

Deterministic algorithm:
For each file:
  If not .svg:
    For each line:
      If doesn't contain `\bxmlns\b`:
        For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`:
	  If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`:
            If both the HTTP and HTTPS versions
            return 200 OK and serve the same content:
              Replace HTTP with HTTPS.

Signed-off-by: Alexander A. Klimov <grandmaster@al2klimov.de>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Ben Boeckel <mathstuf@gmail.com>
---
 include/keys/encrypted-type.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/keys/encrypted-type.h b/include/keys/encrypted-type.h
index 38afb341c3f2..abfcbe02001a 100644
--- a/include/keys/encrypted-type.h
+++ b/include/keys/encrypted-type.h
@@ -2,7 +2,7 @@
 /*
  * Copyright (C) 2010 IBM Corporation
  * Copyright (C) 2010 Politecnico di Torino, Italy
- *                    TORSEC group -- http://security.polito.it
+ *                    TORSEC group -- https://security.polito.it
  *
  * Authors:
  * Mimi Zohar <zohar@us.ibm.com>
-- 
cgit v1.2.3


From 464e96aeb16ab4e071d353f69ebbddfa08c8d731 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Fri, 27 Nov 2020 11:15:43 -0800
Subject: keys: remove trailing semicolon in macro definition

The macro use will already have a semicolon.

Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Ben Boeckel <mathstuf@gmail.com>
---
 include/linux/key.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/key.h b/include/linux/key.h
index 0f2e24f13c2b..1b0837c975b9 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -360,7 +360,7 @@ static inline struct key *request_key(struct key_type *type,
  * completion of keys undergoing construction with a non-interruptible wait.
  */
 #define request_key_net(type, description, net, callout_info) \
-	request_key_tag(type, description, net->key_domain, callout_info);
+	request_key_tag(type, description, net->key_domain, callout_info)
 
 /**
  * request_key_net_rcu - Request a key for a net namespace under RCU conditions
@@ -372,7 +372,7 @@ static inline struct key *request_key(struct key_type *type,
  * network namespace are used.
  */
 #define request_key_net_rcu(type, description, net) \
-	request_key_rcu(type, description, net->key_domain);
+	request_key_rcu(type, description, net->key_domain)
 #endif /* CONFIG_NET */
 
 extern int wait_for_key_construction(struct key *key, bool intr);
-- 
cgit v1.2.3


From 09315b2d0d6944b9e249003c04abb88b5594a683 Mon Sep 17 00:00:00 2001
From: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Date: Wed, 18 Nov 2020 20:30:31 +0800
Subject: crypto: public_key: Remove redundant header file from public_key.h

The akcipher.h header file was originally introduced in SM2, and
then the definition of SM2 was moved to the existing code. This
header file is left and should be removed.

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Ben Boeckel <mathstuf@gmail.com>
---
 include/crypto/public_key.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h
index 948c5203ca9c..47accec68cb0 100644
--- a/include/crypto/public_key.h
+++ b/include/crypto/public_key.h
@@ -12,7 +12,6 @@
 
 #include <linux/keyctl.h>
 #include <linux/oid_registry.h>
-#include <crypto/akcipher.h>
 
 /*
  * Cryptographic data for the public-key subtype of the asymmetric key type.
-- 
cgit v1.2.3


From f14602caf4faef18999985bc87a414b552844ad2 Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@linux.microsoft.com>
Date: Fri, 20 Nov 2020 19:04:22 +0100
Subject: PKCS#7: Fix missing include
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add missing linux/types.h for size_t.

[DH: Changed from stddef.h]

Signed-off-by: Mickaël Salaün <mic@linux.microsoft.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Ben Boeckel <mathstuf@gmail.com>
---
 include/linux/verification.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/verification.h b/include/linux/verification.h
index 911ab7c2b1ab..a655923335ae 100644
--- a/include/linux/verification.h
+++ b/include/linux/verification.h
@@ -8,6 +8,8 @@
 #ifndef _LINUX_VERIFICATION_H
 #define _LINUX_VERIFICATION_H
 
+#include <linux/types.h>
+
 /*
  * Indicate that both builtin trusted keys and secondary trusted keys
  * should be used.
-- 
cgit v1.2.3


From 4993e1f9479a4161fd7d93e2b8b30b438f00cb0f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 20 Nov 2020 19:04:23 +0100
Subject: certs: Fix blacklist flag type confusion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KEY_FLAG_KEEP is not meant to be passed to keyring_alloc() or key_alloc(),
as these only take KEY_ALLOC_* flags.  KEY_FLAG_KEEP has the same value as
KEY_ALLOC_BYPASS_RESTRICTION, but fortunately only key_create_or_update()
uses it.  LSMs using the key_alloc hook don't check that flag.

KEY_FLAG_KEEP is then ignored but fortunately (again) the root user cannot
write to the blacklist keyring, so it is not possible to remove a key/hash
from it.

Fix this by adding a KEY_ALLOC_SET_KEEP flag that tells key_alloc() to set
KEY_FLAG_KEEP on the new key.  blacklist_init() can then, correctly, pass
this to keyring_alloc().

We can also use this in ima_mok_init() rather than setting the flag
manually.

Note that this doesn't fix an observable bug with the current
implementation but it is required to allow addition of new hashes to the
blacklist in the future without making it possible for them to be removed.

Fixes: 734114f8782f ("KEYS: Add a system blacklist keyring")
Reported-by: Mickaël Salaün <mic@linux.microsoft.com>
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Mickaël Salaün <mic@linux.microsoft.com>
cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: David Woodhouse <dwmw2@infradead.org>
---
 certs/blacklist.c                | 2 +-
 include/linux/key.h              | 1 +
 security/integrity/ima/ima_mok.c | 5 ++---
 security/keys/key.c              | 2 ++
 4 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/certs/blacklist.c b/certs/blacklist.c
index a888b934a1cd..029471947838 100644
--- a/certs/blacklist.c
+++ b/certs/blacklist.c
@@ -162,7 +162,7 @@ static int __init blacklist_init(void)
 			      KEY_USR_VIEW | KEY_USR_READ |
 			      KEY_USR_SEARCH,
 			      KEY_ALLOC_NOT_IN_QUOTA |
-			      KEY_FLAG_KEEP,
+			      KEY_ALLOC_SET_KEEP,
 			      NULL, NULL);
 	if (IS_ERR(blacklist_keyring))
 		panic("Can't allocate system blacklist keyring\n");
diff --git a/include/linux/key.h b/include/linux/key.h
index 1b0837c975b9..7febc4881363 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -289,6 +289,7 @@ extern struct key *key_alloc(struct key_type *type,
 #define KEY_ALLOC_BUILT_IN		0x0004	/* Key is built into kernel */
 #define KEY_ALLOC_BYPASS_RESTRICTION	0x0008	/* Override the check on restricted keyrings */
 #define KEY_ALLOC_UID_KEYRING		0x0010	/* allocating a user or user session keyring */
+#define KEY_ALLOC_SET_KEEP		0x0020	/* Set the KEEP flag on the key/keyring */
 
 extern void key_revoke(struct key *key);
 extern void key_invalidate(struct key *key);
diff --git a/security/integrity/ima/ima_mok.c b/security/integrity/ima/ima_mok.c
index 36cadadbfba4..1e5c01916173 100644
--- a/security/integrity/ima/ima_mok.c
+++ b/security/integrity/ima/ima_mok.c
@@ -38,13 +38,12 @@ __init int ima_mok_init(void)
 				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
 				KEY_USR_VIEW | KEY_USR_READ |
 				KEY_USR_WRITE | KEY_USR_SEARCH,
-				KEY_ALLOC_NOT_IN_QUOTA,
+				KEY_ALLOC_NOT_IN_QUOTA |
+				KEY_ALLOC_SET_KEEP,
 				restriction, NULL);
 
 	if (IS_ERR(ima_blacklist_keyring))
 		panic("Can't allocate IMA blacklist keyring.");
-
-	set_bit(KEY_FLAG_KEEP, &ima_blacklist_keyring->flags);
 	return 0;
 }
 device_initcall(ima_mok_init);
diff --git a/security/keys/key.c b/security/keys/key.c
index ebe752b137aa..c45afdd1dfbb 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -303,6 +303,8 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 		key->flags |= 1 << KEY_FLAG_BUILTIN;
 	if (flags & KEY_ALLOC_UID_KEYRING)
 		key->flags |= 1 << KEY_FLAG_UID_KEYRING;
+	if (flags & KEY_ALLOC_SET_KEEP)
+		key->flags |= 1 << KEY_FLAG_KEEP;
 
 #ifdef KEY_DEBUGGING
 	key->magic = KEY_DEBUG_MAGIC;
-- 
cgit v1.2.3


From a6435940b62f81a1718bf2bd46a051379fc89b9d Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:20 +0100
Subject: mount: attach mappings to mounts

In order to support per-mount idmappings vfsmounts are marked with user
namespaces. The idmapping of the user namespace will be used to map the
ids of vfs objects when they are accessed through that mount. By default
all vfsmounts are marked with the initial user namespace. The initial
user namespace is used to indicate that a mount is not idmapped. All
operations behave as before.

Based on prior discussions we want to attach the whole user namespace
and not just a dedicated idmapping struct. This allows us to reuse all
the helpers that already exist for dealing with idmappings instead of
introducing a whole new range of helpers. In addition, if we decide in
the future that we are confident enough to enable unprivileged users to
setup idmapped mounts the permission checking can take into account
whether the caller is privileged in the user namespace the mount is
currently marked with.
Later patches enforce that once a mount has been idmapped it can't be
remapped. This keeps permission checking and life-cycle management
simple. Users wanting to change the idmapped can always create a new
detached mount with a different idmapping.

Add a new mnt_userns member to vfsmount and two simple helpers to
retrieve the mnt_userns from vfsmounts and files.

The idea to attach user namespaces to vfsmounts has been floated around
in various forms at Linux Plumbers in ~2018 with the original idea
tracing back to a discussion in 2017 at a conference in St. Petersburg
between Christoph, Tycho, and myself.

Link: https://lore.kernel.org/r/20210121131959.646623-2-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/namespace.c        | 9 +++++++++
 include/linux/fs.h    | 6 ++++++
 include/linux/mount.h | 6 ++++++
 3 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index 9d33909d0f9e..ecdc63ef881c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -210,6 +210,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
 		INIT_LIST_HEAD(&mnt->mnt_umounting);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
+		mnt->mnt.mnt_userns = &init_user_ns;
 	}
 	return mnt;
 
@@ -547,6 +548,11 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 
 static void free_vfsmnt(struct mount *mnt)
 {
+	struct user_namespace *mnt_userns;
+
+	mnt_userns = mnt_user_ns(&mnt->mnt);
+	if (mnt_userns != &init_user_ns)
+		put_user_ns(mnt_userns);
 	kfree_const(mnt->mnt_devname);
 #ifdef CONFIG_SMP
 	free_percpu(mnt->mnt_pcp);
@@ -1055,6 +1061,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
 
 	atomic_inc(&sb->s_active);
+	mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt);
+	if (mnt->mnt.mnt_userns != &init_user_ns)
+		mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns);
 	mnt->mnt.mnt_sb = sb;
 	mnt->mnt.mnt_root = dget(root);
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd47deea7c17..fd0b80e6361d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -39,6 +39,7 @@
 #include <linux/fs_types.h>
 #include <linux/build_bug.h>
 #include <linux/stddef.h>
+#include <linux/mount.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -2231,6 +2232,7 @@ struct file_system_type {
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_DISALLOW_NOTIFY_PERM	16	/* Disable fanotify permission events */
+#define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
 #define FS_THP_SUPPORT		8192	/* Remove once all fs converted */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
@@ -2517,6 +2519,10 @@ struct filename {
 };
 static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
 
+static inline struct user_namespace *file_mnt_user_ns(struct file *file)
+{
+	return mnt_user_ns(file->f_path.mnt);
+}
 extern long vfs_truncate(const struct path *, loff_t);
 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
 		       struct file *filp);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index aaf343b38671..52de25e08319 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -72,8 +72,14 @@ struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */
 	struct super_block *mnt_sb;	/* pointer to superblock */
 	int mnt_flags;
+	struct user_namespace *mnt_userns;
 } __randomize_layout;
 
+static inline struct user_namespace *mnt_user_ns(const struct vfsmount *mnt)
+{
+	return mnt->mnt_userns;
+}
+
 struct file; /* forward dec */
 struct path;
 
-- 
cgit v1.2.3


From e6c9a71451560edba343cbcbd500bea0a188f0d1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:21 +0100
Subject: fs: add id translation helpers

Add simple helpers to make it easy to map kuids into and from idmapped
mounts. We provide simple wrappers that filesystems can use to e.g.
initialize inodes similar to i_{uid,gid}_read() and i_{uid,gid}_write().
Accessing an inode through an idmapped mount maps the i_uid and i_gid of
the inode to the mount's user namespace. If the fsids are used to
initialize inodes they are unmapped according to the mount's user
namespace. Passing the initial user namespace to these helpers makes
them a nop and so any non-idmapped paths will not be impacted.

Link: https://lore.kernel.org/r/20210121131959.646623-3-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fs.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd0b80e6361d..3165998e2294 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -40,6 +40,7 @@
 #include <linux/build_bug.h>
 #include <linux/stddef.h>
 #include <linux/mount.h>
+#include <linux/cred.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1573,6 +1574,52 @@ static inline void i_gid_write(struct inode *inode, gid_t gid)
 	inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
 }
 
+static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns,
+				   kuid_t kuid)
+{
+	return make_kuid(mnt_userns, __kuid_val(kuid));
+}
+
+static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns,
+				   kgid_t kgid)
+{
+	return make_kgid(mnt_userns, __kgid_val(kgid));
+}
+
+static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
+				    const struct inode *inode)
+{
+	return kuid_into_mnt(mnt_userns, inode->i_uid);
+}
+
+static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
+				    const struct inode *inode)
+{
+	return kgid_into_mnt(mnt_userns, inode->i_gid);
+}
+
+static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns,
+				   kuid_t kuid)
+{
+	return KUIDT_INIT(from_kuid(mnt_userns, kuid));
+}
+
+static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns,
+				   kgid_t kgid)
+{
+	return KGIDT_INIT(from_kgid(mnt_userns, kgid));
+}
+
+static inline kuid_t fsuid_into_mnt(struct user_namespace *mnt_userns)
+{
+	return kuid_from_mnt(mnt_userns, current_fsuid());
+}
+
+static inline kgid_t fsgid_into_mnt(struct user_namespace *mnt_userns)
+{
+	return kgid_from_mnt(mnt_userns, current_fsgid());
+}
+
 extern struct timespec64 current_time(struct inode *inode);
 
 /*
-- 
cgit v1.2.3


From 02f92b3868a1b34ab98464e76b0e4e060474ba10 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:22 +0100
Subject: fs: add file and path permissions helpers

Add two simple helpers to check permissions on a file and path
respectively and convert over some callers. It simplifies quite a few
codepaths and also reduces the churn in later patches quite a bit.
Christoph also correctly points out that this makes codepaths (e.g.
ioctls) way easier to follow that would otherwise have to do more
complex argument passing than necessary.

Link: https://lore.kernel.org/r/20210121131959.646623-4-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/init.c                          | 6 +++---
 fs/notify/fanotify/fanotify_user.c | 2 +-
 fs/notify/inotify/inotify_user.c   | 2 +-
 fs/open.c                          | 6 +++---
 fs/udf/file.c                      | 2 +-
 fs/verity/enable.c                 | 2 +-
 include/linux/fs.h                 | 8 ++++++++
 kernel/bpf/inode.c                 | 2 +-
 kernel/sys.c                       | 2 +-
 mm/madvise.c                       | 2 +-
 mm/memcontrol.c                    | 2 +-
 mm/mincore.c                       | 2 +-
 net/unix/af_unix.c                 | 2 +-
 13 files changed, 24 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/fs/init.c b/fs/init.c
index e9c320a48cf1..02723bea8499 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -49,7 +49,7 @@ int __init init_chdir(const char *filename)
 	error = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
 	if (error)
 		return error;
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
 	if (!error)
 		set_fs_pwd(current->fs, &path);
 	path_put(&path);
@@ -64,7 +64,7 @@ int __init init_chroot(const char *filename)
 	error = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
 	if (error)
 		return error;
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 	error = -EPERM;
@@ -118,7 +118,7 @@ int __init init_eaccess(const char *filename)
 	error = kern_path(filename, LOOKUP_FOLLOW, &path);
 	if (error)
 		return error;
-	error = inode_permission(d_inode(path.dentry), MAY_ACCESS);
+	error = path_permission(&path, MAY_ACCESS);
 	path_put(&path);
 	return error;
 }
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index dcab112e1f00..64cfc1a3015d 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -702,7 +702,7 @@ static int fanotify_find_path(int dfd, const char __user *filename,
 	}
 
 	/* you can only watch an inode if you have read permissions on it */
-	ret = inode_permission(path->dentry->d_inode, MAY_READ);
+	ret = path_permission(path, MAY_READ);
 	if (ret) {
 		path_put(path);
 		goto out;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 59c177011a0f..e1155d32ef6f 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -352,7 +352,7 @@ static int inotify_find_inode(const char __user *dirname, struct path *path,
 	if (error)
 		return error;
 	/* you can only watch an inode if you have read permissions on it */
-	error = inode_permission(path->dentry->d_inode, MAY_READ);
+	error = path_permission(path, MAY_READ);
 	if (error) {
 		path_put(path);
 		return error;
diff --git a/fs/open.c b/fs/open.c
index 1e06e443a565..cd1efd254cad 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -492,7 +492,7 @@ retry:
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 
@@ -521,7 +521,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 	if (!d_can_lookup(f.file->f_path.dentry))
 		goto out_putf;
 
-	error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR);
+	error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
 	if (!error)
 		set_fs_pwd(current->fs, &f.file->f_path);
 out_putf:
@@ -540,7 +540,7 @@ retry:
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 
diff --git a/fs/udf/file.c b/fs/udf/file.c
index ad8eefad27d7..3671a40ed3c3 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -183,7 +183,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	long old_block, new_block;
 	int result;
 
-	if (inode_permission(inode, MAY_READ) != 0) {
+	if (file_permission(filp, MAY_READ) != 0) {
 		udf_debug("no permission to access inode %lu\n", inode->i_ino);
 		return -EPERM;
 	}
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index f7e997a01ad0..77e159a0346b 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -369,7 +369,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
 	 * has verity enabled, and to stabilize the data being hashed.
 	 */
 
-	err = inode_permission(inode, MAY_WRITE);
+	err = file_permission(filp, MAY_WRITE);
 	if (err)
 		return err;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3165998e2294..bcd17097d441 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2812,6 +2812,14 @@ static inline int bmap(struct inode *inode,  sector_t *block)
 extern int notify_change(struct dentry *, struct iattr *, struct inode **);
 extern int inode_permission(struct inode *, int);
 extern int generic_permission(struct inode *, int);
+static inline int file_permission(struct file *file, int mask)
+{
+	return inode_permission(file_inode(file), mask);
+}
+static inline int path_permission(const struct path *path, int mask)
+{
+	return inode_permission(d_inode(path->dentry), mask);
+}
 extern int __check_sticky(struct inode *dir, struct inode *inode);
 
 static inline bool execute_ok(struct inode *inode)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index dd4b7fd60ee7..8962f139521e 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -507,7 +507,7 @@ static void *bpf_obj_do_get(const char __user *pathname,
 		return ERR_PTR(ret);
 
 	inode = d_backing_inode(path.dentry);
-	ret = inode_permission(inode, ACC_MODE(flags));
+	ret = path_permission(&path, ACC_MODE(flags));
 	if (ret)
 		goto out;
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 51f00fe20e4d..138fb253b344 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1848,7 +1848,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 	if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
 		goto exit;
 
-	err = inode_permission(inode, MAY_EXEC);
+	err = file_permission(exe.file, MAY_EXEC);
 	if (err)
 		goto exit;
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 6a660858784b..175c5582d8a9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -540,7 +540,7 @@ static inline bool can_do_pageout(struct vm_area_struct *vma)
 	 * opens a side channel.
 	 */
 	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
-		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
 static long madvise_pageout(struct vm_area_struct *vma,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 605f671203ef..cf9076f58582 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4899,7 +4899,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 
 	/* the process need read permission on control file */
 	/* AV: shouldn't we check that it's been opened for read instead? */
-	ret = inode_permission(file_inode(cfile.file), MAY_READ);
+	ret = file_permission(cfile.file, MAY_READ);
 	if (ret < 0)
 		goto out_put_cfile;
 
diff --git a/mm/mincore.c b/mm/mincore.c
index 02db1a834021..7bdb4673f776 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -167,7 +167,7 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
 	 * mappings, which opens a side channel.
 	 */
 	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
-		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
 static const struct mm_walk_ops mincore_walk_ops = {
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 41c3303c3357..18453d15dddf 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -936,7 +936,7 @@ static struct sock *unix_find_other(struct net *net,
 		if (err)
 			goto fail;
 		inode = d_backing_inode(path.dentry);
-		err = inode_permission(inode, MAY_WRITE);
+		err = path_permission(&path, MAY_WRITE);
 		if (err)
 			goto put_fail;
 
-- 
cgit v1.2.3


From 0558c1bf5a0811bf5e3753eed911a15b9bd08271 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:23 +0100
Subject: capability: handle idmapped mounts

In order to determine whether a caller holds privilege over a given
inode the capability framework exposes the two helpers
privileged_wrt_inode_uidgid() and capable_wrt_inode_uidgid(). The former
verifies that the inode has a mapping in the caller's user namespace and
the latter additionally verifies that the caller has the requested
capability in their current user namespace.
If the inode is accessed through an idmapped mount map it into the
mount's user namespace. Afterwards the checks are identical to
non-idmapped inodes. If the initial user namespace is passed all
operations are a nop so non-idmapped mounts will not see a change in
behavior.

Link: https://lore.kernel.org/r/20210121131959.646623-5-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/attr.c                  |  8 ++++----
 fs/exec.c                  |  3 ++-
 fs/inode.c                 |  3 ++-
 fs/namei.c                 | 13 ++++++++-----
 fs/overlayfs/super.c       |  2 +-
 fs/posix_acl.c             |  2 +-
 fs/xfs/xfs_ioctl.c         |  2 +-
 include/linux/capability.h |  7 +++++--
 kernel/capability.c        | 14 +++++++++-----
 security/commoncap.c       |  5 +++--
 10 files changed, 36 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/fs/attr.c b/fs/attr.c
index b4bbdbd4c8ca..d270f640a192 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -23,7 +23,7 @@ static bool chown_ok(const struct inode *inode, kuid_t uid)
 	if (uid_eq(current_fsuid(), inode->i_uid) &&
 	    uid_eq(uid, inode->i_uid))
 		return true;
-	if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+	if (capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_CHOWN))
 		return true;
 	if (uid_eq(inode->i_uid, INVALID_UID) &&
 	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
@@ -36,7 +36,7 @@ static bool chgrp_ok(const struct inode *inode, kgid_t gid)
 	if (uid_eq(current_fsuid(), inode->i_uid) &&
 	    (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
 		return true;
-	if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+	if (capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_CHOWN))
 		return true;
 	if (gid_eq(inode->i_gid, INVALID_GID) &&
 	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
@@ -92,7 +92,7 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
 		/* Also check the setgid bit! */
 		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
 				inode->i_gid) &&
-		    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+		    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
 			attr->ia_mode &= ~S_ISGID;
 	}
 
@@ -193,7 +193,7 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
 		umode_t mode = attr->ia_mode;
 
 		if (!in_group_p(inode->i_gid) &&
-		    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+		    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
 			mode &= ~S_ISGID;
 		inode->i_mode = mode;
 	}
diff --git a/fs/exec.c b/fs/exec.c
index 5d4d52039105..89d4780ff48f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1411,7 +1411,8 @@ void would_dump(struct linux_binprm *bprm, struct file *file)
 		/* Ensure mm->user_ns contains the executable */
 		user_ns = old = bprm->mm->user_ns;
 		while ((user_ns != &init_user_ns) &&
-		       !privileged_wrt_inode_uidgid(user_ns, inode))
+		       !privileged_wrt_inode_uidgid(user_ns, &init_user_ns,
+						    inode))
 			user_ns = user_ns->parent;
 
 		if (old != user_ns) {
diff --git a/fs/inode.c b/fs/inode.c
index 6442d97d9a4a..cd40cbf87ce4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2146,7 +2146,8 @@ void inode_init_owner(struct inode *inode, const struct inode *dir,
 			mode |= S_ISGID;
 		else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
 			 !in_group_p(inode->i_gid) &&
-			 !capable_wrt_inode_uidgid(dir, CAP_FSETID))
+			 !capable_wrt_inode_uidgid(&init_user_ns, dir,
+						   CAP_FSETID))
 			mode &= ~S_ISGID;
 	} else
 		inode->i_gid = current_fsgid();
diff --git a/fs/namei.c b/fs/namei.c
index 78443a85480a..fd4724bce4f5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -357,10 +357,11 @@ int generic_permission(struct inode *inode, int mask)
 	if (S_ISDIR(inode->i_mode)) {
 		/* DACs are overridable for directories */
 		if (!(mask & MAY_WRITE))
-			if (capable_wrt_inode_uidgid(inode,
+			if (capable_wrt_inode_uidgid(&init_user_ns, inode,
 						     CAP_DAC_READ_SEARCH))
 				return 0;
-		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+		if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+					     CAP_DAC_OVERRIDE))
 			return 0;
 		return -EACCES;
 	}
@@ -370,7 +371,8 @@ int generic_permission(struct inode *inode, int mask)
 	 */
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 	if (mask == MAY_READ)
-		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
+		if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+					     CAP_DAC_READ_SEARCH))
 			return 0;
 	/*
 	 * Read/write DACs are always overridable.
@@ -378,7 +380,8 @@ int generic_permission(struct inode *inode, int mask)
 	 * at least one exec bit set.
 	 */
 	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
-		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+		if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+					     CAP_DAC_OVERRIDE))
 			return 0;
 
 	return -EACCES;
@@ -2659,7 +2662,7 @@ int __check_sticky(struct inode *dir, struct inode *inode)
 		return 0;
 	if (uid_eq(dir->i_uid, fsuid))
 		return 0;
-	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
+	return !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FOWNER);
 }
 EXPORT_SYMBOL(__check_sticky);
 
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 2bd570cbe8a4..88d877787770 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1017,7 +1017,7 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
 	if (unlikely(inode->i_mode & S_ISGID) &&
 	    handler->flags == ACL_TYPE_ACCESS &&
 	    !in_group_p(inode->i_gid) &&
-	    !capable_wrt_inode_uidgid(inode, CAP_FSETID)) {
+	    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
 		struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
 
 		err = ovl_setattr(dentry, &iattr);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 95882b3f5f62..4ca6d53c6f0a 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -656,7 +656,7 @@ int posix_acl_update_mode(struct inode *inode, umode_t *mode_p,
 	if (error == 0)
 		*acl = NULL;
 	if (!in_group_p(inode->i_gid) &&
-	    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+	    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
 		mode &= ~S_ISGID;
 	*mode_p = mode;
 	return 0;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 3fbd98f61ea5..97bd29fc8c43 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1502,7 +1502,7 @@ xfs_ioctl_setattr(
 	 */
 
 	if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) &&
-	    !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
+	    !capable_wrt_inode_uidgid(&init_user_ns, VFS_I(ip), CAP_FSETID))
 		VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
 
 	/* Change the ownerships and register project quota modifications */
diff --git a/include/linux/capability.h b/include/linux/capability.h
index b2f698915c0f..62bfa3ed84d7 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -247,8 +247,11 @@ static inline bool ns_capable_setid(struct user_namespace *ns, int cap)
 	return true;
 }
 #endif /* CONFIG_MULTIUSER */
-extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode);
-extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
+				 struct user_namespace *mnt_userns,
+				 const struct inode *inode);
+bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
+			      const struct inode *inode, int cap);
 extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
 extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
 static inline bool perfmon_capable(void)
diff --git a/kernel/capability.c b/kernel/capability.c
index de7eac903a2a..46a361dde042 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -484,10 +484,12 @@ EXPORT_SYMBOL(file_ns_capable);
  *
  * Return true if the inode uid and gid are within the namespace.
  */
-bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
+				 struct user_namespace *mnt_userns,
+				 const struct inode *inode)
 {
-	return kuid_has_mapping(ns, inode->i_uid) &&
-		kgid_has_mapping(ns, inode->i_gid);
+	return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) &&
+	       kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode));
 }
 
 /**
@@ -499,11 +501,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *
  * its own user namespace and that the given inode's uid and gid are
  * mapped into the current user namespace.
  */
-bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
+bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
+			      const struct inode *inode, int cap)
 {
 	struct user_namespace *ns = current_user_ns();
 
-	return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
+	return ns_capable(ns, cap) &&
+	       privileged_wrt_inode_uidgid(ns, mnt_userns, inode);
 }
 EXPORT_SYMBOL(capable_wrt_inode_uidgid);
 
diff --git a/security/commoncap.c b/security/commoncap.c
index bacc1111d871..88ee345f7bfa 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -489,7 +489,7 @@ int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size)
 		return -EINVAL;
 	if (!validheader(size, cap))
 		return -EINVAL;
-	if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+	if (!capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_SETFCAP))
 		return -EPERM;
 	if (size == XATTR_CAPS_SZ_2)
 		if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
@@ -956,7 +956,8 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
 		struct inode *inode = d_backing_inode(dentry);
 		if (!inode)
 			return -EINVAL;
-		if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+		if (!capable_wrt_inode_uidgid(&init_user_ns, inode,
+					      CAP_SETFCAP))
 			return -EPERM;
 		return 0;
 	}
-- 
cgit v1.2.3


From 47291baa8ddfdae10663624ff0a15ab165952708 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:24 +0100
Subject: namei: make permission helpers idmapped mount aware

The two helpers inode_permission() and generic_permission() are used by
the vfs to perform basic permission checking by verifying that the
caller is privileged over an inode. In order to handle idmapped mounts
we extend the two helpers with an additional user namespace argument.
On idmapped mounts the two helpers will make sure to map the inode
according to the mount's user namespace and then peform identical
permission checks to inode_permission() and generic_permission(). If the
initial user namespace is passed nothing changes so non-idmapped mounts
will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-6-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/attr.c                 |   3 +-
 fs/btrfs/inode.c          |   2 +-
 fs/btrfs/ioctl.c          |  12 +++--
 fs/ceph/inode.c           |   2 +-
 fs/cifs/cifsfs.c          |   2 +-
 fs/configfs/symlink.c     |   3 +-
 fs/ecryptfs/inode.c       |   3 +-
 fs/exec.c                 |   2 +-
 fs/fuse/dir.c             |   5 +-
 fs/gfs2/inode.c           |   2 +-
 fs/hostfs/hostfs_kern.c   |   2 +-
 fs/kernfs/inode.c         |   2 +-
 fs/libfs.c                |   7 ++-
 fs/namei.c                | 121 ++++++++++++++++++++++++++++++++--------------
 fs/nfs/dir.c              |   2 +-
 fs/nfsd/nfsfh.c           |   3 +-
 fs/nfsd/vfs.c             |   5 +-
 fs/nilfs2/inode.c         |   2 +-
 fs/ocfs2/file.c           |   2 +-
 fs/ocfs2/refcounttree.c   |   4 +-
 fs/open.c                 |   4 +-
 fs/orangefs/inode.c       |   2 +-
 fs/overlayfs/file.c       |   2 +-
 fs/overlayfs/inode.c      |   4 +-
 fs/overlayfs/util.c       |   2 +-
 fs/posix_acl.c            |  17 +++++--
 fs/proc/base.c            |   4 +-
 fs/proc/fd.c              |   2 +-
 fs/reiserfs/xattr.c       |   2 +-
 fs/remap_range.c          |   2 +-
 fs/xattr.c                |   2 +-
 include/linux/fs.h        |  10 ++--
 include/linux/posix_acl.h |   7 ++-
 ipc/mqueue.c              |   2 +-
 kernel/bpf/inode.c        |   2 +-
 kernel/cgroup/cgroup.c    |   2 +-
 36 files changed, 164 insertions(+), 88 deletions(-)

(limited to 'include')

diff --git a/fs/attr.c b/fs/attr.c
index d270f640a192..c9e29e589cec 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -244,7 +244,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 			return -EPERM;
 
 		if (!inode_owner_or_capable(inode)) {
-			error = inode_permission(inode, MAY_WRITE);
+			error = inode_permission(&init_user_ns, inode,
+						 MAY_WRITE);
 			if (error)
 				return error;
 		}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a8e0a6b038d3..512ee2650bbb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9889,7 +9889,7 @@ static int btrfs_permission(struct inode *inode, int mask)
 		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
 			return -EACCES;
 	}
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dde49a791f3e..8ced6dfefee4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -922,7 +922,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 	BUG_ON(d_inode(victim->d_parent) != dir);
 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
-	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 	if (IS_APPEND(dir))
@@ -951,7 +951,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
 		return -EEXIST;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
-	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 }
 
 /*
@@ -2538,7 +2538,8 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
 				ret = PTR_ERR(temp_inode);
 				goto out_put;
 			}
-			ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);
+			ret = inode_permission(&init_user_ns, temp_inode,
+					       MAY_READ | MAY_EXEC);
 			iput(temp_inode);
 			if (ret) {
 				ret = -EACCES;
@@ -3068,7 +3069,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		if (root == dest)
 			goto out_dput;
 
-		err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
+		err = inode_permission(&init_user_ns, inode,
+				       MAY_WRITE | MAY_EXEC);
 		if (err)
 			goto out_dput;
 	}
@@ -3139,7 +3141,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 		 * running and allows defrag on files open in read-only mode.
 		 */
 		if (!capable(CAP_SYS_ADMIN) &&
-		    inode_permission(inode, MAY_WRITE)) {
+		    inode_permission(&init_user_ns, inode, MAY_WRITE)) {
 			ret = -EPERM;
 			goto out;
 		}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index adc8fc3c5d85..e8a15ee09bc1 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2331,7 +2331,7 @@ int ceph_permission(struct inode *inode, int mask)
 	err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
 
 	if (!err)
-		err = generic_permission(inode, mask);
+		err = generic_permission(&init_user_ns, inode, mask);
 	return err;
 }
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ce0d0037fd0a..ce14e6f8adb6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -320,7 +320,7 @@ static int cifs_permission(struct inode *inode, int mask)
 		on the client (above and beyond ACL on servers) for
 		servers which do not support setting and viewing mode bits,
 		so allowing client to check permissions is useful */
-		return generic_permission(inode, mask);
+		return generic_permission(&init_user_ns, inode, mask);
 }
 
 static struct kmem_cache *cifs_inode_cachep;
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index cb61467478ca..8ca36394fa30 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -197,7 +197,8 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
 	if (dentry->d_inode || d_unhashed(dentry))
 		ret = -EEXIST;
 	else
-		ret = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+		ret = inode_permission(&init_user_ns, dir,
+				       MAY_WRITE | MAY_EXEC);
 	if (!ret)
 		ret = type->ct_item_ops->allow_link(parent_item, target_item);
 	if (!ret) {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e23752d9a79f..0b346baf110d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -864,7 +864,8 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 static int
 ecryptfs_permission(struct inode *inode, int mask)
 {
-	return inode_permission(ecryptfs_inode_to_lower(inode), mask);
+	return inode_permission(&init_user_ns,
+				ecryptfs_inode_to_lower(inode), mask);
 }
 
 /**
diff --git a/fs/exec.c b/fs/exec.c
index 89d4780ff48f..a8ec371cd3cd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1404,7 +1404,7 @@ EXPORT_SYMBOL(begin_new_exec);
 void would_dump(struct linux_binprm *bprm, struct file *file)
 {
 	struct inode *inode = file_inode(file);
-	if (inode_permission(inode, MAY_READ) < 0) {
+	if (inode_permission(&init_user_ns, inode, MAY_READ) < 0) {
 		struct user_namespace *old, *user_ns;
 		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 78f9f209078c..7497009a5a45 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1280,7 +1280,7 @@ static int fuse_permission(struct inode *inode, int mask)
 	}
 
 	if (fc->default_permissions) {
-		err = generic_permission(inode, mask);
+		err = generic_permission(&init_user_ns, inode, mask);
 
 		/* If permission is denied, try to refresh file
 		   attributes.  This is also needed, because the root
@@ -1288,7 +1288,8 @@ static int fuse_permission(struct inode *inode, int mask)
 		if (err == -EACCES && !refreshed) {
 			err = fuse_perm_getattr(inode, mask);
 			if (!err)
-				err = generic_permission(inode, mask);
+				err = generic_permission(&init_user_ns,
+							 inode, mask);
 		}
 
 		/* Note: the opposite of the above test does not
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c1b77e8d6b1c..5b2ff0c74b67 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1852,7 +1852,7 @@ int gfs2_permission(struct inode *inode, int mask)
 	if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
 		error = -EPERM;
 	else
-		error = generic_permission(inode, mask);
+		error = generic_permission(&init_user_ns, inode, mask);
 	if (gfs2_holder_initialized(&i_gh))
 		gfs2_glock_dq_uninit(&i_gh);
 
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index aea35459d390..b841a05a2b8c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -779,7 +779,7 @@ static int hostfs_permission(struct inode *ino, int desired)
 		err = access_file(name, r, w, x);
 	__putname(name);
 	if (!err)
-		err = generic_permission(ino, desired);
+		err = generic_permission(&init_user_ns, ino, desired);
 	return err;
 }
 
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index fc2469a20fed..ff5598cc1de0 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -285,7 +285,7 @@ int kernfs_iop_permission(struct inode *inode, int mask)
 	kernfs_refresh_inode(kn, inode);
 	mutex_unlock(&kernfs_mutex);
 
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
diff --git a/fs/libfs.c b/fs/libfs.c
index d1c3bade9f30..f8b3c02b4f0f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1318,9 +1318,14 @@ static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t siz
 	return -EOPNOTSUPP;
 }
 
+static int empty_dir_permission(struct inode *inode, int mask)
+{
+	return generic_permission(&init_user_ns, inode, mask);
+}
+
 static const struct inode_operations empty_dir_inode_operations = {
 	.lookup		= empty_dir_lookup,
-	.permission	= generic_permission,
+	.permission	= empty_dir_permission,
 	.setattr	= empty_dir_setattr,
 	.getattr	= empty_dir_getattr,
 	.listxattr	= empty_dir_listxattr,
diff --git a/fs/namei.c b/fs/namei.c
index fd4724bce4f5..d78d74f5f5af 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -259,7 +259,24 @@ void putname(struct filename *name)
 		__putname(name);
 }
 
-static int check_acl(struct inode *inode, int mask)
+/**
+ * check_acl - perform ACL permission checking
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @inode:	inode to check permissions on
+ * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
+ *
+ * This function performs the ACL permission checking. Since this function
+ * retrieve POSIX acls it needs to know whether it is called from a blocking or
+ * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+static int check_acl(struct user_namespace *mnt_userns,
+		     struct inode *inode, int mask)
 {
 #ifdef CONFIG_FS_POSIX_ACL
 	struct posix_acl *acl;
@@ -271,14 +288,14 @@ static int check_acl(struct inode *inode, int mask)
 		/* no ->get_acl() calls in RCU mode... */
 		if (is_uncached_acl(acl))
 			return -ECHILD;
-	        return posix_acl_permission(inode, acl, mask);
+	        return posix_acl_permission(mnt_userns, inode, acl, mask);
 	}
 
 	acl = get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl) {
-	        int error = posix_acl_permission(inode, acl, mask);
+	        int error = posix_acl_permission(mnt_userns, inode, acl, mask);
 	        posix_acl_release(acl);
 	        return error;
 	}
@@ -287,18 +304,31 @@ static int check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-/*
- * This does the basic UNIX permission checking.
+/**
+ * acl_permission_check - perform basic UNIX permission checking
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @inode:	inode to check permissions on
+ * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
+ *
+ * This function performs the basic UNIX permission checking. Since this
+ * function may retrieve POSIX acls it needs to know whether it is called from a
+ * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
  *
- * Note that the POSIX ACL check cares about the MAY_NOT_BLOCK bit,
- * for RCU walking.
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
  */
-static int acl_permission_check(struct inode *inode, int mask)
+static int acl_permission_check(struct user_namespace *mnt_userns,
+				struct inode *inode, int mask)
 {
 	unsigned int mode = inode->i_mode;
+	kuid_t i_uid;
 
 	/* Are we the owner? If so, ACL's don't matter */
-	if (likely(uid_eq(current_fsuid(), inode->i_uid))) {
+	i_uid = i_uid_into_mnt(mnt_userns, inode);
+	if (likely(uid_eq(current_fsuid(), i_uid))) {
 		mask &= 7;
 		mode >>= 6;
 		return (mask & ~mode) ? -EACCES : 0;
@@ -306,7 +336,7 @@ static int acl_permission_check(struct inode *inode, int mask)
 
 	/* Do we have ACL's? */
 	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
-		int error = check_acl(inode, mask);
+		int error = check_acl(mnt_userns, inode, mask);
 		if (error != -EAGAIN)
 			return error;
 	}
@@ -320,7 +350,8 @@ static int acl_permission_check(struct inode *inode, int mask)
 	 * about? Need to check group ownership if so.
 	 */
 	if (mask & (mode ^ (mode >> 3))) {
-		if (in_group_p(inode->i_gid))
+		kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
+		if (in_group_p(kgid))
 			mode >>= 3;
 	}
 
@@ -330,6 +361,7 @@ static int acl_permission_check(struct inode *inode, int mask)
 
 /**
  * generic_permission -  check for access rights on a Posix-like filesystem
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode:	inode to check access rights for
  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
  *		%MAY_NOT_BLOCK ...)
@@ -342,25 +374,32 @@ static int acl_permission_check(struct inode *inode, int mask)
  * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
  * request cannot be satisfied (eg. requires blocking or too much complexity).
  * It would then be called again in ref-walk mode.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
  */
-int generic_permission(struct inode *inode, int mask)
+int generic_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		       int mask)
 {
 	int ret;
 
 	/*
 	 * Do the basic permission checks.
 	 */
-	ret = acl_permission_check(inode, mask);
+	ret = acl_permission_check(mnt_userns, inode, mask);
 	if (ret != -EACCES)
 		return ret;
 
 	if (S_ISDIR(inode->i_mode)) {
 		/* DACs are overridable for directories */
 		if (!(mask & MAY_WRITE))
-			if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+			if (capable_wrt_inode_uidgid(mnt_userns, inode,
 						     CAP_DAC_READ_SEARCH))
 				return 0;
-		if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+		if (capable_wrt_inode_uidgid(mnt_userns, inode,
 					     CAP_DAC_OVERRIDE))
 			return 0;
 		return -EACCES;
@@ -371,7 +410,7 @@ int generic_permission(struct inode *inode, int mask)
 	 */
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 	if (mask == MAY_READ)
-		if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+		if (capable_wrt_inode_uidgid(mnt_userns, inode,
 					     CAP_DAC_READ_SEARCH))
 			return 0;
 	/*
@@ -380,7 +419,7 @@ int generic_permission(struct inode *inode, int mask)
 	 * at least one exec bit set.
 	 */
 	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
-		if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+		if (capable_wrt_inode_uidgid(mnt_userns, inode,
 					     CAP_DAC_OVERRIDE))
 			return 0;
 
@@ -388,13 +427,19 @@ int generic_permission(struct inode *inode, int mask)
 }
 EXPORT_SYMBOL(generic_permission);
 
-/*
+/**
+ * do_inode_permission - UNIX permission checking
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @inode:	inode to check permissions on
+ * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
+ *
  * We _really_ want to just do "generic_permission()" without
  * even looking at the inode->i_op values. So we keep a cache
  * flag in inode->i_opflags, that says "this has not special
  * permission function, use the fast case".
  */
-static inline int do_inode_permission(struct inode *inode, int mask)
+static inline int do_inode_permission(struct user_namespace *mnt_userns,
+				      struct inode *inode, int mask)
 {
 	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
 		if (likely(inode->i_op->permission))
@@ -405,7 +450,7 @@ static inline int do_inode_permission(struct inode *inode, int mask)
 		inode->i_opflags |= IOP_FASTPERM;
 		spin_unlock(&inode->i_lock);
 	}
-	return generic_permission(inode, mask);
+	return generic_permission(mnt_userns, inode, mask);
 }
 
 /**
@@ -430,8 +475,9 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
 
 /**
  * inode_permission - Check for access rights to a given inode
- * @inode: Inode to check permission on
- * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @mnt_userns:	User namespace of the mount the inode was found from
+ * @inode:	Inode to check permission on
+ * @mask:	Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
  *
  * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
  * this, letting us set arbitrary permissions for filesystem access without
@@ -439,7 +485,8 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
  *
  * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
  */
-int inode_permission(struct inode *inode, int mask)
+int inode_permission(struct user_namespace *mnt_userns,
+		     struct inode *inode, int mask)
 {
 	int retval;
 
@@ -463,7 +510,7 @@ int inode_permission(struct inode *inode, int mask)
 			return -EACCES;
 	}
 
-	retval = do_inode_permission(inode, mask);
+	retval = do_inode_permission(mnt_userns, inode, mask);
 	if (retval)
 		return retval;
 
@@ -1009,7 +1056,7 @@ static bool safe_hardlink_source(struct inode *inode)
 		return false;
 
 	/* Hardlinking to unreadable or unwritable sources is dangerous. */
-	if (inode_permission(inode, MAY_READ | MAY_WRITE))
+	if (inode_permission(&init_user_ns, inode, MAY_READ | MAY_WRITE))
 		return false;
 
 	return true;
@@ -1569,13 +1616,14 @@ static struct dentry *lookup_slow(const struct qstr *name,
 static inline int may_lookup(struct nameidata *nd)
 {
 	if (nd->flags & LOOKUP_RCU) {
-		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
+		int err = inode_permission(&init_user_ns, nd->inode,
+					   MAY_EXEC | MAY_NOT_BLOCK);
 		if (err != -ECHILD)
 			return err;
 		if (unlazy_walk(nd))
 			return -ECHILD;
 	}
-	return inode_permission(nd->inode, MAY_EXEC);
+	return inode_permission(&init_user_ns, nd->inode, MAY_EXEC);
 }
 
 static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
@@ -2509,7 +2557,7 @@ static int lookup_one_len_common(const char *name, struct dentry *base,
 			return err;
 	}
 
-	return inode_permission(base->d_inode, MAY_EXEC);
+	return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
 }
 
 /**
@@ -2703,7 +2751,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 
 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
-	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 	if (IS_APPEND(dir))
@@ -2747,7 +2795,7 @@ static inline int may_create(struct inode *dir, struct dentry *child)
 	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
 	    !kgid_has_mapping(s_user_ns, current_fsgid()))
 		return -EOVERFLOW;
-	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 }
 
 /*
@@ -2877,7 +2925,7 @@ static int may_open(const struct path *path, int acc_mode, int flag)
 		break;
 	}
 
-	error = inode_permission(inode, MAY_OPEN | acc_mode);
+	error = inode_permission(&init_user_ns, inode, MAY_OPEN | acc_mode);
 	if (error)
 		return error;
 
@@ -2939,7 +2987,8 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m
 	    !kgid_has_mapping(s_user_ns, current_fsgid()))
 		return -EOVERFLOW;
 
-	error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(&init_user_ns, dir->dentry->d_inode,
+				 MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 
@@ -3276,7 +3325,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
 	int error;
 
 	/* we want directory to be writable */
-	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		goto out_err;
 	error = -EOPNOTSUPP;
@@ -4267,12 +4316,14 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 */
 	if (new_dir != old_dir) {
 		if (is_dir) {
-			error = inode_permission(source, MAY_WRITE);
+			error = inode_permission(&init_user_ns, source,
+						 MAY_WRITE);
 			if (error)
 				return error;
 		}
 		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
-			error = inode_permission(target, MAY_WRITE);
+			error = inode_permission(&init_user_ns, target,
+						 MAY_WRITE);
 			if (error)
 				return error;
 		}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ef827ae193d2..727e01a84503 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2987,7 +2987,7 @@ out_notsup:
 
 	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	if (res == 0)
-		res = generic_permission(inode, mask);
+		res = generic_permission(&init_user_ns, inode, mask);
 	goto out;
 }
 EXPORT_SYMBOL_GPL(nfs_permission);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 66f2ef67792a..8d90796e236a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -40,7 +40,8 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
 		/* make sure parents give x permission to user */
 		int err;
 		parent = dget_parent(tdentry);
-		err = inode_permission(d_inode(parent), MAY_EXEC);
+		err = inode_permission(&init_user_ns,
+				       d_inode(parent), MAY_EXEC);
 		if (err < 0) {
 			dput(parent);
 			break;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 04937e51de56..0edf11258aaa 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2391,13 +2391,14 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 		return 0;
 
 	/* This assumes  NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
-	err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC));
+	err = inode_permission(&init_user_ns, inode,
+			       acc & (MAY_READ | MAY_WRITE | MAY_EXEC));
 
 	/* Allow read access to binaries even when mode 111 */
 	if (err == -EACCES && S_ISREG(inode->i_mode) &&
 	     (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) ||
 	      acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC)))
-		err = inode_permission(inode, MAY_EXEC);
+		err = inode_permission(&init_user_ns, inode, MAY_EXEC);
 
 	return err? nfserrno(err) : 0;
 }
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 745d371d6fea..b6517220cad5 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -851,7 +851,7 @@ int nilfs_permission(struct inode *inode, int mask)
 	    root->cno != NILFS_CPTREE_CURRENT_CNO)
 		return -EROFS; /* snapshot is not writable */
 
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 85979e2214b3..0c75619adf54 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1355,7 +1355,7 @@ int ocfs2_permission(struct inode *inode, int mask)
 		dump_stack();
 	}
 
-	ret = generic_permission(inode, mask);
+	ret = generic_permission(&init_user_ns, inode, mask);
 
 	ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
 out:
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3b397fa9c9e8..c26937824be1 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4346,7 +4346,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
 		return -EEXIST;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
-	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 }
 
 /**
@@ -4400,7 +4400,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
 	 * file.
 	 */
 	if (!preserve) {
-		error = inode_permission(inode, MAY_READ);
+		error = inode_permission(&init_user_ns, inode, MAY_READ);
 		if (error)
 			return error;
 	}
diff --git a/fs/open.c b/fs/open.c
index cd1efd254cad..a6dac6d97988 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -83,7 +83,7 @@ long vfs_truncate(const struct path *path, loff_t length)
 	if (error)
 		goto out;
 
-	error = inode_permission(inode, MAY_WRITE);
+	error = inode_permission(&init_user_ns, inode, MAY_WRITE);
 	if (error)
 		goto mnt_drop_write_and_out;
 
@@ -436,7 +436,7 @@ retry:
 			goto out_path_release;
 	}
 
-	res = inode_permission(inode, mode | MAY_ACCESS);
+	res = inode_permission(&init_user_ns, inode, mode | MAY_ACCESS);
 	/* SuS v2 requires we report a read only fs too */
 	if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
 		goto out_path_release;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 48f0547d4850..4c790cc8042d 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -933,7 +933,7 @@ int orangefs_permission(struct inode *inode, int mask)
 	if (ret < 0)
 		return ret;
 
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags)
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index bd9dd38347ae..b2948e7b3210 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -50,7 +50,7 @@ static struct file *ovl_open_realfile(const struct file *file,
 		acc_mode |= MAY_APPEND;
 
 	old_cred = ovl_override_creds(inode->i_sb);
-	err = inode_permission(realinode, MAY_OPEN | acc_mode);
+	err = inode_permission(&init_user_ns, realinode, MAY_OPEN | acc_mode);
 	if (err) {
 		realfile = ERR_PTR(err);
 	} else {
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index d739e14c6814..c101ebbb7a77 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -294,7 +294,7 @@ int ovl_permission(struct inode *inode, int mask)
 	 * Check overlay inode with the creds of task and underlying inode
 	 * with creds of mounter
 	 */
-	err = generic_permission(inode, mask);
+	err = generic_permission(&init_user_ns, inode, mask);
 	if (err)
 		return err;
 
@@ -305,7 +305,7 @@ int ovl_permission(struct inode *inode, int mask)
 		/* Make sure mounter can read file for copy up later */
 		mask |= MAY_READ;
 	}
-	err = inode_permission(realinode, mask);
+	err = inode_permission(&init_user_ns, realinode, mask);
 	revert_creds(old_cred);
 
 	return err;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 6569031af3cd..de5c2047a0e9 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -479,7 +479,7 @@ struct file *ovl_path_open(struct path *path, int flags)
 		BUG();
 	}
 
-	err = inode_permission(inode, acc_mode | MAY_OPEN);
+	err = inode_permission(&init_user_ns, inode, acc_mode | MAY_OPEN);
 	if (err)
 		return ERR_PTR(err);
 
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 4ca6d53c6f0a..5d9fe2fb2953 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -345,10 +345,13 @@ EXPORT_SYMBOL(posix_acl_from_mode);
  * by the acl. Returns -E... otherwise.
  */
 int
-posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
+posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		     const struct posix_acl *acl, int want)
 {
 	const struct posix_acl_entry *pa, *pe, *mask_obj;
 	int found = 0;
+	kuid_t uid;
+	kgid_t gid;
 
 	want &= MAY_READ | MAY_WRITE | MAY_EXEC;
 
@@ -356,22 +359,26 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
                 switch(pa->e_tag) {
                         case ACL_USER_OBJ:
 				/* (May have been checked already) */
-				if (uid_eq(inode->i_uid, current_fsuid()))
+				uid = i_uid_into_mnt(mnt_userns, inode);
+				if (uid_eq(uid, current_fsuid()))
                                         goto check_perm;
                                 break;
                         case ACL_USER:
-				if (uid_eq(pa->e_uid, current_fsuid()))
+				uid = kuid_into_mnt(mnt_userns, pa->e_uid);
+				if (uid_eq(uid, current_fsuid()))
                                         goto mask;
 				break;
                         case ACL_GROUP_OBJ:
-                                if (in_group_p(inode->i_gid)) {
+				gid = i_gid_into_mnt(mnt_userns, inode);
+				if (in_group_p(gid)) {
 					found = 1;
 					if ((pa->e_perm & want) == want)
 						goto mask;
                                 }
 				break;
                         case ACL_GROUP:
-				if (in_group_p(pa->e_gid)) {
+				gid = kgid_into_mnt(mnt_userns, pa->e_gid);
+				if (in_group_p(gid)) {
 					found = 1;
 					if ((pa->e_perm & want) == want)
 						goto mask;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b3422cda2a91..b4ec9293625e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -751,7 +751,7 @@ static int proc_pid_permission(struct inode *inode, int mask)
 
 		return -EPERM;
 	}
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 
@@ -3492,7 +3492,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
 		return 0;
 	}
 
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 static const struct inode_operations proc_tid_comm_inode_operations = {
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index cb51763ed554..d6e76461e135 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -281,7 +281,7 @@ int proc_fd_permission(struct inode *inode, int mask)
 	struct task_struct *p;
 	int rv;
 
-	rv = generic_permission(inode, mask);
+	rv = generic_permission(&init_user_ns, inode, mask);
 	if (rv == 0)
 		return rv;
 
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index fe63a7c3e0da..ec440d1957a1 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -957,7 +957,7 @@ int reiserfs_permission(struct inode *inode, int mask)
 	if (IS_PRIVATE(inode))
 		return 0;
 
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags)
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 77dba3a49e65..29a4a4dbfe12 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -438,7 +438,7 @@ static bool allow_file_dedupe(struct file *file)
 		return true;
 	if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
 		return true;
-	if (!inode_permission(file_inode(file), MAY_WRITE))
+	if (!inode_permission(&init_user_ns, file_inode(file), MAY_WRITE))
 		return true;
 	return false;
 }
diff --git a/fs/xattr.c b/fs/xattr.c
index fd57153b1f61..56151bd9e642 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -131,7 +131,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 			return -EPERM;
 	}
 
-	return inode_permission(inode, mask);
+	return inode_permission(&init_user_ns, inode, mask);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bcd17097d441..a85dfe6962df 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2810,15 +2810,17 @@ static inline int bmap(struct inode *inode,  sector_t *block)
 #endif
 
 extern int notify_change(struct dentry *, struct iattr *, struct inode **);
-extern int inode_permission(struct inode *, int);
-extern int generic_permission(struct inode *, int);
+int inode_permission(struct user_namespace *, struct inode *, int);
+int generic_permission(struct user_namespace *, struct inode *, int);
 static inline int file_permission(struct file *file, int mask)
 {
-	return inode_permission(file_inode(file), mask);
+	return inode_permission(file_mnt_user_ns(file),
+				file_inode(file), mask);
 }
 static inline int path_permission(const struct path *path, int mask)
 {
-	return inode_permission(d_inode(path->dentry), mask);
+	return inode_permission(mnt_user_ns(path->mnt),
+				d_inode(path->dentry), mask);
 }
 extern int __check_sticky(struct inode *dir, struct inode *inode);
 
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 90797f1b421d..85fb4c0c650a 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -15,6 +15,8 @@
 #include <linux/refcount.h>
 #include <uapi/linux/posix_acl.h>
 
+struct user_namespace;
+
 struct posix_acl_entry {
 	short			e_tag;
 	unsigned short		e_perm;
@@ -61,8 +63,6 @@ posix_acl_release(struct posix_acl *acl)
 
 extern void posix_acl_init(struct posix_acl *, int);
 extern struct posix_acl *posix_acl_alloc(int, gfp_t);
-extern int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
-extern int posix_acl_permission(struct inode *, const struct posix_acl *, int);
 extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
 extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
 extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
@@ -85,6 +85,9 @@ struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
 void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
 void forget_cached_acl(struct inode *inode, int type);
 void forget_all_cached_acls(struct inode *inode);
+int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
+int posix_acl_permission(struct user_namespace *, struct inode *,
+			 const struct posix_acl *, int);
 
 static inline void cache_no_acl(struct inode *inode)
 {
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index beff0cfcd1e8..693f01fe1216 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -873,7 +873,7 @@ static int prepare_open(struct dentry *dentry, int oflag, int ro,
 	if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
 		return -EINVAL;
 	acc = oflag2acc[oflag & O_ACCMODE];
-	return inode_permission(d_inode(dentry), acc);
+	return inode_permission(&init_user_ns, d_inode(dentry), acc);
 }
 
 static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 8962f139521e..e3226b65f5dc 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -558,7 +558,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
 static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
 {
 	struct bpf_prog *prog;
-	int ret = inode_permission(inode, MAY_READ);
+	int ret = inode_permission(&init_user_ns, inode, MAY_READ);
 	if (ret)
 		return ERR_PTR(ret);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 613845769103..091ffb5d2939 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4670,7 +4670,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
 	if (!inode)
 		return -ENOMEM;
 
-	ret = inode_permission(inode, MAY_WRITE);
+	ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
 	iput(inode);
 	return ret;
 }
-- 
cgit v1.2.3


From 21cb47be6fb9ece7e6ee63f6780986faa384a77c Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:25 +0100
Subject: inode: make init and permission helpers idmapped mount aware

The inode_owner_or_capable() helper determines whether the caller is the
owner of the inode or is capable with respect to that inode. Allow it to
handle idmapped mounts. If the inode is accessed through an idmapped
mount it according to the mount's user namespace. Afterwards the checks
are identical to non-idmapped mounts. If the initial user namespace is
passed nothing changes so non-idmapped mounts will see identical
behavior as before.

Similarly, allow the inode_init_owner() helper to handle idmapped
mounts. It initializes a new inode on idmapped mounts by mapping the
fsuid and fsgid of the caller from the mount's user namespace. If the
initial user namespace is passed nothing changes so non-idmapped mounts
will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-7-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/9p/acl.c                  |  2 +-
 fs/9p/vfs_inode.c            |  2 +-
 fs/attr.c                    |  6 +++---
 fs/bfs/dir.c                 |  2 +-
 fs/btrfs/inode.c             |  2 +-
 fs/btrfs/ioctl.c             | 10 +++++-----
 fs/btrfs/tests/btrfs-tests.c |  2 +-
 fs/crypto/policy.c           |  2 +-
 fs/efivarfs/file.c           |  2 +-
 fs/ext2/ialloc.c             |  2 +-
 fs/ext2/ioctl.c              |  6 +++---
 fs/ext4/ialloc.c             |  2 +-
 fs/ext4/ioctl.c              | 15 ++++++++-------
 fs/f2fs/file.c               | 14 +++++++-------
 fs/f2fs/namei.c              |  2 +-
 fs/f2fs/xattr.c              |  2 +-
 fs/fcntl.c                   |  2 +-
 fs/gfs2/file.c               |  2 +-
 fs/hfsplus/inode.c           |  2 +-
 fs/hfsplus/ioctl.c           |  2 +-
 fs/hugetlbfs/inode.c         |  2 +-
 fs/inode.c                   | 36 ++++++++++++++++++++++++++----------
 fs/jfs/ioctl.c               |  2 +-
 fs/jfs/jfs_inode.c           |  2 +-
 fs/minix/bitmap.c            |  2 +-
 fs/namei.c                   |  5 +++--
 fs/nilfs2/inode.c            |  2 +-
 fs/nilfs2/ioctl.c            |  2 +-
 fs/ocfs2/dlmfs/dlmfs.c       |  4 ++--
 fs/ocfs2/ioctl.c             |  2 +-
 fs/ocfs2/namei.c             |  2 +-
 fs/omfs/inode.c              |  2 +-
 fs/overlayfs/dir.c           |  2 +-
 fs/overlayfs/file.c          |  4 ++--
 fs/overlayfs/super.c         |  2 +-
 fs/overlayfs/util.c          |  2 +-
 fs/posix_acl.c               |  2 +-
 fs/ramfs/inode.c             |  2 +-
 fs/reiserfs/ioctl.c          |  4 ++--
 fs/reiserfs/namei.c          |  2 +-
 fs/sysv/ialloc.c             |  2 +-
 fs/ubifs/dir.c               |  2 +-
 fs/ubifs/ioctl.c             |  2 +-
 fs/udf/ialloc.c              |  2 +-
 fs/ufs/ialloc.c              |  2 +-
 fs/xattr.c                   |  3 ++-
 fs/xfs/xfs_ioctl.c           |  2 +-
 fs/zonefs/super.c            |  2 +-
 include/linux/fs.h           |  8 ++++----
 kernel/bpf/inode.c           |  2 +-
 mm/madvise.c                 |  3 ++-
 mm/mincore.c                 |  3 ++-
 mm/shmem.c                   |  2 +-
 security/selinux/hooks.c     |  4 ++--
 54 files changed, 112 insertions(+), 91 deletions(-)

(limited to 'include')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 6261719f6f2a..d77b28e8d57a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -258,7 +258,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 	if (value) {
 		/* update the cached acl value */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4a937fac1acb..f66eb3c12c8a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -251,7 +251,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 {
 	int err = 0;
 
-	inode_init_owner(inode, NULL, mode);
+	inode_init_owner(&init_user_ns,inode,  NULL, mode);
 	inode->i_blocks = 0;
 	inode->i_rdev = rdev;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
diff --git a/fs/attr.c b/fs/attr.c
index c9e29e589cec..00ae0b000146 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -87,7 +87,7 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
 
 	/* Make sure a caller can chmod. */
 	if (ia_valid & ATTR_MODE) {
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EPERM;
 		/* Also check the setgid bit! */
 		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
@@ -98,7 +98,7 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
 
 	/* Check for setting the inode time. */
 	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EPERM;
 	}
 
@@ -243,7 +243,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 		if (IS_IMMUTABLE(inode))
 			return -EPERM;
 
-		if (!inode_owner_or_capable(inode)) {
+		if (!inode_owner_or_capable(&init_user_ns, inode)) {
 			error = inode_permission(&init_user_ns, inode,
 						 MAY_WRITE);
 			if (error)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d8dfe3a0cb39..be1335a8d25b 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -96,7 +96,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	}
 	set_bit(ino, info->si_imap);
 	info->si_freei--;
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	inode->i_blocks = 0;
 	inode->i_op = &bfs_file_inops;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 512ee2650bbb..07fe8b2f3bab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6190,7 +6190,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (ret != 0)
 		goto fail_unlock;
 
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode_set_bytes(inode, 0);
 
 	inode->i_mtime = current_time(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8ced6dfefee4..1f763c60415b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -213,7 +213,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	const char *comp = NULL;
 	u32 binode_flags;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	if (btrfs_root_readonly(root))
@@ -429,7 +429,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
 	unsigned old_i_flags;
 	int ret = 0;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	if (btrfs_root_readonly(root))
@@ -1862,7 +1862,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
 			btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
 				   "Snapshot src from another FS");
 			ret = -EXDEV;
-		} else if (!inode_owner_or_capable(src_inode)) {
+		} else if (!inode_owner_or_capable(&init_user_ns, src_inode)) {
 			/*
 			 * Subvolume creation is not restricted, but snapshots
 			 * are limited to own subvolumes only
@@ -1982,7 +1982,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
 	u64 flags;
 	int ret = 0;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	ret = mnt_want_write_file(file);
@@ -4453,7 +4453,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
 	int ret = 0;
 	int received_uuid_changed;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	ret = mnt_want_write_file(file);
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 6bd97bd4cb37..3a4099a2bf05 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -62,7 +62,7 @@ struct inode *btrfs_new_test_inode(void)
 	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
 	BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	BTRFS_I(inode)->location.offset = 0;
-	inode_init_owner(inode, NULL, S_IFREG);
+	inode_init_owner(&init_user_ns, inode, NULL, S_IFREG);
 
 	return inode;
 }
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index a51cef6bd27f..ed3d623724cd 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -465,7 +465,7 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
 		return -EFAULT;
 	policy.version = version;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	ret = mnt_want_write_file(filp);
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index feaa5e182b7b..e6bc0302643b 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -137,7 +137,7 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg)
 	unsigned int oldflags = efivarfs_getflags(inode);
 	int error;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	if (copy_from_user(&flags, arg, sizeof(flags)))
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 432c3febea6d..df14e750e9fe 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -551,7 +551,7 @@ got:
 		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
 	} else
-		inode_init_owner(inode, dir, mode);
+		inode_init_owner(&init_user_ns, inode, dir, mode);
 
 	inode->i_ino = ino;
 	inode->i_blocks = 0;
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 32a8d10b579d..b399cbb7022d 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -39,7 +39,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (ret)
 			return ret;
 
-		if (!inode_owner_or_capable(inode)) {
+		if (!inode_owner_or_capable(&init_user_ns, inode)) {
 			ret = -EACCES;
 			goto setflags_out;
 		}
@@ -84,7 +84,7 @@ setflags_out:
 	case EXT2_IOC_SETVERSION: {
 		__u32 generation;
 
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EPERM;
 		ret = mnt_want_write_file(filp);
 		if (ret)
@@ -117,7 +117,7 @@ setversion_out:
 		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
 			return -ENOTTY;
 
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 
 		if (get_user(rsv_window_size, (int __user *)arg))
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b215c564bc31..00c1ec6eee16 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -972,7 +972,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
 	} else
-		inode_init_owner(inode, dir, mode);
+		inode_init_owner(&init_user_ns, inode, dir, mode);
 
 	if (ext4_has_feature_project(sb) &&
 	    ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d9665d2f82db..ab80e2493fdc 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -139,7 +139,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	}
 
 	if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
-	    !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+	    !inode_owner_or_capable(&init_user_ns, inode) ||
+	    !capable(CAP_SYS_ADMIN)) {
 		err = -EPERM;
 		goto journal_err_out;
 	}
@@ -829,7 +830,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	case FS_IOC_SETFLAGS: {
 		int err;
 
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *) arg))
@@ -871,7 +872,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		__u32 generation;
 		int err;
 
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EPERM;
 
 		if (ext4_has_metadata_csum(inode->i_sb)) {
@@ -1010,7 +1011,7 @@ mext_out:
 	case EXT4_IOC_MIGRATE:
 	{
 		int err;
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 
 		err = mnt_want_write_file(filp);
@@ -1032,7 +1033,7 @@ mext_out:
 	case EXT4_IOC_ALLOC_DA_BLKS:
 	{
 		int err;
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 
 		err = mnt_want_write_file(filp);
@@ -1217,7 +1218,7 @@ resizefs_out:
 
 	case EXT4_IOC_CLEAR_ES_CACHE:
 	{
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 		ext4_clear_inode_es(inode);
 		return 0;
@@ -1263,7 +1264,7 @@ resizefs_out:
 			return -EFAULT;
 
 		/* Make sure caller has proper permission */
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 
 		if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index f585545277d7..5fc0ff28b5dd 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1961,7 +1961,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 	u32 iflags;
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	if (get_user(fsflags, (int __user *)arg))
@@ -2008,7 +2008,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	if (!S_ISREG(inode->i_mode))
@@ -2075,7 +2075,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	ret = mnt_want_write_file(filp);
@@ -2117,7 +2117,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	if (!S_ISREG(inode->i_mode))
@@ -2152,7 +2152,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	ret = mnt_want_write_file(filp);
@@ -2181,7 +2181,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	ret = mnt_want_write_file(filp);
@@ -3158,7 +3158,7 @@ static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg)
 		return -EFAULT;
 
 	/* Make sure caller has proper permission */
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	if (fa.fsx_xflags & ~F2FS_SUPPORTED_XFLAGS)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 6edb1ab579a1..ad98926bacac 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -46,7 +46,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 
 	nid_free = true;
 
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 
 	inode->i_ino = ino;
 	inode->i_blocks = 0;
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 65afcc3cc68a..d772bf13a814 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -114,7 +114,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
 	unsigned char old_advise = F2FS_I(inode)->i_advise;
 	unsigned char new_advise;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 	if (value == NULL)
 		return -EINVAL;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 05b36b28f2e8..74d99731fd43 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -46,7 +46,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 
 	/* O_NOATIME can only be set by the owner or superuser */
 	if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EPERM;
 
 	/* required for strict SunOS emulation */
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b39b339feddc..1d994bdfffaa 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -238,7 +238,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask,
 		goto out;
 
 	error = -EACCES;
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		goto out;
 
 	error = 0;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index e3da9e96b835..21357046b039 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -376,7 +376,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
 		return NULL;
 
 	inode->i_ino = sbi->next_cnid++;
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	set_nlink(inode, 1);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ce15b9496b77..3edb1926d127 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -91,7 +91,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 	if (err)
 		goto out;
 
-	if (!inode_owner_or_capable(inode)) {
+	if (!inode_owner_or_capable(&init_user_ns, inode)) {
 		err = -EACCES;
 		goto out_drop_write;
 	}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b5c109703daa..6737929e443c 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -836,7 +836,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 
 		inode->i_ino = get_next_ino();
-		inode_init_owner(inode, dir, mode);
+		inode_init_owner(&init_user_ns, inode, dir, mode);
 		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
 				&hugetlbfs_i_mmap_rwsem_key);
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
diff --git a/fs/inode.c b/fs/inode.c
index cd40cbf87ce4..a9ac97a27784 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2130,14 +2130,21 @@ EXPORT_SYMBOL(init_special_inode);
 
 /**
  * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
+ * @mnt_userns:	User namespace of the mount the inode was created from
  * @inode: New inode
  * @dir: Directory inode
  * @mode: mode of the new inode
+ *
+ * If the inode has been created through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions
+ * and initializing i_uid and i_gid. On non-idmapped mounts or if permission
+ * checking is to be performed on the raw inode simply passs init_user_ns.
  */
-void inode_init_owner(struct inode *inode, const struct inode *dir,
-			umode_t mode)
+void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
+		      const struct inode *dir, umode_t mode)
 {
-	inode->i_uid = current_fsuid();
+	inode->i_uid = fsuid_into_mnt(mnt_userns);
 	if (dir && dir->i_mode & S_ISGID) {
 		inode->i_gid = dir->i_gid;
 
@@ -2145,32 +2152,41 @@ void inode_init_owner(struct inode *inode, const struct inode *dir,
 		if (S_ISDIR(mode))
 			mode |= S_ISGID;
 		else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
-			 !in_group_p(inode->i_gid) &&
-			 !capable_wrt_inode_uidgid(&init_user_ns, dir,
-						   CAP_FSETID))
+			 !in_group_p(i_gid_into_mnt(mnt_userns, dir)) &&
+			 !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
 			mode &= ~S_ISGID;
 	} else
-		inode->i_gid = current_fsgid();
+		inode->i_gid = fsgid_into_mnt(mnt_userns);
 	inode->i_mode = mode;
 }
 EXPORT_SYMBOL(inode_init_owner);
 
 /**
  * inode_owner_or_capable - check current task permissions to inode
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode: inode being checked
  *
  * Return true if current either has CAP_FOWNER in a namespace with the
  * inode owner uid mapped, or owns the file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
  */
-bool inode_owner_or_capable(const struct inode *inode)
+bool inode_owner_or_capable(struct user_namespace *mnt_userns,
+			    const struct inode *inode)
 {
+	kuid_t i_uid;
 	struct user_namespace *ns;
 
-	if (uid_eq(current_fsuid(), inode->i_uid))
+	i_uid = i_uid_into_mnt(mnt_userns, inode);
+	if (uid_eq(current_fsuid(), i_uid))
 		return true;
 
 	ns = current_user_ns();
-	if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
+	if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER))
 		return true;
 	return false;
 }
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 10ee0ecca1a8..2581d4db58ff 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -76,7 +76,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (err)
 			return err;
 
-		if (!inode_owner_or_capable(inode)) {
+		if (!inode_owner_or_capable(&init_user_ns, inode)) {
 			err = -EACCES;
 			goto setflags_out;
 		}
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 4cef170630db..59379089e939 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -64,7 +64,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 		goto fail_put;
 	}
 
-	inode_init_owner(inode, parent, mode);
+	inode_init_owner(&init_user_ns, inode, parent, mode);
 	/*
 	 * New inodes need to save sane values on disk when
 	 * uid & gid mount options are used
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index f4e5e5181a14..9115948c624e 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -252,7 +252,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error)
 		iput(inode);
 		return NULL;
 	}
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode->i_ino = j;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	inode->i_blocks = 0;
diff --git a/fs/namei.c b/fs/namei.c
index d78d74f5f5af..04b001ddade3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1088,7 +1088,8 @@ int may_linkat(struct path *link)
 	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
 	 * otherwise, it must be a safe source.
 	 */
-	if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
+	if (safe_hardlink_source(inode) ||
+	    inode_owner_or_capable(&init_user_ns, inode))
 		return 0;
 
 	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
@@ -2940,7 +2941,7 @@ static int may_open(const struct path *path, int acc_mode, int flag)
 	}
 
 	/* O_NOATIME can only be set by the owner or superuser */
-	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
+	if (flag & O_NOATIME && !inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	return 0;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index b6517220cad5..11225a659736 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -348,7 +348,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
 	atomic64_inc(&root->inodes_count);
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 07d26f61f22a..b053b40315bf 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -132,7 +132,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
 	unsigned int flags, oldflags;
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	if (get_user(flags, (int __user *)argp))
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 583820ec63e2..37c7d03a6284 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -329,7 +329,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 
 	if (inode) {
 		inode->i_ino = get_next_ino();
-		inode_init_owner(inode, NULL, mode);
+		inode_init_owner(&init_user_ns, inode, NULL, mode);
 		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 		inc_nlink(inode);
 
@@ -352,7 +352,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 		return NULL;
 
 	inode->i_ino = get_next_ino();
-	inode_init_owner(inode, parent, mode);
+	inode_init_owner(&init_user_ns, inode, parent, mode);
 	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 
 	ip = DLMFS_I(inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 89984172fc4a..50c9b30ee9f6 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -96,7 +96,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 	}
 
 	status = -EACCES;
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		goto bail_unlock;
 
 	if (!S_ISDIR(inode->i_mode))
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2a237ab00453..908b79e1082b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -198,7 +198,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
 	 * callers. */
 	if (S_ISDIR(mode))
 		set_nlink(inode, 2);
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	status = dquot_initialize(inode);
 	if (status)
 		return ERR_PTR(status);
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index ce93ccca8639..2a0e83236c01 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -48,7 +48,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode)
 		goto fail;
 
 	inode->i_ino = new_block;
-	inode_init_owner(inode, NULL, mode);
+	inode_init_owner(&init_user_ns, inode, NULL, mode);
 	inode->i_mapping->a_ops = &omfs_aops;
 
 	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 28a075b5f5b2..98a23353b19a 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -636,7 +636,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
 	inode->i_state |= I_CREATING;
 	spin_unlock(&inode->i_lock);
 
-	inode_init_owner(inode, dentry->d_parent->d_inode, mode);
+	inode_init_owner(&init_user_ns, inode, dentry->d_parent->d_inode, mode);
 	attr.mode = inode->i_mode;
 
 	err = ovl_create_or_link(dentry, inode, &attr, false);
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index b2948e7b3210..7d8b84c715b3 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -54,7 +54,7 @@ static struct file *ovl_open_realfile(const struct file *file,
 	if (err) {
 		realfile = ERR_PTR(err);
 	} else {
-		if (!inode_owner_or_capable(realinode))
+		if (!inode_owner_or_capable(&init_user_ns, realinode))
 			flags &= ~O_NOATIME;
 
 		realfile = open_with_fake_path(&file->f_path, flags, realinode,
@@ -520,7 +520,7 @@ static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd,
 	long ret;
 	struct inode *inode = file_inode(file);
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	ret = mnt_want_write_file(file);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 88d877787770..3e925deaa19a 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1005,7 +1005,7 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
 		goto out_acl_release;
 	}
 	err = -EPERM;
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		goto out_acl_release;
 
 	posix_acl_release(acl);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index de5c2047a0e9..06013b7b1e87 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -484,7 +484,7 @@ struct file *ovl_path_open(struct path *path, int flags)
 		return ERR_PTR(err);
 
 	/* O_NOATIME is an optimization, don't fail if not permitted */
-	if (inode_owner_or_capable(inode))
+	if (inode_owner_or_capable(&init_user_ns, inode))
 		flags |= O_NOATIME;
 
 	return dentry_open(path, flags, current_cred());
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 5d9fe2fb2953..9ce8214bfdac 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -874,7 +874,7 @@ set_posix_acl(struct inode *inode, int type, struct posix_acl *acl)
 
 	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
 		return acl ? -EACCES : 0;
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	if (acl) {
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index ee179a81b3da..3fd4326f36b5 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -67,7 +67,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 
 	if (inode) {
 		inode->i_ino = get_next_ino();
-		inode_init_owner(inode, dir, mode);
+		inode_init_owner(&init_user_ns, inode, dir, mode);
 		inode->i_mapping->a_ops = &ramfs_aops;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
 		mapping_set_unevictable(inode->i_mapping);
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index adb21bea3d60..4f1cbd930179 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -59,7 +59,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			if (err)
 				break;
 
-			if (!inode_owner_or_capable(inode)) {
+			if (!inode_owner_or_capable(&init_user_ns, inode)) {
 				err = -EPERM;
 				goto setflags_out;
 			}
@@ -101,7 +101,7 @@ setflags_out:
 		err = put_user(inode->i_generation, (int __user *)arg);
 		break;
 	case REISERFS_IOC_SETVERSION:
-		if (!inode_owner_or_capable(inode)) {
+		if (!inode_owner_or_capable(&init_user_ns, inode)) {
 			err = -EPERM;
 			break;
 		}
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 1594687582f0..a67a7d371725 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -615,7 +615,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
 	 * the quota init calls have to know who to charge the quota to, so
 	 * we have to set uid and gid here
 	 */
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	return dquot_initialize(inode);
 }
 
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 6c9801986af6..50df794a3c1f 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -163,7 +163,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
 	*sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
 	fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
 	dirty_sb(sb);
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode->i_ino = fs16_to_cpu(sbi, ino);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	inode->i_blocks = 0;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 9a6b8660425a..694e7714545b 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -94,7 +94,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
 	 */
 	inode->i_flags |= S_NOCMTIME;
 
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 			 current_time(inode);
 	inode->i_mapping->nrpages = 0;
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 4363d85a3fd4..2326d5122beb 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -155,7 +155,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (IS_RDONLY(inode))
 			return -EROFS;
 
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *) arg))
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 84ed23edebfd..2ecf0e87660e 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -103,7 +103,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
 		mutex_unlock(&sbi->s_alloc_mutex);
 	}
 
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET))
 		inode->i_uid = sbi->s_uid;
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET))
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 969fd60436d3..7e3e08c0166f 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -289,7 +289,7 @@ cg_found:
 	ufs_mark_sb_dirty(sb);
 
 	inode->i_ino = cg * uspi->s_ipg + bit;
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode->i_blocks = 0;
 	inode->i_generation = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
diff --git a/fs/xattr.c b/fs/xattr.c
index 56151bd9e642..c669922e1bde 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -127,7 +127,8 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
 			return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
 		if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
-		    (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
+		    (mask & MAY_WRITE) &&
+		    !inode_owner_or_capable(&init_user_ns, inode))
 			return -EPERM;
 	}
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 97bd29fc8c43..218e80afc859 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1300,7 +1300,7 @@ xfs_ioctl_setattr_get_trans(
 	 * The user ID of the calling process must be equal to the file owner
 	 * ID, except in cases where the CAP_FSETID capability is applicable.
 	 */
-	if (!inode_owner_or_capable(VFS_I(ip))) {
+	if (!inode_owner_or_capable(&init_user_ns, VFS_I(ip))) {
 		error = -EPERM;
 		goto out_cancel;
 	}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index bec47f2d074b..569525ee8f69 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1223,7 +1223,7 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
 	struct super_block *sb = parent->i_sb;
 
 	inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1;
-	inode_init_owner(inode, parent, S_IFDIR | 0555);
+	inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555);
 	inode->i_op = &zonefs_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	set_nlink(inode, 2);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a85dfe6962df..2a9d4af6a64d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1762,8 +1762,8 @@ static inline bool sb_start_intwrite_trylock(struct super_block *sb)
 	return __sb_start_write_trylock(sb, SB_FREEZE_FS);
 }
 
-
-extern bool inode_owner_or_capable(const struct inode *inode);
+bool inode_owner_or_capable(struct user_namespace *mnt_userns,
+			    const struct inode *inode);
 
 /*
  * VFS helper functions..
@@ -1805,8 +1805,8 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
 /*
  * VFS file helper functions.
  */
-extern void inode_init_owner(struct inode *inode, const struct inode *dir,
-			umode_t mode);
+void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
+		      const struct inode *dir, umode_t mode);
 extern bool may_open_dev(const struct path *path);
 
 /*
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e3226b65f5dc..05b1f51d15e0 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
 	inode->i_mtime = inode->i_atime;
 	inode->i_ctime = inode->i_atime;
 
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 
 	return inode;
 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 175c5582d8a9..d4f5eece9d56 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -539,7 +539,8 @@ static inline bool can_do_pageout(struct vm_area_struct *vma)
 	 * otherwise we'd be including shared non-exclusive mappings, which
 	 * opens a side channel.
 	 */
-	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+	return inode_owner_or_capable(&init_user_ns,
+				      file_inode(vma->vm_file)) ||
 	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
diff --git a/mm/mincore.c b/mm/mincore.c
index 7bdb4673f776..9122676b54d6 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -166,7 +166,8 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
 	 * for writing; otherwise we'd be including shared non-exclusive
 	 * mappings, which opens a side channel.
 	 */
-	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+	return inode_owner_or_capable(&init_user_ns,
+				      file_inode(vma->vm_file)) ||
 	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 7c6b6d8f6c39..1c68c9edba5e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2303,7 +2303,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 	inode = new_inode(sb);
 	if (inode) {
 		inode->i_ino = ino;
-		inode_init_owner(inode, dir, mode);
+		inode_init_owner(&init_user_ns, inode, dir, mode);
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 		inode->i_generation = prandom_u32();
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 644b17ec9e63..9d6d3da2caf2 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3140,13 +3140,13 @@ static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
 	}
 
 	if (!selinux_initialized(&selinux_state))
-		return (inode_owner_or_capable(inode) ? 0 : -EPERM);
+		return (inode_owner_or_capable(&init_user_ns, inode) ? 0 : -EPERM);
 
 	sbsec = inode->i_sb->s_security;
 	if (!(sbsec->flags & SBLABEL_MNT))
 		return -EOPNOTSUPP;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	ad.type = LSM_AUDIT_DATA_DENTRY;
-- 
cgit v1.2.3


From 2f221d6f7b881d95de1f356a3097d755ab1e47d4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:26 +0100
Subject: attr: handle idmapped mounts

When file attributes are changed most filesystems rely on the
setattr_prepare(), setattr_copy(), and notify_change() helpers for
initialization and permission checking. Let them handle idmapped mounts.
If the inode is accessed through an idmapped mount map it into the
mount's user namespace. Afterwards the checks are identical to
non-idmapped mounts. If the initial user namespace is passed nothing
changes so non-idmapped mounts will see identical behavior as before.

Helpers that perform checks on the ia_uid and ia_gid fields in struct
iattr assume that ia_uid and ia_gid are intended values and have already
been mapped correctly at the userspace-kernelspace boundary as we
already do today. If the initial user namespace is passed nothing
changes so non-idmapped mounts will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-8-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 arch/powerpc/platforms/cell/spufs/inode.c |   2 +-
 drivers/base/devtmpfs.c                   |   4 +-
 fs/9p/vfs_inode.c                         |   4 +-
 fs/9p/vfs_inode_dotl.c                    |   4 +-
 fs/adfs/inode.c                           |   2 +-
 fs/affs/inode.c                           |   4 +-
 fs/attr.c                                 | 119 ++++++++++++++++++++++--------
 fs/btrfs/inode.c                          |   4 +-
 fs/cachefiles/interface.c                 |   4 +-
 fs/ceph/inode.c                           |   2 +-
 fs/cifs/inode.c                           |   8 +-
 fs/ecryptfs/inode.c                       |   7 +-
 fs/exfat/file.c                           |   4 +-
 fs/ext2/inode.c                           |   4 +-
 fs/ext4/inode.c                           |   4 +-
 fs/f2fs/file.c                            |  10 ++-
 fs/fat/file.c                             |   4 +-
 fs/fuse/dir.c                             |   2 +-
 fs/gfs2/inode.c                           |   4 +-
 fs/hfs/inode.c                            |   4 +-
 fs/hfsplus/inode.c                        |   4 +-
 fs/hostfs/hostfs_kern.c                   |   4 +-
 fs/hpfs/inode.c                           |   4 +-
 fs/hugetlbfs/inode.c                      |   4 +-
 fs/inode.c                                |   2 +-
 fs/jffs2/fs.c                             |   2 +-
 fs/jfs/file.c                             |   4 +-
 fs/kernfs/inode.c                         |   4 +-
 fs/libfs.c                                |   4 +-
 fs/minix/file.c                           |   4 +-
 fs/nfsd/nfsproc.c                         |   2 +-
 fs/nfsd/vfs.c                             |   4 +-
 fs/nilfs2/inode.c                         |   4 +-
 fs/ntfs/inode.c                           |   2 +-
 fs/ocfs2/dlmfs/dlmfs.c                    |   4 +-
 fs/ocfs2/file.c                           |   4 +-
 fs/omfs/file.c                            |   4 +-
 fs/open.c                                 |   8 +-
 fs/orangefs/inode.c                       |   4 +-
 fs/overlayfs/copy_up.c                    |   8 +-
 fs/overlayfs/dir.c                        |   2 +-
 fs/overlayfs/inode.c                      |   4 +-
 fs/overlayfs/super.c                      |   2 +-
 fs/proc/base.c                            |   4 +-
 fs/proc/generic.c                         |   4 +-
 fs/proc/proc_sysctl.c                     |   4 +-
 fs/ramfs/file-nommu.c                     |   4 +-
 fs/reiserfs/inode.c                       |   4 +-
 fs/sysv/file.c                            |   4 +-
 fs/ubifs/file.c                           |   2 +-
 fs/udf/file.c                             |   4 +-
 fs/ufs/inode.c                            |   4 +-
 fs/utimes.c                               |   3 +-
 fs/xfs/xfs_iops.c                         |   2 +-
 fs/zonefs/super.c                         |   4 +-
 include/linux/fs.h                        |   8 +-
 mm/shmem.c                                |   4 +-
 57 files changed, 206 insertions(+), 137 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 25390569e24c..3de526eb2275 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -98,7 +98,7 @@ spufs_setattr(struct dentry *dentry, struct iattr *attr)
 	if ((attr->ia_valid & ATTR_SIZE) &&
 	    (attr->ia_size != inode->i_size))
 		return -EINVAL;
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index eac184e6d657..2e0c3cdb4184 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -221,7 +221,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid,
 		newattrs.ia_gid = gid;
 		newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID;
 		inode_lock(d_inode(dentry));
-		notify_change(dentry, &newattrs, NULL);
+		notify_change(&init_user_ns, dentry, &newattrs, NULL);
 		inode_unlock(d_inode(dentry));
 
 		/* mark as kernel-created inode */
@@ -328,7 +328,7 @@ static int handle_remove(const char *nodename, struct device *dev)
 			newattrs.ia_valid =
 				ATTR_UID|ATTR_GID|ATTR_MODE;
 			inode_lock(d_inode(dentry));
-			notify_change(dentry, &newattrs, NULL);
+			notify_change(&init_user_ns, dentry, &newattrs, NULL);
 			inode_unlock(d_inode(dentry));
 			err = vfs_unlink(d_inode(parent.dentry), dentry, NULL);
 			if (!err || err == -ENOENT)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f66eb3c12c8a..9c3ff6e9ab82 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1062,7 +1062,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct p9_wstat wstat;
 
 	p9_debug(P9_DEBUG_VFS, "\n");
-	retval = setattr_prepare(dentry, iattr);
+	retval = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (retval)
 		return retval;
 
@@ -1118,7 +1118,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 
 	v9fs_invalidate_inode_attr(d_inode(dentry));
 
-	setattr_copy(d_inode(dentry), iattr);
+	setattr_copy(&init_user_ns, d_inode(dentry), iattr);
 	mark_inode_dirty(d_inode(dentry));
 	return 0;
 }
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 823c2eb5f1bf..302553101fcb 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -549,7 +549,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 
 	p9_debug(P9_DEBUG_VFS, "\n");
 
-	retval = setattr_prepare(dentry, iattr);
+	retval = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (retval)
 		return retval;
 
@@ -590,7 +590,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 		truncate_setsize(inode, iattr->ia_size);
 
 	v9fs_invalidate_inode_attr(inode);
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	mark_inode_dirty(inode);
 	if (iattr->ia_valid & ATTR_MODE) {
 		/* We also want to update ACL when we update mode bits */
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 32620f4a7623..278dcee6ae22 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -299,7 +299,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
 	unsigned int ia_valid = attr->ia_valid;
 	int error;
 	
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 
 	/*
 	 * we can't change the UID or GID of any file -
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 044412110b52..767e5bdfb703 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -223,7 +223,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
 
 	pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid);
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		goto out;
 
@@ -249,7 +249,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
 		affs_truncate(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 
 	if (attr->ia_valid & ATTR_MODE)
diff --git a/fs/attr.c b/fs/attr.c
index 00ae0b000146..f4543d2abdfb 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,27 +18,55 @@
 #include <linux/evm.h>
 #include <linux/ima.h>
 
-static bool chown_ok(const struct inode *inode, kuid_t uid)
+/**
+ * chown_ok - verify permissions to chown inode
+ * @mnt_userns:	user namespace of the mount @inode was found from
+ * @inode:	inode to check permissions on
+ * @uid:	uid to chown @inode to
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ */
+static bool chown_ok(struct user_namespace *mnt_userns,
+		     const struct inode *inode,
+		     kuid_t uid)
 {
-	if (uid_eq(current_fsuid(), inode->i_uid) &&
-	    uid_eq(uid, inode->i_uid))
+	kuid_t kuid = i_uid_into_mnt(mnt_userns, inode);
+	if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, kuid))
 		return true;
-	if (capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_CHOWN))
+	if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
 		return true;
-	if (uid_eq(inode->i_uid, INVALID_UID) &&
+	if (uid_eq(kuid, INVALID_UID) &&
 	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
 		return true;
 	return false;
 }
 
-static bool chgrp_ok(const struct inode *inode, kgid_t gid)
+/**
+ * chgrp_ok - verify permissions to chgrp inode
+ * @mnt_userns:	user namespace of the mount @inode was found from
+ * @inode:	inode to check permissions on
+ * @gid:	gid to chown @inode to
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ */
+static bool chgrp_ok(struct user_namespace *mnt_userns,
+		     const struct inode *inode, kgid_t gid)
 {
-	if (uid_eq(current_fsuid(), inode->i_uid) &&
-	    (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
+	kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
+	if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) &&
+	    (in_group_p(gid) || gid_eq(gid, kgid)))
 		return true;
-	if (capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_CHOWN))
+	if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
 		return true;
-	if (gid_eq(inode->i_gid, INVALID_GID) &&
+	if (gid_eq(kgid, INVALID_GID) &&
 	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
 		return true;
 	return false;
@@ -46,6 +74,7 @@ static bool chgrp_ok(const struct inode *inode, kgid_t gid)
 
 /**
  * setattr_prepare - check if attribute changes to a dentry are allowed
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dentry:	dentry to check
  * @attr:	attributes to change
  *
@@ -55,10 +84,17 @@ static bool chgrp_ok(const struct inode *inode, kgid_t gid)
  * SGID bit from mode if user is not allowed to set it. Also file capabilities
  * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set.
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
  * Should be called as the first thing in ->setattr implementations,
  * possibly after taking additional locks.
  */
-int setattr_prepare(struct dentry *dentry, struct iattr *attr)
+int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
+		    struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	unsigned int ia_valid = attr->ia_valid;
@@ -78,27 +114,27 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
 		goto kill_priv;
 
 	/* Make sure a caller can chown. */
-	if ((ia_valid & ATTR_UID) && !chown_ok(inode, attr->ia_uid))
+	if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid))
 		return -EPERM;
 
 	/* Make sure caller can chgrp. */
-	if ((ia_valid & ATTR_GID) && !chgrp_ok(inode, attr->ia_gid))
+	if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid))
 		return -EPERM;
 
 	/* Make sure a caller can chmod. */
 	if (ia_valid & ATTR_MODE) {
-		if (!inode_owner_or_capable(&init_user_ns, inode))
+		if (!inode_owner_or_capable(mnt_userns, inode))
 			return -EPERM;
 		/* Also check the setgid bit! */
-		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
-				inode->i_gid) &&
-		    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+               if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
+                                i_gid_into_mnt(mnt_userns, inode)) &&
+                    !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
 			attr->ia_mode &= ~S_ISGID;
 	}
 
 	/* Check for setting the inode time. */
 	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
-		if (!inode_owner_or_capable(&init_user_ns, inode))
+		if (!inode_owner_or_capable(mnt_userns, inode))
 			return -EPERM;
 	}
 
@@ -162,20 +198,33 @@ EXPORT_SYMBOL(inode_newsize_ok);
 
 /**
  * setattr_copy - copy simple metadata updates into the generic inode
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode:	the inode to be updated
  * @attr:	the new attributes
  *
  * setattr_copy must be called with i_mutex held.
  *
  * setattr_copy updates the inode's metadata with that specified
- * in attr. Noticeably missing is inode size update, which is more complex
+ * in attr on idmapped mounts. If file ownership is changed setattr_copy
+ * doesn't map ia_uid and ia_gid. It will asssume the caller has already
+ * provided the intended values. Necessary permission checks to determine
+ * whether or not the S_ISGID property needs to be removed are performed with
+ * the correct idmapped mount permission helpers.
+ * Noticeably missing is inode size update, which is more complex
  * as it requires pagecache updates.
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
  * The inode is not marked as dirty after this operation. The rationale is
  * that for "simple" filesystems, the struct inode is the inode storage.
  * The caller is free to mark the inode dirty afterwards if needed.
  */
-void setattr_copy(struct inode *inode, const struct iattr *attr)
+void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
+		  const struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 
@@ -191,9 +240,9 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
 		inode->i_ctime = attr->ia_ctime;
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;
-
-		if (!in_group_p(inode->i_gid) &&
-		    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+		kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
+		if (!in_group_p(kgid) &&
+		    !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
 			mode &= ~S_ISGID;
 		inode->i_mode = mode;
 	}
@@ -202,6 +251,7 @@ EXPORT_SYMBOL(setattr_copy);
 
 /**
  * notify_change - modify attributes of a filesytem object
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dentry:	object affected
  * @attr:	new attributes
  * @delegated_inode: returns inode, if the inode is delegated
@@ -214,13 +264,23 @@ EXPORT_SYMBOL(setattr_copy);
  * retry.  Because breaking a delegation may take a long time, the
  * caller should drop the i_mutex before doing so.
  *
+ * If file ownership is changed notify_change() doesn't map ia_uid and
+ * ia_gid. It will asssume the caller has already provided the intended values.
+ *
  * Alternatively, a caller may pass NULL for delegated_inode.  This may
  * be appropriate for callers that expect the underlying filesystem not
  * to be NFS exported.  Also, passing NULL is fine for callers holding
  * the file open for write, as there can be no conflicting delegation in
  * that case.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
  */
-int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
+int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr, struct inode **delegated_inode)
 {
 	struct inode *inode = dentry->d_inode;
 	umode_t mode = inode->i_mode;
@@ -243,9 +303,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 		if (IS_IMMUTABLE(inode))
 			return -EPERM;
 
-		if (!inode_owner_or_capable(&init_user_ns, inode)) {
-			error = inode_permission(&init_user_ns, inode,
-						 MAY_WRITE);
+		if (!inode_owner_or_capable(mnt_userns, inode)) {
+			error = inode_permission(mnt_userns, inode, MAY_WRITE);
 			if (error)
 				return error;
 		}
@@ -321,9 +380,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 	/* Don't allow modifications of files with invalid uids or
 	 * gids unless those uids & gids are being made valid.
 	 */
-	if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid))
+	if (!(ia_valid & ATTR_UID) &&
+	    !uid_valid(i_uid_into_mnt(mnt_userns, inode)))
 		return -EOVERFLOW;
-	if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid))
+	if (!(ia_valid & ATTR_GID) &&
+	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
 		return -EOVERFLOW;
 
 	error = security_inode_setattr(dentry, attr);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 07fe8b2f3bab..792191a8705b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5054,7 +5054,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (btrfs_root_readonly(root))
 		return -EROFS;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		return err;
 
@@ -5065,7 +5065,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (attr->ia_valid) {
-		setattr_copy(inode, attr);
+		setattr_copy(&init_user_ns, inode, attr);
 		inode_inc_iversion(inode);
 		err = btrfs_dirty_inode(inode);
 
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 4cea5fbf695e..5efa6a3702c0 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -470,14 +470,14 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
 		_debug("discard tail %llx", oi_size);
 		newattrs.ia_valid = ATTR_SIZE;
 		newattrs.ia_size = oi_size & PAGE_MASK;
-		ret = notify_change(object->backer, &newattrs, NULL);
+		ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL);
 		if (ret < 0)
 			goto truncate_failed;
 	}
 
 	newattrs.ia_valid = ATTR_SIZE;
 	newattrs.ia_size = ni_size;
-	ret = notify_change(object->backer, &newattrs, NULL);
+	ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL);
 
 truncate_failed:
 	inode_unlock(d_inode(object->backer));
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e8a15ee09bc1..285d3baca27e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2247,7 +2247,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err != 0)
 		return err;
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a83b3a8ffaac..27554f71f744 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2610,7 +2610,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
 		attrs->ia_valid |= ATTR_FORCE;
 
-	rc = setattr_prepare(direntry, attrs);
+	rc = setattr_prepare(&init_user_ns, direntry, attrs);
 	if (rc < 0)
 		goto out;
 
@@ -2715,7 +2715,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	    attrs->ia_size != i_size_read(inode))
 		truncate_setsize(inode, attrs->ia_size);
 
-	setattr_copy(inode, attrs);
+	setattr_copy(&init_user_ns, inode, attrs);
 	mark_inode_dirty(inode);
 
 	/* force revalidate when any of these times are set since some
@@ -2757,7 +2757,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
 		attrs->ia_valid |= ATTR_FORCE;
 
-	rc = setattr_prepare(direntry, attrs);
+	rc = setattr_prepare(&init_user_ns, direntry, attrs);
 	if (rc < 0) {
 		free_xid(xid);
 		return rc;
@@ -2913,7 +2913,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 	    attrs->ia_size != i_size_read(inode))
 		truncate_setsize(inode, attrs->ia_size);
 
-	setattr_copy(inode, attrs);
+	setattr_copy(&init_user_ns, inode, attrs);
 	mark_inode_dirty(inode);
 
 cifs_setattr_exit:
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 0b346baf110d..d3ea0c57b075 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -855,7 +855,8 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 		struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 
 		inode_lock(d_inode(lower_dentry));
-		rc = notify_change(lower_dentry, &lower_ia, NULL);
+		rc = notify_change(&init_user_ns, lower_dentry,
+				   &lower_ia, NULL);
 		inode_unlock(d_inode(lower_dentry));
 	}
 	return rc;
@@ -934,7 +935,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
 
-	rc = setattr_prepare(dentry, ia);
+	rc = setattr_prepare(&init_user_ns, dentry, ia);
 	if (rc)
 		goto out;
 	if (ia->ia_valid & ATTR_SIZE) {
@@ -960,7 +961,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
 		lower_ia.ia_valid &= ~ATTR_MODE;
 
 	inode_lock(d_inode(lower_dentry));
-	rc = notify_change(lower_dentry, &lower_ia, NULL);
+	rc = notify_change(&init_user_ns, lower_dentry, &lower_ia, NULL);
 	inode_unlock(d_inode(lower_dentry));
 out:
 	fsstack_copy_attr_all(inode, lower_inode);
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index a92478eabfa4..ace35aa8e64b 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -305,7 +305,7 @@ int exfat_setattr(struct dentry *dentry, struct iattr *attr)
 				ATTR_TIMES_SET);
 	}
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	attr->ia_valid = ia_valid;
 	if (error)
 		goto out;
@@ -340,7 +340,7 @@ int exfat_setattr(struct dentry *dentry, struct iattr *attr)
 		up_write(&EXFAT_I(inode)->truncate_lock);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	exfat_truncate_atime(&inode->i_atime);
 	mark_inode_dirty(inode);
 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 78c417d3c898..06c0cf28c1a0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1669,7 +1669,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(dentry, iattr);
+	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
 		return error;
 
@@ -1689,7 +1689,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 		if (error)
 			return error;
 	}
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	if (iattr->ia_valid & ATTR_MODE)
 		error = posix_acl_chmod(inode, inode->i_mode);
 	mark_inode_dirty(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c173c8405856..8edfa3e226e6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5337,7 +5337,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 				  ATTR_GID | ATTR_TIMES_SET))))
 		return -EPERM;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -5512,7 +5512,7 @@ out_mmap_sem:
 	}
 
 	if (!error) {
-		setattr_copy(inode, attr);
+		setattr_copy(&init_user_ns, inode, attr);
 		mark_inode_dirty(inode);
 	}
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5fc0ff28b5dd..90d7b89176de 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -831,7 +831,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
 }
 
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
-static void __setattr_copy(struct inode *inode, const struct iattr *attr)
+static void __setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
+			   const struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 
@@ -847,8 +848,9 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr)
 		inode->i_ctime = attr->ia_ctime;
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;
+		kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
 
-		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+		if (!in_group_p(kgid) && !capable(CAP_FSETID))
 			mode &= ~S_ISGID;
 		set_acl_inode(inode, mode);
 	}
@@ -869,7 +871,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 		!f2fs_is_compress_backend_ready(inode))
 		return -EOPNOTSUPP;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		return err;
 
@@ -945,7 +947,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 		spin_unlock(&F2FS_I(inode)->i_size_lock);
 	}
 
-	__setattr_copy(inode, attr);
+	__setattr_copy(&init_user_ns, inode, attr);
 
 	if (attr->ia_valid & ATTR_MODE) {
 		err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode));
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f9ee27cf4d7c..805b501467e9 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -480,7 +480,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 			attr->ia_valid &= ~TIMES_SET_FLAGS;
 	}
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	attr->ia_valid = ia_valid;
 	if (error) {
 		if (sbi->options.quiet)
@@ -550,7 +550,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 		fat_truncate_time(inode, &attr->ia_mtime, S_MTIME);
 	attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME);
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 out:
 	return error;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 7497009a5a45..74fdb6a7ebb3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1611,7 +1611,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	if (!fc->default_permissions)
 		attr->ia_valid |= ATTR_FORCE;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		return err;
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5b2ff0c74b67..59c25181d108 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1861,7 +1861,7 @@ int gfs2_permission(struct inode *inode, int mask)
 
 static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr)
 {
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
@@ -1982,7 +1982,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 		goto error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		goto error;
 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index f35a37c65e5f..c646218b72bf 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -608,7 +608,7 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 	struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
 	int error;
 
-	error = setattr_prepare(dentry, attr); /* basic permission checks */
+	error = setattr_prepare(&init_user_ns, dentry, attr); /* basic permission checks */
 	if (error)
 		return error;
 
@@ -647,7 +647,7 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 						  current_time(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 21357046b039..ffa137f8234e 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -246,7 +246,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -264,7 +264,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 		inode->i_mtime = inode->i_ctime = current_time(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 
 	return 0;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index b841a05a2b8c..6970e29a5287 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -792,7 +792,7 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	int fd = HOSTFS_I(inode)->fd;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		return err;
 
@@ -849,7 +849,7 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
 	    attr->ia_size != i_size_read(inode))
 		truncate_setsize(inode, attr->ia_size);
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index eb8b4baf0f2e..8ba2152a78ba 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -274,7 +274,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
 	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
 		goto out_unlock;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		goto out_unlock;
 
@@ -288,7 +288,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
 		hpfs_truncate(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 
 	hpfs_write_inode(inode);
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6737929e443c..327e572b4e00 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -761,7 +761,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	BUG_ON(!inode);
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -780,7 +780,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 			return error;
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/inode.c b/fs/inode.c
index a9ac97a27784..49b512592dcd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1912,7 +1912,7 @@ static int __remove_privs(struct dentry *dentry, int kill)
 	 * Note we call this on write, so notify_change will not
 	 * encounter any conflicting delegations:
 	 */
-	return notify_change(dentry, &newattrs, NULL);
+	return notify_change(&init_user_ns, dentry, &newattrs, NULL);
 }
 
 /*
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 78858f6e9583..67993808f4da 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -195,7 +195,7 @@ int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct inode *inode = d_inode(dentry);
 	int rc;
 
-	rc = setattr_prepare(dentry, iattr);
+	rc = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (rc)
 		return rc;
 
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 930d2701f206..ff49876e9c9b 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -90,7 +90,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct inode *inode = d_inode(dentry);
 	int rc;
 
-	rc = setattr_prepare(dentry, iattr);
+	rc = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (rc)
 		return rc;
 
@@ -118,7 +118,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 		jfs_truncate(inode);
 	}
 
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	mark_inode_dirty(inode);
 
 	if (iattr->ia_valid & ATTR_MODE)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index ff5598cc1de0..86bd4c593b78 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -122,7 +122,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
 		return -EINVAL;
 
 	mutex_lock(&kernfs_mutex);
-	error = setattr_prepare(dentry, iattr);
+	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
 		goto out;
 
@@ -131,7 +131,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
 		goto out;
 
 	/* this ignores size changes */
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 
 out:
 	mutex_unlock(&kernfs_mutex);
diff --git a/fs/libfs.c b/fs/libfs.c
index f8b3c02b4f0f..a73fe109403c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -497,13 +497,13 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(dentry, iattr);
+	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
 		return error;
 
 	if (iattr->ia_valid & ATTR_SIZE)
 		truncate_setsize(inode, iattr->ia_size);
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/minix/file.c b/fs/minix/file.c
index c50b0a20fcd9..f07acd268577 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -27,7 +27,7 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -41,7 +41,7 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
 		minix_truncate(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 9473d048efec..0ea0554d20d1 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -90,7 +90,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
 		if (delta < 0)
 			delta = -delta;
 		if (delta < MAX_TOUCH_TIME_ERROR &&
-		    setattr_prepare(fhp->fh_dentry, iap) != 0) {
+		    setattr_prepare(&init_user_ns, fhp->fh_dentry, iap) != 0) {
 			/*
 			 * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME.
 			 * This will cause notify_change to set these times
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 0edf11258aaa..1905b39be1c2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -448,7 +448,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 			.ia_size	= iap->ia_size,
 		};
 
-		host_err = notify_change(dentry, &size_attr, NULL);
+		host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
 		if (host_err)
 			goto out_unlock;
 		iap->ia_valid &= ~ATTR_SIZE;
@@ -463,7 +463,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	}
 
 	iap->ia_valid |= ATTR_CTIME;
-	host_err = notify_change(dentry, iap, NULL);
+	host_err = notify_change(&init_user_ns, dentry, iap, NULL);
 
 out_unlock:
 	fh_unlock(fhp);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 11225a659736..8aad3c48092a 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -812,7 +812,7 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct super_block *sb = inode->i_sb;
 	int err;
 
-	err = setattr_prepare(dentry, iattr);
+	err = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (err)
 		return err;
 
@@ -827,7 +827,7 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
 		nilfs_truncate(inode);
 	}
 
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	mark_inode_dirty(inode);
 
 	if (iattr->ia_valid & ATTR_MODE) {
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index f7e4cbc26eaf..38f4cf1d4497 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2866,7 +2866,7 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
 	int err;
 	unsigned int ia_valid = attr->ia_valid;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		goto out;
 	/* We do not support NTFS ACLs yet. */
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 37c7d03a6284..9fa66cd1f622 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -196,11 +196,11 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = d_inode(dentry);
 
 	attr->ia_valid &= ~ATTR_SIZE;
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0c75619adf54..cabf355b148f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1142,7 +1142,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
 		return 0;
 
-	status = setattr_prepare(dentry, attr);
+	status = setattr_prepare(&init_user_ns, dentry, attr);
 	if (status)
 		return status;
 
@@ -1263,7 +1263,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 
 	status = ocfs2_mark_inode_dirty(handle, inode, bh);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 2c7b70ee1388..729339cd7902 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -348,7 +348,7 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -361,7 +361,7 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
 		omfs_truncate(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/open.c b/fs/open.c
index a6dac6d97988..c3e4dc43dd8d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -61,7 +61,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 
 	inode_lock(dentry->d_inode);
 	/* Note any delegations or leases have already been broken: */
-	ret = notify_change(dentry, &newattrs, NULL);
+	ret = notify_change(&init_user_ns, dentry, &newattrs, NULL);
 	inode_unlock(dentry->d_inode);
 	return ret;
 }
@@ -580,7 +580,8 @@ retry_deleg:
 		goto out_unlock;
 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-	error = notify_change(path->dentry, &newattrs, &delegated_inode);
+	error = notify_change(&init_user_ns, path->dentry, &newattrs,
+			      &delegated_inode);
 out_unlock:
 	inode_unlock(inode);
 	if (delegated_inode) {
@@ -671,7 +672,8 @@ retry_deleg:
 	inode_lock(inode);
 	error = security_path_chown(path, uid, gid);
 	if (!error)
-		error = notify_change(path->dentry, &newattrs, &delegated_inode);
+		error = notify_change(&init_user_ns, path->dentry, &newattrs,
+				      &delegated_inode);
 	inode_unlock(inode);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 4c790cc8042d..8ac9491ceb9a 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -855,7 +855,7 @@ again:
 		ORANGEFS_I(inode)->attr_uid = current_fsuid();
 		ORANGEFS_I(inode)->attr_gid = current_fsgid();
 	}
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	spin_unlock(&inode->i_lock);
 	mark_inode_dirty(inode);
 
@@ -876,7 +876,7 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
 	int ret;
 	gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n",
 	    dentry);
-	ret = setattr_prepare(dentry, iattr);
+	ret = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (ret)
 	        goto out;
 	ret = __orangefs_setattr(d_inode(dentry), iattr);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index e5b616c93e11..3e9957ae19fa 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -235,7 +235,7 @@ static int ovl_set_size(struct dentry *upperdentry, struct kstat *stat)
 		.ia_size = stat->size,
 	};
 
-	return notify_change(upperdentry, &attr, NULL);
+	return notify_change(&init_user_ns, upperdentry, &attr, NULL);
 }
 
 static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
@@ -247,7 +247,7 @@ static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
 		.ia_mtime = stat->mtime,
 	};
 
-	return notify_change(upperdentry, &attr, NULL);
+	return notify_change(&init_user_ns, upperdentry, &attr, NULL);
 }
 
 int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
@@ -259,7 +259,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
 			.ia_valid = ATTR_MODE,
 			.ia_mode = stat->mode,
 		};
-		err = notify_change(upperdentry, &attr, NULL);
+		err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
 	}
 	if (!err) {
 		struct iattr attr = {
@@ -267,7 +267,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
 			.ia_uid = stat->uid,
 			.ia_gid = stat->gid,
 		};
-		err = notify_change(upperdentry, &attr, NULL);
+		err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
 	}
 	if (!err)
 		ovl_set_timestamps(upperdentry, stat);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 98a23353b19a..29840820a46c 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -508,7 +508,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
 			.ia_mode = cattr->mode,
 		};
 		inode_lock(newdentry->d_inode);
-		err = notify_change(newdentry, &attr, NULL);
+		err = notify_change(&init_user_ns, newdentry, &attr, NULL);
 		inode_unlock(newdentry->d_inode);
 		if (err)
 			goto out_cleanup;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index c101ebbb7a77..5aa66881dbd7 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -21,7 +21,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
 	struct dentry *upperdentry;
 	const struct cred *old_cred;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		return err;
 
@@ -79,7 +79,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
 
 		inode_lock(upperdentry->d_inode);
 		old_cred = ovl_override_creds(dentry->d_sb);
-		err = notify_change(upperdentry, attr, NULL);
+		err = notify_change(&init_user_ns, upperdentry, attr, NULL);
 		revert_creds(old_cred);
 		if (!err)
 			ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 3e925deaa19a..39b2e9aa0e5b 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -804,7 +804,7 @@ retry:
 
 		/* Clear any inherited mode bits */
 		inode_lock(work->d_inode);
-		err = notify_change(work, &attr, NULL);
+		err = notify_change(&init_user_ns, work, &attr, NULL);
 		inode_unlock(work->d_inode);
 		if (err)
 			goto out_dput;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b4ec9293625e..bb4e63a3684f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -693,11 +693,11 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
 	if (attr->ia_valid & ATTR_MODE)
 		return -EPERM;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 6c0a05f55d6b..6d4fabab8aa7 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -121,11 +121,11 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 	struct proc_dir_entry *de = PDE(inode);
 	int error;
 
-	error = setattr_prepare(dentry, iattr);
+	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
 		return error;
 
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	mark_inode_dirty(inode);
 
 	proc_set_user(de, inode->i_uid, inode->i_gid);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 317899222d7f..ec67dbc1f705 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -821,11 +821,11 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 	if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
 		return -EPERM;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 355523f4a4bf..f0358fe410d3 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -165,7 +165,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
 	int ret = 0;
 
 	/* POSIX UID/GID verification for setting inode attributes */
-	ret = setattr_prepare(dentry, ia);
+	ret = setattr_prepare(&init_user_ns, dentry, ia);
 	if (ret)
 		return ret;
 
@@ -185,7 +185,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
 		}
 	}
 
-	setattr_copy(inode, ia);
+	setattr_copy(&init_user_ns, inode, ia);
  out:
 	ia->ia_valid = old_ia_valid;
 	return ret;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index c76d563dec0e..944f2b487cf8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3288,7 +3288,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	unsigned int ia_valid;
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -3413,7 +3413,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (!error) {
-		setattr_copy(inode, attr);
+		setattr_copy(&init_user_ns, inode, attr);
 		mark_inode_dirty(inode);
 	}
 
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 45fc79a18594..ca7e216b7b9e 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -34,7 +34,7 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -47,7 +47,7 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
 		sysv_truncate(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2bc7780d2963..76ef392b1e41 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1265,7 +1265,7 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	dbg_gen("ino %lu, mode %#x, ia_valid %#x",
 		inode->i_ino, inode->i_mode, attr->ia_valid);
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		return err;
 
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 3671a40ed3c3..7c7d161315c2 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -259,7 +259,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
 	struct super_block *sb = inode->i_sb;
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -282,7 +282,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
 	if (attr->ia_valid & ATTR_MODE)
 		udf_update_extra_perms(inode, attr->ia_mode);
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index c843ec858cf7..6b51f3b20143 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -1217,7 +1217,7 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 	unsigned int ia_valid = attr->ia_valid;
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -1227,7 +1227,7 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 			return error;
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/utimes.c b/fs/utimes.c
index fd3cc4226224..4572b91ddb91 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -62,7 +62,8 @@ int vfs_utimes(const struct path *path, struct timespec64 *times)
 	}
 retry_deleg:
 	inode_lock(inode);
-	error = notify_change(path->dentry, &newattrs, &delegated_inode);
+	error = notify_change(&init_user_ns, path->dentry, &newattrs,
+			      &delegated_inode);
 	inode_unlock(inode);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 67c8dc9de8aa..08a478d25122 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -637,7 +637,7 @@ xfs_vn_change_ok(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	return setattr_prepare(dentry, iattr);
+	return setattr_prepare(&init_user_ns, dentry, iattr);
 }
 
 /*
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 569525ee8f69..8a1f69677784 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -488,7 +488,7 @@ static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (unlikely(IS_IMMUTABLE(inode)))
 		return -EPERM;
 
-	ret = setattr_prepare(dentry, iattr);
+	ret = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (ret)
 		return ret;
 
@@ -516,7 +516,7 @@ static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr)
 			return ret;
 	}
 
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 
 	return 0;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2a9d4af6a64d..e3ea1d7c3367 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2809,7 +2809,8 @@ static inline int bmap(struct inode *inode,  sector_t *block)
 }
 #endif
 
-extern int notify_change(struct dentry *, struct iattr *, struct inode **);
+int notify_change(struct user_namespace *, struct dentry *,
+		  struct iattr *, struct inode **);
 int inode_permission(struct user_namespace *, struct inode *, int);
 int generic_permission(struct user_namespace *, struct inode *, int);
 static inline int file_permission(struct file *file, int mask)
@@ -3274,9 +3275,10 @@ extern int buffer_migrate_page_norefs(struct address_space *,
 #define buffer_migrate_page_norefs NULL
 #endif
 
-extern int setattr_prepare(struct dentry *, struct iattr *);
+int setattr_prepare(struct user_namespace *, struct dentry *, struct iattr *);
 extern int inode_newsize_ok(const struct inode *, loff_t offset);
-extern void setattr_copy(struct inode *inode, const struct iattr *attr);
+void setattr_copy(struct user_namespace *, struct inode *inode,
+		  const struct iattr *attr);
 
 extern int file_update_time(struct file *file);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 1c68c9edba5e..1cb451e131ec 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1087,7 +1087,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -1141,7 +1141,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	if (attr->ia_valid & ATTR_MODE)
 		error = posix_acl_chmod(inode, inode->i_mode);
 	return error;
-- 
cgit v1.2.3


From e65ce2a50cf6af216bea6fd80d771fcbb4c0aaa1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:27 +0100
Subject: acl: handle idmapped mounts

The posix acl permission checking helpers determine whether a caller is
privileged over an inode according to the acls associated with the
inode. Add helpers that make it possible to handle acls on idmapped
mounts.

The vfs and the filesystems targeted by this first iteration make use of
posix_acl_fix_xattr_from_user() and posix_acl_fix_xattr_to_user() to
translate basic posix access and default permissions such as the
ACL_USER and ACL_GROUP type according to the initial user namespace (or
the superblock's user namespace) to and from the caller's current user
namespace. Adapt these two helpers to handle idmapped mounts whereby we
either map from or into the mount's user namespace depending on in which
direction we're translating.
Similarly, cap_convert_nscap() is used by the vfs to translate user
namespace and non-user namespace aware filesystem capabilities from the
superblock's user namespace to the caller's user namespace. Enable it to
handle idmapped mounts by accounting for the mount's user namespace.

In addition the fileystems targeted in the first iteration of this patch
series make use of the posix_acl_chmod() and, posix_acl_update_mode()
helpers. Both helpers perform permission checks on the target inode. Let
them handle idmapped mounts. These two helpers are called when posix
acls are set by the respective filesystems to handle this case we extend
the ->set() method to take an additional user namespace argument to pass
the mount's user namespace down.

Link: https://lore.kernel.org/r/20210121131959.646623-9-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 Documentation/filesystems/locking.rst |  7 ++--
 Documentation/filesystems/porting.rst |  2 +
 fs/9p/acl.c                           |  4 +-
 fs/9p/xattr.c                         |  1 +
 fs/afs/xattr.c                        |  2 +
 fs/btrfs/acl.c                        |  3 +-
 fs/btrfs/inode.c                      |  3 +-
 fs/btrfs/xattr.c                      |  2 +
 fs/ceph/acl.c                         |  3 +-
 fs/ceph/inode.c                       |  2 +-
 fs/ceph/xattr.c                       |  1 +
 fs/cifs/xattr.c                       |  1 +
 fs/ecryptfs/inode.c                   |  1 +
 fs/ext2/acl.c                         |  3 +-
 fs/ext2/inode.c                       |  2 +-
 fs/ext2/xattr_security.c              |  1 +
 fs/ext2/xattr_trusted.c               |  1 +
 fs/ext2/xattr_user.c                  |  1 +
 fs/ext4/acl.c                         |  3 +-
 fs/ext4/inode.c                       |  2 +-
 fs/ext4/xattr_hurd.c                  |  1 +
 fs/ext4/xattr_security.c              |  1 +
 fs/ext4/xattr_trusted.c               |  1 +
 fs/ext4/xattr_user.c                  |  1 +
 fs/f2fs/acl.c                         |  3 +-
 fs/f2fs/file.c                        |  7 ++--
 fs/f2fs/xattr.c                       |  2 +
 fs/fuse/xattr.c                       |  2 +
 fs/gfs2/acl.c                         |  2 +-
 fs/gfs2/inode.c                       |  3 +-
 fs/gfs2/xattr.c                       |  1 +
 fs/hfs/attr.c                         |  1 +
 fs/hfsplus/xattr.c                    |  1 +
 fs/hfsplus/xattr_security.c           |  1 +
 fs/hfsplus/xattr_trusted.c            |  1 +
 fs/hfsplus/xattr_user.c               |  1 +
 fs/jffs2/acl.c                        |  3 +-
 fs/jffs2/fs.c                         |  2 +-
 fs/jffs2/security.c                   |  1 +
 fs/jffs2/xattr_trusted.c              |  1 +
 fs/jffs2/xattr_user.c                 |  1 +
 fs/jfs/acl.c                          |  2 +-
 fs/jfs/file.c                         |  2 +-
 fs/jfs/xattr.c                        |  2 +
 fs/kernfs/inode.c                     |  2 +
 fs/nfs/nfs4proc.c                     |  3 ++
 fs/nfsd/nfs2acl.c                     |  6 ++-
 fs/nfsd/nfs3acl.c                     |  6 ++-
 fs/nfsd/nfs4acl.c                     |  5 ++-
 fs/ocfs2/acl.c                        |  3 +-
 fs/ocfs2/xattr.c                      |  3 ++
 fs/orangefs/acl.c                     |  3 +-
 fs/orangefs/inode.c                   |  2 +-
 fs/orangefs/xattr.c                   |  1 +
 fs/overlayfs/super.c                  |  3 ++
 fs/posix_acl.c                        | 79 +++++++++++++++++++++++++----------
 fs/reiserfs/xattr_acl.c               |  5 ++-
 fs/reiserfs/xattr_security.c          |  3 +-
 fs/reiserfs/xattr_trusted.c           |  3 +-
 fs/reiserfs/xattr_user.c              |  3 +-
 fs/ubifs/xattr.c                      |  1 +
 fs/xattr.c                            | 14 ++++---
 fs/xfs/xfs_acl.c                      |  3 +-
 fs/xfs/xfs_iops.c                     |  2 +-
 fs/xfs/xfs_xattr.c                    |  7 ++--
 include/linux/capability.h            |  3 +-
 include/linux/posix_acl.h             | 11 +++--
 include/linux/posix_acl_xattr.h       | 12 ++++--
 include/linux/xattr.h                 |  3 +-
 mm/shmem.c                            |  3 +-
 net/socket.c                          |  1 +
 security/commoncap.c                  | 45 ++++++++++++++++----
 72 files changed, 238 insertions(+), 85 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index c0f2c7586531..b7dcc86c92a4 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -126,9 +126,10 @@ prototypes::
 	int (*get)(const struct xattr_handler *handler, struct dentry *dentry,
 		   struct inode *inode, const char *name, void *buffer,
 		   size_t size);
-	int (*set)(const struct xattr_handler *handler, struct dentry *dentry,
-		   struct inode *inode, const char *name, const void *buffer,
-		   size_t size, int flags);
+	int (*set)(const struct xattr_handler *handler,
+                   struct user_namespace *mnt_userns,
+                   struct dentry *dentry, struct inode *inode, const char *name,
+                   const void *buffer, size_t size, int flags);
 
 locking rules:
 	all may block
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 867036aa90b8..de1dcec3b5b8 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -717,6 +717,8 @@ be removed.  Switch while you still can; the old one won't stay.
 **mandatory**
 
 ->setxattr() and xattr_handler.set() get dentry and inode passed separately.
+The xattr_handler.set() gets passed the user namespace of the mount the inode
+is seen from so filesystems can idmap the i_uid and i_gid accordingly.
 dentry might be yet to be attached to inode, so do _not_ use its ->d_inode
 in the instances.  Rationale: !@#!@# security_d_instantiate() needs to be
 called before we attach dentry to inode and !@#!@##!@$!$#!@#$!@$!@$ smack
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index d77b28e8d57a..1c14f18a6ec9 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -239,6 +239,7 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
 }
 
 static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
+			      struct user_namespace *mnt_userns,
 			      struct dentry *dentry, struct inode *inode,
 			      const char *name, const void *value,
 			      size_t size, int flags)
@@ -279,7 +280,8 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 			struct iattr iattr = { 0 };
 			struct posix_acl *old_acl = acl;
 
-			retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
+			retval = posix_acl_update_mode(mnt_userns, inode,
+						       &iattr.ia_mode, &acl);
 			if (retval)
 				goto err_out;
 			if (!acl) {
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 87217dd0433e..ee331845e2c7 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -157,6 +157,7 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
+				  struct user_namespace *mnt_userns,
 				  struct dentry *dentry, struct inode *inode,
 				  const char *name, const void *value,
 				  size_t size, int flags)
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 95c573dcda11..c629caae5002 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -120,6 +120,7 @@ static const struct afs_operation_ops afs_store_acl_operation = {
  * Set a file's AFS3 ACL.
  */
 static int afs_xattr_set_acl(const struct xattr_handler *handler,
+			     struct user_namespace *mnt_userns,
                              struct dentry *dentry,
                              struct inode *inode, const char *name,
                              const void *buffer, size_t size, int flags)
@@ -248,6 +249,7 @@ static const struct afs_operation_ops yfs_store_opaque_acl2_operation = {
  * Set a file's YFS ACL.
  */
 static int afs_xattr_set_yfs(const struct xattr_handler *handler,
+			     struct user_namespace *mnt_userns,
                              struct dentry *dentry,
                              struct inode *inode, const char *name,
                              const void *buffer, size_t size, int flags)
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index a0af1b952c4d..d12a5a8730a8 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -113,7 +113,8 @@ int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	umode_t old_mode = inode->i_mode;
 
 	if (type == ACL_TYPE_ACCESS && acl) {
-		ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+		ret = posix_acl_update_mode(&init_user_ns, inode,
+					    &inode->i_mode, &acl);
 		if (ret)
 			return ret;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 792191a8705b..6c18fb1a25af 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5070,7 +5070,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		err = btrfs_dirty_inode(inode);
 
 		if (!err && attr->ia_valid & ATTR_MODE)
-			err = posix_acl_chmod(inode, inode->i_mode);
+			err = posix_acl_chmod(&init_user_ns, inode,
+					      inode->i_mode);
 	}
 
 	return err;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index af6246f36a9e..b025102e435f 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -362,6 +362,7 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
+				   struct user_namespace *mnt_userns,
 				   struct dentry *unused, struct inode *inode,
 				   const char *name, const void *buffer,
 				   size_t size, int flags)
@@ -371,6 +372,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
 }
 
 static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
+					struct user_namespace *mnt_userns,
 					struct dentry *unused, struct inode *inode,
 					const char *name, const void *value,
 					size_t size, int flags)
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index e0465741c591..52a01ddbc4ac 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -100,7 +100,8 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	case ACL_TYPE_ACCESS:
 		name = XATTR_NAME_POSIX_ACL_ACCESS;
 		if (acl) {
-			ret = posix_acl_update_mode(inode, &new_mode, &acl);
+			ret = posix_acl_update_mode(&init_user_ns, inode,
+						    &new_mode, &acl);
 			if (ret)
 				goto out;
 		}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 285d3baca27e..145e26a4ddbb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2262,7 +2262,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	err = __ceph_setattr(inode, attr);
 
 	if (err >= 0 && (attr->ia_valid & ATTR_MODE))
-		err = posix_acl_chmod(inode, attr->ia_mode);
+		err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode);
 
 	return err;
 }
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 24997982de01..02f59bcb4f27 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1238,6 +1238,7 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler,
 }
 
 static int ceph_set_xattr_handler(const struct xattr_handler *handler,
+				  struct user_namespace *mnt_userns,
 				  struct dentry *unused, struct inode *inode,
 				  const char *name, const void *value,
 				  size_t size, int flags)
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 6b658a1172ef..41a611e76bb7 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -101,6 +101,7 @@ static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon,
 }
 
 static int cifs_xattr_set(const struct xattr_handler *handler,
+			  struct user_namespace *mnt_userns,
 			  struct dentry *dentry, struct inode *inode,
 			  const char *name, const void *value,
 			  size_t size, int flags)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index d3ea0c57b075..ac6472a82567 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1133,6 +1133,7 @@ static int ecryptfs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int ecryptfs_xattr_set(const struct xattr_handler *handler,
+			      struct user_namespace *mnt_userns,
 			      struct dentry *dentry, struct inode *inode,
 			      const char *name, const void *value, size_t size,
 			      int flags)
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index cf4c77f8dd08..9031f7df2d48 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -223,7 +223,8 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	umode_t mode = inode->i_mode;
 
 	if (type == ACL_TYPE_ACCESS && acl) {
-		error = posix_acl_update_mode(inode, &mode, &acl);
+		error = posix_acl_update_mode(&init_user_ns, inode, &mode,
+					      &acl);
 		if (error)
 			return error;
 		update_mode = 1;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 06c0cf28c1a0..9de813635d8d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1691,7 +1691,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 	}
 	setattr_copy(&init_user_ns, inode, iattr);
 	if (iattr->ia_valid & ATTR_MODE)
-		error = posix_acl_chmod(inode, inode->i_mode);
+		error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 	mark_inode_dirty(inode);
 
 	return error;
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 9a682e440acb..ebade1f52451 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -19,6 +19,7 @@ ext2_xattr_security_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_security_set(const struct xattr_handler *handler,
+			struct user_namespace *mnt_userns,
 			struct dentry *unused, struct inode *inode,
 			const char *name, const void *value,
 			size_t size, int flags)
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 49add1107850..18a87d5dd1ab 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -26,6 +26,7 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_trusted_set(const struct xattr_handler *handler,
+		       struct user_namespace *mnt_userns,
 		       struct dentry *unused, struct inode *inode,
 		       const char *name, const void *value,
 		       size_t size, int flags)
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index c243a3b4d69d..58092449f8ff 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -30,6 +30,7 @@ ext2_xattr_user_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_user_set(const struct xattr_handler *handler,
+		    struct user_namespace *mnt_userns,
 		    struct dentry *unused, struct inode *inode,
 		    const char *name, const void *value,
 		    size_t size, int flags)
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 68aaed48315f..7b0fb66bc04d 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -245,7 +245,8 @@ retry:
 	ext4_fc_start_update(inode);
 
 	if ((type == ACL_TYPE_ACCESS) && acl) {
-		error = posix_acl_update_mode(inode, &mode, &acl);
+		error = posix_acl_update_mode(&init_user_ns, inode, &mode,
+					      &acl);
 		if (error)
 			goto out_stop;
 		if (mode != inode->i_mode)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8edfa3e226e6..24ea5851e90a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5524,7 +5524,7 @@ out_mmap_sem:
 		ext4_orphan_del(NULL, inode);
 
 	if (!error && (ia_valid & ATTR_MODE))
-		rc = posix_acl_chmod(inode, inode->i_mode);
+		rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 
 err_out:
 	if  (error)
diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c
index 8cfa74a56361..c78df5790377 100644
--- a/fs/ext4/xattr_hurd.c
+++ b/fs/ext4/xattr_hurd.c
@@ -32,6 +32,7 @@ ext4_xattr_hurd_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_hurd_set(const struct xattr_handler *handler,
+		    struct user_namespace *mnt_userns,
 		    struct dentry *unused, struct inode *inode,
 		    const char *name, const void *value,
 		    size_t size, int flags)
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 197a9d8a15ef..8213f66f7b2d 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -23,6 +23,7 @@ ext4_xattr_security_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_security_set(const struct xattr_handler *handler,
+			struct user_namespace *mnt_userns,
 			struct dentry *unused, struct inode *inode,
 			const char *name, const void *value,
 			size_t size, int flags)
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index e9389e5d75c3..7c21ffb26d25 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -30,6 +30,7 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_trusted_set(const struct xattr_handler *handler,
+		       struct user_namespace *mnt_userns,
 		       struct dentry *unused, struct inode *inode,
 		       const char *name, const void *value,
 		       size_t size, int flags)
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index d4546184b34b..2fe7ff0a479c 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -31,6 +31,7 @@ ext4_xattr_user_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_user_set(const struct xattr_handler *handler,
+		    struct user_namespace *mnt_userns,
 		    struct dentry *unused, struct inode *inode,
 		    const char *name, const void *value,
 		    size_t size, int flags)
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 1e5e9b1136ee..6a95bf28f602 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -213,7 +213,8 @@ static int __f2fs_set_acl(struct inode *inode, int type,
 	case ACL_TYPE_ACCESS:
 		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
 		if (acl && !ipage) {
-			error = posix_acl_update_mode(inode, &mode, &acl);
+			error = posix_acl_update_mode(&init_user_ns, inode,
+						      &mode, &acl);
 			if (error)
 				return error;
 			set_acl_inode(inode, mode);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 90d7b89176de..6ccdfe0606d9 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -831,8 +831,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
 }
 
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
-static void __setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
-			   const struct iattr *attr)
+static void __setattr_copy(struct user_namespace *mnt_userns,
+			   struct inode *inode, const struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 
@@ -950,7 +950,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 	__setattr_copy(&init_user_ns, inode, attr);
 
 	if (attr->ia_valid & ATTR_MODE) {
-		err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode));
+		err = posix_acl_chmod(&init_user_ns, inode,
+				      f2fs_get_inode_mode(inode));
 		if (err || is_inode_flag_set(inode, FI_ACL_MODE)) {
 			inode->i_mode = F2FS_I(inode)->i_acl_mode;
 			clear_inode_flag(inode, FI_ACL_MODE);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index d772bf13a814..10081bf74324 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -64,6 +64,7 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
 }
 
 static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
+		struct user_namespace *mnt_userns,
 		struct dentry *unused, struct inode *inode,
 		const char *name, const void *value,
 		size_t size, int flags)
@@ -107,6 +108,7 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
 }
 
 static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
+		struct user_namespace *mnt_userns,
 		struct dentry *unused, struct inode *inode,
 		const char *name, const void *value,
 		size_t size, int flags)
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index cdea18de94f7..1a7d7ace54e1 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -188,6 +188,7 @@ static int fuse_xattr_get(const struct xattr_handler *handler,
 }
 
 static int fuse_xattr_set(const struct xattr_handler *handler,
+			  struct user_namespace *mnt_userns,
 			  struct dentry *dentry, struct inode *inode,
 			  const char *name, const void *value, size_t size,
 			  int flags)
@@ -214,6 +215,7 @@ static int no_xattr_get(const struct xattr_handler *handler,
 }
 
 static int no_xattr_set(const struct xattr_handler *handler,
+			struct user_namespace *mnt_userns,
 			struct dentry *dentry, struct inode *nodee,
 			const char *name, const void *value,
 			size_t size, int flags)
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 2e939f5fe751..ce88ef29eef0 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -130,7 +130,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 
 	mode = inode->i_mode;
 	if (type == ACL_TYPE_ACCESS && acl) {
-		ret = posix_acl_update_mode(inode, &mode, &acl);
+		ret = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl);
 		if (ret)
 			goto unlock;
 	}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 59c25181d108..728405d15a05 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1993,7 +1993,8 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	else {
 		error = gfs2_setattr_simple(inode, attr);
 		if (!error && attr->ia_valid & ATTR_MODE)
-			error = posix_acl_chmod(inode, inode->i_mode);
+			error = posix_acl_chmod(&init_user_ns, inode,
+						inode->i_mode);
 	}
 
 error:
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 9d7667bc4292..13969a813410 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1214,6 +1214,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
 }
 
 static int gfs2_xattr_set(const struct xattr_handler *handler,
+			  struct user_namespace *mnt_userns,
 			  struct dentry *unused, struct inode *inode,
 			  const char *name, const void *value,
 			  size_t size, int flags)
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index 74fa62643136..2bd54efaf416 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -121,6 +121,7 @@ static int hfs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int hfs_xattr_set(const struct xattr_handler *handler,
+			 struct user_namespace *mnt_userns,
 			 struct dentry *unused, struct inode *inode,
 			 const char *name, const void *value, size_t size,
 			 int flags)
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index bb0b27d88e50..4d169c5a2673 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -858,6 +858,7 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
+				struct user_namespace *mnt_userns,
 				struct dentry *unused, struct inode *inode,
 				const char *name, const void *buffer,
 				size_t size, int flags)
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index cfbe6a3bfb1e..c1c7a16cbf21 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -23,6 +23,7 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_security_setxattr(const struct xattr_handler *handler,
+				     struct user_namespace *mnt_userns,
 				     struct dentry *unused, struct inode *inode,
 				     const char *name, const void *buffer,
 				     size_t size, int flags)
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index fbad91e1dada..e150372ec564 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -22,6 +22,7 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_trusted_setxattr(const struct xattr_handler *handler,
+				    struct user_namespace *mnt_userns,
 				    struct dentry *unused, struct inode *inode,
 				    const char *name, const void *buffer,
 				    size_t size, int flags)
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 74d19faf255e..a6b60b153916 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -22,6 +22,7 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_user_setxattr(const struct xattr_handler *handler,
+				 struct user_namespace *mnt_userns,
 				 struct dentry *unused, struct inode *inode,
 				 const char *name, const void *buffer,
 				 size_t size, int flags)
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 093ffbd82395..5f27ac593479 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -236,7 +236,8 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 		if (acl) {
 			umode_t mode;
 
-			rc = posix_acl_update_mode(inode, &mode, &acl);
+			rc = posix_acl_update_mode(&init_user_ns, inode, &mode,
+						   &acl);
 			if (rc)
 				return rc;
 			if (inode->i_mode != mode) {
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 67993808f4da..ee9f51bab4c6 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -201,7 +201,7 @@ int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
 
 	rc = jffs2_do_setattr(inode, iattr);
 	if (!rc && (iattr->ia_valid & ATTR_MODE))
-		rc = posix_acl_chmod(inode, inode->i_mode);
+		rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 
 	return rc;
 }
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index c2332e30f218..aef5522551db 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -57,6 +57,7 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_security_setxattr(const struct xattr_handler *handler,
+				   struct user_namespace *mnt_userns,
 				   struct dentry *unused, struct inode *inode,
 				   const char *name, const void *buffer,
 				   size_t size, int flags)
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 5d6030826c52..cc3f24883e7d 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -25,6 +25,7 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
+				  struct user_namespace *mnt_userns,
 				  struct dentry *unused, struct inode *inode,
 				  const char *name, const void *buffer,
 				  size_t size, int flags)
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 9d027b4abcf9..fb945977c013 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -25,6 +25,7 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_user_setxattr(const struct xattr_handler *handler,
+			       struct user_namespace *mnt_userns,
 			       struct dentry *unused, struct inode *inode,
 			       const char *name, const void *buffer,
 			       size_t size, int flags)
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 92cc0ac2d1fc..cf79a34bfada 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -101,7 +101,7 @@ int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	tid = txBegin(inode->i_sb, 0);
 	mutex_lock(&JFS_IP(inode)->commit_mutex);
 	if (type == ACL_TYPE_ACCESS && acl) {
-		rc = posix_acl_update_mode(inode, &mode, &acl);
+		rc = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl);
 		if (rc)
 			goto end_tx;
 		if (mode != inode->i_mode)
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index ff49876e9c9b..61c3b0c1fbf6 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -122,7 +122,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	mark_inode_dirty(inode);
 
 	if (iattr->ia_valid & ATTR_MODE)
-		rc = posix_acl_chmod(inode, inode->i_mode);
+		rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 	return rc;
 }
 
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index db41e7803163..f9273f6901c8 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -932,6 +932,7 @@ static int jfs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int jfs_xattr_set(const struct xattr_handler *handler,
+			 struct user_namespace *mnt_userns,
 			 struct dentry *unused, struct inode *inode,
 			 const char *name, const void *value,
 			 size_t size, int flags)
@@ -950,6 +951,7 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler,
 }
 
 static int jfs_xattr_set_os2(const struct xattr_handler *handler,
+			     struct user_namespace *mnt_userns,
 			     struct dentry *unused, struct inode *inode,
 			     const char *name, const void *value,
 			     size_t size, int flags)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 86bd4c593b78..7e44052b42e1 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -319,6 +319,7 @@ static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
+				struct user_namespace *mnt_userns,
 				struct dentry *unused, struct inode *inode,
 				const char *suffix, const void *value,
 				size_t size, int flags)
@@ -385,6 +386,7 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
 }
 
 static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
+				     struct user_namespace *mnt_userns,
 				     struct dentry *unused, struct inode *inode,
 				     const char *suffix, const void *value,
 				     size_t size, int flags)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2f4679a62712..a07530cf673d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7491,6 +7491,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
 
 static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
+				   struct user_namespace *mnt_userns,
 				   struct dentry *unused, struct inode *inode,
 				   const char *key, const void *buf,
 				   size_t buflen, int flags)
@@ -7513,6 +7514,7 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 
 static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
+				     struct user_namespace *mnt_userns,
 				     struct dentry *unused, struct inode *inode,
 				     const char *key, const void *buf,
 				     size_t buflen, int flags)
@@ -7563,6 +7565,7 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
 
 #ifdef CONFIG_NFS_V4_2
 static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler,
+				    struct user_namespace *mnt_userns,
 				    struct dentry *unused, struct inode *inode,
 				    const char *key, const void *buf,
 				    size_t buflen, int flags)
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index b0f66604532a..b83f222558e3 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -113,10 +113,12 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
 
 	fh_lock(fh);
 
-	error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
+	error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+			      argp->acl_access);
 	if (error)
 		goto out_drop_lock;
-	error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
+	error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+			      argp->acl_default);
 	if (error)
 		goto out_drop_lock;
 
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 7c30876a31a1..f18ec7e8094d 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -103,10 +103,12 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
 
 	fh_lock(fh);
 
-	error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
+	error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+			      argp->acl_access);
 	if (error)
 		goto out_drop_lock;
-	error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
+	error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+			      argp->acl_default);
 
 out_drop_lock:
 	fh_unlock(fh);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 71292a0d6f09..eaa3a0cf38f1 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -781,12 +781,13 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
 
 	fh_lock(fhp);
 
-	host_error = set_posix_acl(inode, ACL_TYPE_ACCESS, pacl);
+	host_error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, pacl);
 	if (host_error < 0)
 		goto out_drop_lock;
 
 	if (S_ISDIR(inode->i_mode)) {
-		host_error = set_posix_acl(inode, ACL_TYPE_DEFAULT, dpacl);
+		host_error = set_posix_acl(&init_user_ns, inode,
+					   ACL_TYPE_DEFAULT, dpacl);
 	}
 
 out_drop_lock:
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 7b07f5df3a29..990756cee4bd 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -274,7 +274,8 @@ int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	if (type == ACL_TYPE_ACCESS && acl) {
 		umode_t mode;
 
-		status = posix_acl_update_mode(inode, &mode, &acl);
+		status = posix_acl_update_mode(&init_user_ns, inode, &mode,
+					       &acl);
 		if (status)
 			goto unlock;
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 9ccd19d8f7b1..36ae47a4aef6 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7249,6 +7249,7 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
+				    struct user_namespace *mnt_userns,
 				    struct dentry *unused, struct inode *inode,
 				    const char *name, const void *value,
 				    size_t size, int flags)
@@ -7321,6 +7322,7 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
+				   struct user_namespace *mnt_userns,
 				   struct dentry *unused, struct inode *inode,
 				   const char *name, const void *value,
 				   size_t size, int flags)
@@ -7351,6 +7353,7 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
+				struct user_namespace *mnt_userns,
 				struct dentry *unused, struct inode *inode,
 				const char *name, const void *value,
 				size_t size, int flags)
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index a25e6c890975..628921952d16 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -132,7 +132,8 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 		 * and "mode" to the new desired value. It is up to
 		 * us to propagate the new mode back to the server...
 		 */
-		error = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
+		error = posix_acl_update_mode(&init_user_ns, inode,
+					      &iattr.ia_mode, &acl);
 		if (error) {
 			gossip_err("%s: posix_acl_update_mode err: %d\n",
 				   __func__,
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 8ac9491ceb9a..563fe9ab8eb2 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -861,7 +861,7 @@ again:
 
 	if (iattr->ia_valid & ATTR_MODE)
 		/* change mod on a file that has ACLs */
-		ret = posix_acl_chmod(inode, inode->i_mode);
+		ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 
 	ret = 0;
 out:
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index bdc285aea360..9a5b757fbd2f 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -526,6 +526,7 @@ out_unlock:
 }
 
 static int orangefs_xattr_set_default(const struct xattr_handler *handler,
+				      struct user_namespace *mnt_userns,
 				      struct dentry *unused,
 				      struct inode *inode,
 				      const char *name,
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 39b2e9aa0e5b..e24c995c5fd4 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -980,6 +980,7 @@ ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
 
 static int __maybe_unused
 ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
+			struct user_namespace *mnt_userns,
 			struct dentry *dentry, struct inode *inode,
 			const char *name, const void *value,
 			size_t size, int flags)
@@ -1044,6 +1045,7 @@ static int ovl_own_xattr_get(const struct xattr_handler *handler,
 }
 
 static int ovl_own_xattr_set(const struct xattr_handler *handler,
+			     struct user_namespace *mnt_userns,
 			     struct dentry *dentry, struct inode *inode,
 			     const char *name, const void *value,
 			     size_t size, int flags)
@@ -1059,6 +1061,7 @@ static int ovl_other_xattr_get(const struct xattr_handler *handler,
 }
 
 static int ovl_other_xattr_set(const struct xattr_handler *handler,
+			       struct user_namespace *mnt_userns,
 			       struct dentry *dentry, struct inode *inode,
 			       const char *name, const void *value,
 			       size_t size, int flags)
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 9ce8214bfdac..d31b60f5d40d 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -558,8 +558,22 @@ __posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
 }
 EXPORT_SYMBOL(__posix_acl_chmod);
 
+/**
+ * posix_acl_chmod - chmod a posix acl
+ *
+ * @mnt_userns:	user namespace of the mount @inode was found from
+ * @inode:	inode to check permissions on
+ * @mode:	the new mode of @inode
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ */
 int
-posix_acl_chmod(struct inode *inode, umode_t mode)
+ posix_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode,
+		    umode_t mode)
 {
 	struct posix_acl *acl;
 	int ret = 0;
@@ -638,9 +652,10 @@ EXPORT_SYMBOL_GPL(posix_acl_create);
 
 /**
  * posix_acl_update_mode  -  update mode in set_acl
- * @inode: target inode
- * @mode_p: mode (pointer) for update
- * @acl: acl pointer
+ * @mnt_userns:	user namespace of the mount @inode was found from
+ * @inode:	target inode
+ * @mode_p:	mode (pointer) for update
+ * @acl:	acl pointer
  *
  * Update the file mode when setting an ACL: compute the new file permission
  * bits based on the ACL.  In addition, if the ACL is equivalent to the new
@@ -649,9 +664,16 @@ EXPORT_SYMBOL_GPL(posix_acl_create);
  * As with chmod, clear the setgid bit if the caller is not in the owning group
  * or capable of CAP_FSETID (see inode_change_ok).
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
  * Called from set_acl inode operations.
  */
-int posix_acl_update_mode(struct inode *inode, umode_t *mode_p,
+int posix_acl_update_mode(struct user_namespace *mnt_userns,
+			  struct inode *inode, umode_t *mode_p,
 			  struct posix_acl **acl)
 {
 	umode_t mode = inode->i_mode;
@@ -662,8 +684,8 @@ int posix_acl_update_mode(struct inode *inode, umode_t *mode_p,
 		return error;
 	if (error == 0)
 		*acl = NULL;
-	if (!in_group_p(inode->i_gid) &&
-	    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+	if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
+	    !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
 		mode &= ~S_ISGID;
 	*mode_p = mode;
 	return 0;
@@ -675,7 +697,8 @@ EXPORT_SYMBOL(posix_acl_update_mode);
  */
 static void posix_acl_fix_xattr_userns(
 	struct user_namespace *to, struct user_namespace *from,
-	void *value, size_t size)
+	struct user_namespace *mnt_userns,
+	void *value, size_t size, bool from_user)
 {
 	struct posix_acl_xattr_header *header = value;
 	struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
@@ -700,10 +723,18 @@ static void posix_acl_fix_xattr_userns(
 		switch(le16_to_cpu(entry->e_tag)) {
 		case ACL_USER:
 			uid = make_kuid(from, le32_to_cpu(entry->e_id));
+			if (from_user)
+				uid = kuid_from_mnt(mnt_userns, uid);
+			else
+				uid = kuid_into_mnt(mnt_userns, uid);
 			entry->e_id = cpu_to_le32(from_kuid(to, uid));
 			break;
 		case ACL_GROUP:
 			gid = make_kgid(from, le32_to_cpu(entry->e_id));
+			if (from_user)
+				gid = kgid_from_mnt(mnt_userns, gid);
+			else
+				gid = kgid_into_mnt(mnt_userns, gid);
 			entry->e_id = cpu_to_le32(from_kgid(to, gid));
 			break;
 		default:
@@ -712,20 +743,24 @@ static void posix_acl_fix_xattr_userns(
 	}
 }
 
-void posix_acl_fix_xattr_from_user(void *value, size_t size)
+void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
+				   void *value, size_t size)
 {
 	struct user_namespace *user_ns = current_user_ns();
-	if (user_ns == &init_user_ns)
+	if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
 		return;
-	posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
+	posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value,
+				   size, true);
 }
 
-void posix_acl_fix_xattr_to_user(void *value, size_t size)
+void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
+				 void *value, size_t size)
 {
 	struct user_namespace *user_ns = current_user_ns();
-	if (user_ns == &init_user_ns)
+	if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
 		return;
-	posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
+	posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value,
+				   size, false);
 }
 
 /*
@@ -865,7 +900,8 @@ posix_acl_xattr_get(const struct xattr_handler *handler,
 }
 
 int
-set_posix_acl(struct inode *inode, int type, struct posix_acl *acl)
+set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode,
+	      int type, struct posix_acl *acl)
 {
 	if (!IS_POSIXACL(inode))
 		return -EOPNOTSUPP;
@@ -874,7 +910,7 @@ set_posix_acl(struct inode *inode, int type, struct posix_acl *acl)
 
 	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
 		return acl ? -EACCES : 0;
-	if (!inode_owner_or_capable(&init_user_ns, inode))
+	if (!inode_owner_or_capable(mnt_userns, inode))
 		return -EPERM;
 
 	if (acl) {
@@ -888,9 +924,10 @@ EXPORT_SYMBOL(set_posix_acl);
 
 static int
 posix_acl_xattr_set(const struct xattr_handler *handler,
-		    struct dentry *unused, struct inode *inode,
-		    const char *name, const void *value,
-		    size_t size, int flags)
+			   struct user_namespace *mnt_userns,
+			   struct dentry *unused, struct inode *inode,
+			   const char *name, const void *value, size_t size,
+			   int flags)
 {
 	struct posix_acl *acl = NULL;
 	int ret;
@@ -900,7 +937,7 @@ posix_acl_xattr_set(const struct xattr_handler *handler,
 		if (IS_ERR(acl))
 			return PTR_ERR(acl);
 	}
-	ret = set_posix_acl(inode, handler->flags, acl);
+	ret = set_posix_acl(mnt_userns, inode, handler->flags, acl);
 	posix_acl_release(acl);
 	return ret;
 }
@@ -934,7 +971,7 @@ int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	int error;
 
 	if (type == ACL_TYPE_ACCESS) {
-		error = posix_acl_update_mode(inode,
+		error = posix_acl_update_mode(&init_user_ns, inode,
 				&inode->i_mode, &acl);
 		if (error)
 			return error;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index ccd40df6eb45..4bf976bc7bad 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -40,7 +40,8 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	reiserfs_write_unlock(inode->i_sb);
 	if (error == 0) {
 		if (type == ACL_TYPE_ACCESS && acl) {
-			error = posix_acl_update_mode(inode, &mode, &acl);
+			error = posix_acl_update_mode(&init_user_ns, inode,
+						      &mode, &acl);
 			if (error)
 				goto unlock;
 			update_mode = 1;
@@ -399,5 +400,5 @@ int reiserfs_acl_chmod(struct inode *inode)
 	    !reiserfs_posixacl(inode->i_sb))
 		return 0;
 
-	return posix_acl_chmod(inode, inode->i_mode);
+	return posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 }
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 20be9a0e5870..8965c8e5e172 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -21,7 +21,8 @@ security_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-security_set(const struct xattr_handler *handler, struct dentry *unused,
+security_set(const struct xattr_handler *handler,
+	     struct user_namespace *mnt_userns, struct dentry *unused,
 	     struct inode *inode, const char *name, const void *buffer,
 	     size_t size, int flags)
 {
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5ed48da3d02b..d853cea2afcd 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -20,7 +20,8 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-trusted_set(const struct xattr_handler *handler, struct dentry *unused,
+trusted_set(const struct xattr_handler *handler,
+	    struct user_namespace *mnt_userns, struct dentry *unused,
 	    struct inode *inode, const char *name, const void *buffer,
 	    size_t size, int flags)
 {
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index a573ca45bacc..65d9cd10a5ea 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -18,7 +18,8 @@ user_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-user_set(const struct xattr_handler *handler, struct dentry *unused,
+user_set(const struct xattr_handler *handler, struct user_namespace *mnt_userns,
+	 struct dentry *unused,
 	 struct inode *inode, const char *name, const void *buffer,
 	 size_t size, int flags)
 {
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index a0b9b349efe6..8f4135c22ab6 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -681,6 +681,7 @@ static int xattr_get(const struct xattr_handler *handler,
 }
 
 static int xattr_set(const struct xattr_handler *handler,
+			   struct user_namespace *mnt_userns,
 			   struct dentry *dentry, struct inode *inode,
 			   const char *name, const void *value,
 			   size_t size, int flags)
diff --git a/fs/xattr.c b/fs/xattr.c
index c669922e1bde..d777025121e0 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -175,7 +175,8 @@ __vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
 		return -EOPNOTSUPP;
 	if (size == 0)
 		value = "";  /* empty EA, do not remove */
-	return handler->set(handler, dentry, inode, name, value, size, flags);
+	return handler->set(handler, &init_user_ns, dentry, inode, name, value,
+			    size, flags);
 }
 EXPORT_SYMBOL(__vfs_setxattr);
 
@@ -281,7 +282,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 	int error;
 
 	if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
-		error = cap_convert_nscap(dentry, &value, size);
+		error = cap_convert_nscap(&init_user_ns, dentry, &value, size);
 		if (error < 0)
 			return error;
 		size = error;
@@ -450,7 +451,8 @@ __vfs_removexattr(struct dentry *dentry, const char *name)
 		return PTR_ERR(handler);
 	if (!handler->set)
 		return -EOPNOTSUPP;
-	return handler->set(handler, dentry, inode, name, NULL, 0, XATTR_REPLACE);
+	return handler->set(handler, &init_user_ns, dentry, inode, name, NULL,
+			    0, XATTR_REPLACE);
 }
 EXPORT_SYMBOL(__vfs_removexattr);
 
@@ -548,7 +550,8 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 		}
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
-			posix_acl_fix_xattr_from_user(kvalue, size);
+			posix_acl_fix_xattr_from_user(&init_user_ns, kvalue,
+						      size);
 	}
 
 	error = vfs_setxattr(d, kname, kvalue, size, flags);
@@ -642,7 +645,8 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
 	if (error > 0) {
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
-			posix_acl_fix_xattr_to_user(kvalue, error);
+			posix_acl_fix_xattr_to_user(&init_user_ns, kvalue,
+						    error);
 		if (size && copy_to_user(value, kvalue, error))
 			error = -EFAULT;
 	} else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 779cb73b3d00..368351298bd5 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -252,7 +252,8 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 		return error;
 
 	if (type == ACL_TYPE_ACCESS) {
-		error = posix_acl_update_mode(inode, &mode, &acl);
+		error = posix_acl_update_mode(&init_user_ns, inode, &mode,
+					      &acl);
 		if (error)
 			return error;
 		set_mode = true;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 08a478d25122..26d22edef741 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -807,7 +807,7 @@ xfs_setattr_nonsize(
 	 * 	     Posix ACL code seems to care about this issue either.
 	 */
 	if (mask & ATTR_MODE) {
-		error = posix_acl_chmod(inode, inode->i_mode);
+		error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 		if (error)
 			return error;
 	}
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index bca48b308c02..12be32f66dc1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -38,9 +38,10 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused,
-		struct inode *inode, const char *name, const void *value,
-		size_t size, int flags)
+xfs_xattr_set(const struct xattr_handler *handler,
+	      struct user_namespace *mnt_userns, struct dentry *unused,
+	      struct inode *inode, const char *name, const void *value,
+	      size_t size, int flags)
 {
 	struct xfs_da_args	args = {
 		.dp		= XFS_I(inode),
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 62bfa3ed84d7..da21ef118b04 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -273,6 +273,7 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
 
-extern int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size);
+int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
+		      const void **ivalue, size_t size);
 
 #endif /* !_LINUX_CAPABILITY_H */
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 85fb4c0c650a..6dcd8b8f6ab5 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -69,13 +69,15 @@ extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
 extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t);
 
 extern struct posix_acl *get_posix_acl(struct inode *, int);
-extern int set_posix_acl(struct inode *, int, struct posix_acl *);
+extern int set_posix_acl(struct user_namespace *, struct inode *, int,
+			 struct posix_acl *);
 
 #ifdef CONFIG_FS_POSIX_ACL
-extern int posix_acl_chmod(struct inode *, umode_t);
+int posix_acl_chmod(struct user_namespace *, struct inode *, umode_t);
 extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
 		struct posix_acl **);
-extern int posix_acl_update_mode(struct inode *, umode_t *, struct posix_acl **);
+int posix_acl_update_mode(struct user_namespace *, struct inode *, umode_t *,
+			  struct posix_acl **);
 
 extern int simple_set_acl(struct inode *, struct posix_acl *, int);
 extern int simple_acl_create(struct inode *, struct inode *);
@@ -95,7 +97,8 @@ static inline void cache_no_acl(struct inode *inode)
 	inode->i_default_acl = NULL;
 }
 #else
-static inline int posix_acl_chmod(struct inode *inode, umode_t mode)
+static inline int posix_acl_chmod(struct user_namespace *mnt_userns,
+				  struct inode *inode, umode_t mode)
 {
 	return 0;
 }
diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index 2387709991b5..060e8d203181 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -33,13 +33,17 @@ posix_acl_xattr_count(size_t size)
 }
 
 #ifdef CONFIG_FS_POSIX_ACL
-void posix_acl_fix_xattr_from_user(void *value, size_t size);
-void posix_acl_fix_xattr_to_user(void *value, size_t size);
+void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
+				   void *value, size_t size);
+void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
+				 void *value, size_t size);
 #else
-static inline void posix_acl_fix_xattr_from_user(void *value, size_t size)
+static inline void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
+						 void *value, size_t size)
 {
 }
-static inline void posix_acl_fix_xattr_to_user(void *value, size_t size)
+static inline void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
+					       void *value, size_t size)
 {
 }
 #endif
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 10b4dc2709f0..260c9bcb0edb 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -34,7 +34,8 @@ struct xattr_handler {
 	int (*get)(const struct xattr_handler *, struct dentry *dentry,
 		   struct inode *inode, const char *name, void *buffer,
 		   size_t size);
-	int (*set)(const struct xattr_handler *, struct dentry *dentry,
+	int (*set)(const struct xattr_handler *,
+		   struct user_namespace *mnt_userns, struct dentry *dentry,
 		   struct inode *inode, const char *name, const void *buffer,
 		   size_t size, int flags);
 };
diff --git a/mm/shmem.c b/mm/shmem.c
index 1cb451e131ec..23b8e9c15a42 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1143,7 +1143,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 
 	setattr_copy(&init_user_ns, inode, attr);
 	if (attr->ia_valid & ATTR_MODE)
-		error = posix_acl_chmod(inode, inode->i_mode);
+		error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 	return error;
 }
 
@@ -3273,6 +3273,7 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int shmem_xattr_handler_set(const struct xattr_handler *handler,
+				   struct user_namespace *mnt_userns,
 				   struct dentry *unused, struct inode *inode,
 				   const char *name, const void *value,
 				   size_t size, int flags)
diff --git a/net/socket.c b/net/socket.c
index 33e8b6c4e1d3..c76703c6f480 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -334,6 +334,7 @@ static const struct xattr_handler sockfs_xattr_handler = {
 };
 
 static int sockfs_security_xattr_set(const struct xattr_handler *handler,
+				     struct user_namespace *mnt_userns,
 				     struct dentry *dentry, struct inode *inode,
 				     const char *suffix, const void *value,
 				     size_t size, int flags)
diff --git a/security/commoncap.c b/security/commoncap.c
index 88ee345f7bfa..c3fd9b86ea9a 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -450,16 +450,33 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 	return size;
 }
 
+/**
+ * rootid_from_xattr - translate root uid of vfs caps
+ *
+ * @value:	vfs caps value which may be modified by this function
+ * @size:	size of @ivalue
+ * @task_ns:	user namespace of the caller
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ */
 static kuid_t rootid_from_xattr(const void *value, size_t size,
-				struct user_namespace *task_ns)
+				struct user_namespace *task_ns,
+				struct user_namespace *mnt_userns)
 {
 	const struct vfs_ns_cap_data *nscap = value;
+	kuid_t rootkid;
 	uid_t rootid = 0;
 
 	if (size == XATTR_CAPS_SZ_3)
 		rootid = le32_to_cpu(nscap->rootid);
 
-	return make_kuid(task_ns, rootid);
+	rootkid = make_kuid(task_ns, rootid);
+	return kuid_from_mnt(mnt_userns, rootkid);
 }
 
 static bool validheader(size_t size, const struct vfs_cap_data *cap)
@@ -467,13 +484,27 @@ static bool validheader(size_t size, const struct vfs_cap_data *cap)
 	return is_v2header(size, cap) || is_v3header(size, cap);
 }
 
-/*
+/**
+ * cap_convert_nscap - check vfs caps
+ *
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dentry:	used to retrieve inode to check permissions on
+ * @ivalue:	vfs caps value which may be modified by this function
+ * @size:	size of @ivalue
+ *
  * User requested a write of security.capability.  If needed, update the
  * xattr to change from v2 to v3, or to fixup the v3 rootid.
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
  * If all is ok, we return the new size, on error return < 0.
  */
-int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size)
+int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
+		      const void **ivalue, size_t size)
 {
 	struct vfs_ns_cap_data *nscap;
 	uid_t nsrootid;
@@ -489,14 +520,14 @@ int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size)
 		return -EINVAL;
 	if (!validheader(size, cap))
 		return -EINVAL;
-	if (!capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_SETFCAP))
+	if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
 		return -EPERM;
-	if (size == XATTR_CAPS_SZ_2)
+	if (size == XATTR_CAPS_SZ_2 && (mnt_userns == &init_user_ns))
 		if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
 			/* user is privileged, just write the v2 */
 			return size;
 
-	rootid = rootid_from_xattr(*ivalue, size, task_ns);
+	rootid = rootid_from_xattr(*ivalue, size, task_ns, mnt_userns);
 	if (!uid_valid(rootid))
 		return -EINVAL;
 
-- 
cgit v1.2.3


From c7c7a1a18af4c3bb7749d33e3df3acdf0a95bbb5 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@tycho.pizza>
Date: Thu, 21 Jan 2021 14:19:28 +0100
Subject: xattr: handle idmapped mounts

When interacting with extended attributes the vfs verifies that the
caller is privileged over the inode with which the extended attribute is
associated. For posix access and posix default extended attributes a uid
or gid can be stored on-disk. Let the functions handle posix extended
attributes on idmapped mounts. If the inode is accessed through an
idmapped mount we need to map it according to the mount's user
namespace. Afterwards the checks are identical to non-idmapped mounts.
This has no effect for e.g. security xattrs since they don't store uids
or gids and don't perform permission checks on them like posix acls do.

Link: https://lore.kernel.org/r/20210121131959.646623-10-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Tycho Andersen <tycho@tycho.pizza>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/cachefiles/xattr.c                 |  29 ++++----
 fs/ecryptfs/crypto.c                  |   4 +-
 fs/ecryptfs/inode.c                   |   5 +-
 fs/ecryptfs/mmap.c                    |   4 +-
 fs/nfsd/vfs.c                         |  14 ++--
 fs/overlayfs/copy_up.c                |  14 ++--
 fs/overlayfs/dir.c                    |   2 +-
 fs/overlayfs/inode.c                  |   9 +--
 fs/overlayfs/overlayfs.h              |   6 +-
 fs/overlayfs/super.c                  |   6 +-
 fs/xattr.c                            | 120 +++++++++++++++++++---------------
 include/linux/xattr.h                 |  27 +++++---
 security/apparmor/domain.c            |   4 +-
 security/commoncap.c                  |   6 +-
 security/integrity/evm/evm_crypto.c   |  11 ++--
 security/integrity/evm/evm_main.c     |   4 +-
 security/integrity/ima/ima_appraise.c |   8 +--
 security/selinux/hooks.c              |   3 +-
 security/smack/smack_lsm.c            |   8 ++-
 19 files changed, 160 insertions(+), 124 deletions(-)

(limited to 'include')

diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 72e42438f3d7..a591b5e09637 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -39,8 +39,8 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
 	_enter("%p{%s}", object, type);
 
 	/* attempt to install a type label directly */
-	ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
-			   XATTR_CREATE);
+	ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, type,
+			   2, XATTR_CREATE);
 	if (ret == 0) {
 		_debug("SET"); /* we succeeded */
 		goto error;
@@ -54,7 +54,8 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
 	}
 
 	/* read the current type label */
-	ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
+	ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, xtype,
+			   3);
 	if (ret < 0) {
 		if (ret == -ERANGE)
 			goto bad_type_length;
@@ -110,9 +111,8 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
 	_debug("SET #%u", auxdata->len);
 
 	clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags);
-	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
-			   &auxdata->type, auxdata->len,
-			   XATTR_CREATE);
+	ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
+			   &auxdata->type, auxdata->len, XATTR_CREATE);
 	if (ret < 0 && ret != -ENOMEM)
 		cachefiles_io_error_obj(
 			object,
@@ -140,9 +140,8 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
 	_debug("SET #%u", auxdata->len);
 
 	clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags);
-	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
-			   &auxdata->type, auxdata->len,
-			   XATTR_REPLACE);
+	ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
+			   &auxdata->type, auxdata->len, XATTR_REPLACE);
 	if (ret < 0 && ret != -ENOMEM)
 		cachefiles_io_error_obj(
 			object,
@@ -171,7 +170,7 @@ int cachefiles_check_auxdata(struct cachefiles_object *object)
 	if (!auxbuf)
 		return -ENOMEM;
 
-	xlen = vfs_getxattr(dentry, cachefiles_xattr_cache,
+	xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
 			    &auxbuf->type, 512 + 1);
 	ret = -ESTALE;
 	if (xlen < 1 ||
@@ -213,7 +212,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
 	}
 
 	/* read the current type label */
-	ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
+	ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
 			   &auxbuf->type, 512 + 1);
 	if (ret < 0) {
 		if (ret == -ENODATA)
@@ -270,9 +269,9 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
 		}
 
 		/* update the current label */
-		ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
-				   &auxdata->type, auxdata->len,
-				   XATTR_REPLACE);
+		ret = vfs_setxattr(&init_user_ns, dentry,
+				   cachefiles_xattr_cache, &auxdata->type,
+				   auxdata->len, XATTR_REPLACE);
 		if (ret < 0) {
 			cachefiles_io_error_obj(object,
 						"Can't update xattr on %lu"
@@ -309,7 +308,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
 {
 	int ret;
 
-	ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
+	ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache);
 	if (ret < 0) {
 		if (ret == -ENOENT || ret == -ENODATA)
 			ret = 0;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 0681540c48d9..943e523f4c9d 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1110,8 +1110,8 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
 	}
 
 	inode_lock(lower_inode);
-	rc = __vfs_setxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME,
-			    page_virt, size, 0);
+	rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode,
+			    ECRYPTFS_XATTR_NAME, page_virt, size, 0);
 	if (!rc && ecryptfs_inode)
 		fsstack_copy_attr_all(ecryptfs_inode, lower_inode);
 	inode_unlock(lower_inode);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ac6472a82567..b9ccc4085d46 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1024,7 +1024,8 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode *inode,
 		rc = -EOPNOTSUPP;
 		goto out;
 	}
-	rc = vfs_setxattr(lower_dentry, name, value, size, flags);
+	rc = vfs_setxattr(&init_user_ns, lower_dentry, name, value, size,
+			  flags);
 	if (!rc && inode)
 		fsstack_copy_attr_all(inode, d_inode(lower_dentry));
 out:
@@ -1089,7 +1090,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, struct inode *inode,
 		goto out;
 	}
 	inode_lock(lower_inode);
-	rc = __vfs_removexattr(lower_dentry, name);
+	rc = __vfs_removexattr(&init_user_ns, lower_dentry, name);
 	inode_unlock(lower_inode);
 out:
 	return rc;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 019572c6b39a..2f333a40ff4d 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -426,8 +426,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
 	if (size < 0)
 		size = 8;
 	put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
-	rc = __vfs_setxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME,
-			    xattr_virt, size, 0);
+	rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode,
+			    ECRYPTFS_XATTR_NAME, xattr_virt, size, 0);
 	inode_unlock(lower_inode);
 	if (rc)
 		printk(KERN_ERR "Error whilst attempting to write inode size "
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 1905b39be1c2..37d85046b4d6 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -499,7 +499,8 @@ int nfsd4_is_junction(struct dentry *dentry)
 		return 0;
 	if (!(inode->i_mode & S_ISVTX))
 		return 0;
-	if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0)
+	if (vfs_getxattr(&init_user_ns, dentry, NFSD_JUNCTION_XATTR_NAME,
+			 NULL, 0) <= 0)
 		return 0;
 	return 1;
 }
@@ -2149,7 +2150,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
 
 	inode_lock_shared(inode);
 
-	len = vfs_getxattr(dentry, name, NULL, 0);
+	len = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0);
 
 	/*
 	 * Zero-length attribute, just return.
@@ -2176,7 +2177,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
 		goto out;
 	}
 
-	len = vfs_getxattr(dentry, name, buf, len);
+	len = vfs_getxattr(&init_user_ns, dentry, name, buf, len);
 	if (len <= 0) {
 		kvfree(buf);
 		buf = NULL;
@@ -2283,7 +2284,8 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
 
 	fh_lock(fhp);
 
-	ret = __vfs_removexattr_locked(fhp->fh_dentry, name, NULL);
+	ret = __vfs_removexattr_locked(&init_user_ns, fhp->fh_dentry,
+				       name, NULL);
 
 	fh_unlock(fhp);
 	fh_drop_write(fhp);
@@ -2307,8 +2309,8 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
 		return nfserrno(ret);
 	fh_lock(fhp);
 
-	ret = __vfs_setxattr_locked(fhp->fh_dentry, name, buf, len, flags,
-				    NULL);
+	ret = __vfs_setxattr_locked(&init_user_ns, fhp->fh_dentry, name, buf,
+				    len, flags, NULL);
 
 	fh_unlock(fhp);
 	fh_drop_write(fhp);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 3e9957ae19fa..f81b836c2256 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -85,9 +85,9 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
 		if (ovl_is_private_xattr(sb, name))
 			continue;
 retry:
-		size = vfs_getxattr(old, name, value, value_size);
+		size = vfs_getxattr(&init_user_ns, old, name, value, value_size);
 		if (size == -ERANGE)
-			size = vfs_getxattr(old, name, NULL, 0);
+			size = vfs_getxattr(&init_user_ns, old, name, NULL, 0);
 
 		if (size < 0) {
 			error = size;
@@ -114,7 +114,7 @@ retry:
 			error = 0;
 			continue; /* Discard */
 		}
-		error = vfs_setxattr(new, name, value, size, 0);
+		error = vfs_setxattr(&init_user_ns, new, name, value, size, 0);
 		if (error) {
 			if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
 				break;
@@ -795,7 +795,7 @@ static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value)
 	ssize_t res;
 	char *buf;
 
-	res = vfs_getxattr(dentry, name, NULL, 0);
+	res = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0);
 	if (res == -ENODATA || res == -EOPNOTSUPP)
 		res = 0;
 
@@ -804,7 +804,7 @@ static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value)
 		if (!buf)
 			return -ENOMEM;
 
-		res = vfs_getxattr(dentry, name, buf, res);
+		res = vfs_getxattr(&init_user_ns, dentry, name, buf, res);
 		if (res < 0)
 			kfree(buf);
 		else
@@ -846,8 +846,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
 	 * don't want that to happen for normal copy-up operation.
 	 */
 	if (capability) {
-		err = vfs_setxattr(upperpath.dentry, XATTR_NAME_CAPS,
-				   capability, cap_size, 0);
+		err = vfs_setxattr(&init_user_ns, upperpath.dentry,
+				   XATTR_NAME_CAPS, capability, cap_size, 0);
 		if (err)
 			goto out_free;
 	}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 29840820a46c..d75c96cb18c3 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -449,7 +449,7 @@ static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name,
 	if (err < 0)
 		goto out_free;
 
-	err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE);
+	err = vfs_setxattr(&init_user_ns, upperdentry, name, buffer, size, XATTR_CREATE);
 out_free:
 	kfree(buffer);
 	return err;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 5aa66881dbd7..023fde466e3a 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -352,7 +352,7 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
 		goto out;
 
 	if (!value && !upperdentry) {
-		err = vfs_getxattr(realdentry, name, NULL, 0);
+		err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0);
 		if (err < 0)
 			goto out_drop_write;
 	}
@@ -367,10 +367,11 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
 
 	old_cred = ovl_override_creds(dentry->d_sb);
 	if (value)
-		err = vfs_setxattr(realdentry, name, value, size, flags);
+		err = vfs_setxattr(&init_user_ns, realdentry, name, value, size,
+				   flags);
 	else {
 		WARN_ON(flags != XATTR_REPLACE);
-		err = vfs_removexattr(realdentry, name);
+		err = vfs_removexattr(&init_user_ns, realdentry, name);
 	}
 	revert_creds(old_cred);
 
@@ -392,7 +393,7 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
 		ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry);
 
 	old_cred = ovl_override_creds(dentry->d_sb);
-	res = vfs_getxattr(realdentry, name, value, size);
+	res = vfs_getxattr(&init_user_ns, realdentry, name, value, size);
 	revert_creds(old_cred);
 	return res;
 }
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index b487e48c7fd4..0002834f664a 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -186,7 +186,7 @@ static inline ssize_t ovl_do_getxattr(struct ovl_fs *ofs, struct dentry *dentry,
 				      size_t size)
 {
 	const char *name = ovl_xattr(ofs, ox);
-	return vfs_getxattr(dentry, name, value, size);
+	return vfs_getxattr(&init_user_ns, dentry, name, value, size);
 }
 
 static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
@@ -194,7 +194,7 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
 				  size_t size)
 {
 	const char *name = ovl_xattr(ofs, ox);
-	int err = vfs_setxattr(dentry, name, value, size, 0);
+	int err = vfs_setxattr(&init_user_ns, dentry, name, value, size, 0);
 	pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n",
 		 dentry, name, min((int)size, 48), value, size, err);
 	return err;
@@ -204,7 +204,7 @@ static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
 				     enum ovl_xattr ox)
 {
 	const char *name = ovl_xattr(ofs, ox);
-	int err = vfs_removexattr(dentry, name);
+	int err = vfs_removexattr(&init_user_ns, dentry, name);
 	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
 	return err;
 }
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e24c995c5fd4..8168ab2dda11 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -794,11 +794,13 @@ retry:
 		 * allowed as upper are limited to "normal" ones, where checking
 		 * for the above two errors is sufficient.
 		 */
-		err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT);
+		err = vfs_removexattr(&init_user_ns, work,
+				      XATTR_NAME_POSIX_ACL_DEFAULT);
 		if (err && err != -ENODATA && err != -EOPNOTSUPP)
 			goto out_dput;
 
-		err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS);
+		err = vfs_removexattr(&init_user_ns, work,
+				      XATTR_NAME_POSIX_ACL_ACCESS);
 		if (err && err != -ENODATA && err != -EOPNOTSUPP)
 			goto out_dput;
 
diff --git a/fs/xattr.c b/fs/xattr.c
index d777025121e0..a49541713b11 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -83,7 +83,8 @@ xattr_resolve_name(struct inode *inode, const char **name)
  * because different namespaces have very different rules.
  */
 static int
-xattr_permission(struct inode *inode, const char *name, int mask)
+xattr_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		 const char *name, int mask)
 {
 	/*
 	 * We can never set or remove an extended attribute on a read-only
@@ -128,11 +129,11 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 			return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
 		if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
 		    (mask & MAY_WRITE) &&
-		    !inode_owner_or_capable(&init_user_ns, inode))
+		    !inode_owner_or_capable(mnt_userns, inode))
 			return -EPERM;
 	}
 
-	return inode_permission(&init_user_ns, inode, mask);
+	return inode_permission(mnt_userns, inode, mask);
 }
 
 /*
@@ -163,8 +164,9 @@ xattr_supported_namespace(struct inode *inode, const char *prefix)
 EXPORT_SYMBOL(xattr_supported_namespace);
 
 int
-__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
-	       const void *value, size_t size, int flags)
+__vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+	       struct inode *inode, const char *name, const void *value,
+	       size_t size, int flags)
 {
 	const struct xattr_handler *handler;
 
@@ -175,7 +177,7 @@ __vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
 		return -EOPNOTSUPP;
 	if (size == 0)
 		value = "";  /* empty EA, do not remove */
-	return handler->set(handler, &init_user_ns, dentry, inode, name, value,
+	return handler->set(handler, mnt_userns, dentry, inode, name, value,
 			    size, flags);
 }
 EXPORT_SYMBOL(__vfs_setxattr);
@@ -184,6 +186,7 @@ EXPORT_SYMBOL(__vfs_setxattr);
  *  __vfs_setxattr_noperm - perform setxattr operation without performing
  *  permission checks.
  *
+ *  @mnt_userns - user namespace of the mount the inode was found from
  *  @dentry - object to perform setxattr on
  *  @name - xattr name to set
  *  @value - value to set @name to
@@ -196,8 +199,9 @@ EXPORT_SYMBOL(__vfs_setxattr);
  *  is executed. It also assumes that the caller will make the appropriate
  *  permission checks.
  */
-int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags)
+int __vfs_setxattr_noperm(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, const char *name,
+			  const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
 	int error = -EAGAIN;
@@ -207,7 +211,8 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
 	if (issec)
 		inode->i_flags &= ~S_NOSEC;
 	if (inode->i_opflags & IOP_XATTR) {
-		error = __vfs_setxattr(dentry, inode, name, value, size, flags);
+		error = __vfs_setxattr(mnt_userns, dentry, inode, name, value,
+				       size, flags);
 		if (!error) {
 			fsnotify_xattr(dentry);
 			security_inode_post_setxattr(dentry, name, value,
@@ -246,14 +251,14 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
  *  a delegation was broken on, NULL if none.
  */
 int
-__vfs_setxattr_locked(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags,
-		struct inode **delegated_inode)
+__vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry,
+		      const char *name, const void *value, size_t size,
+		      int flags, struct inode **delegated_inode)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
 
-	error = xattr_permission(inode, name, MAY_WRITE);
+	error = xattr_permission(mnt_userns, inode, name, MAY_WRITE);
 	if (error)
 		return error;
 
@@ -265,7 +270,8 @@ __vfs_setxattr_locked(struct dentry *dentry, const char *name,
 	if (error)
 		goto out;
 
-	error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
+	error = __vfs_setxattr_noperm(mnt_userns, dentry, name, value,
+				      size, flags);
 
 out:
 	return error;
@@ -273,8 +279,8 @@ out:
 EXPORT_SYMBOL_GPL(__vfs_setxattr_locked);
 
 int
-vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		size_t size, int flags)
+vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+	     const char *name, const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
 	struct inode *delegated_inode = NULL;
@@ -282,7 +288,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 	int error;
 
 	if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
-		error = cap_convert_nscap(&init_user_ns, dentry, &value, size);
+		error = cap_convert_nscap(mnt_userns, dentry, &value, size);
 		if (error < 0)
 			return error;
 		size = error;
@@ -290,8 +296,8 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 
 retry_deleg:
 	inode_lock(inode);
-	error = __vfs_setxattr_locked(dentry, name, value, size, flags,
-	    &delegated_inode);
+	error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size,
+				      flags, &delegated_inode);
 	inode_unlock(inode);
 
 	if (delegated_inode) {
@@ -341,15 +347,16 @@ out_noalloc:
  * Returns the result of alloc, if failed, or the getxattr operation.
  */
 ssize_t
-vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
-		   size_t xattr_size, gfp_t flags)
+vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   const char *name, char **xattr_value, size_t xattr_size,
+		   gfp_t flags)
 {
 	const struct xattr_handler *handler;
 	struct inode *inode = dentry->d_inode;
 	char *value = *xattr_value;
 	int error;
 
-	error = xattr_permission(inode, name, MAY_READ);
+	error = xattr_permission(mnt_userns, inode, name, MAY_READ);
 	if (error)
 		return error;
 
@@ -390,12 +397,13 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name,
 EXPORT_SYMBOL(__vfs_getxattr);
 
 ssize_t
-vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
+vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+	     const char *name, void *value, size_t size)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
 
-	error = xattr_permission(inode, name, MAY_READ);
+	error = xattr_permission(mnt_userns, inode, name, MAY_READ);
 	if (error)
 		return error;
 
@@ -441,7 +449,8 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size)
 EXPORT_SYMBOL_GPL(vfs_listxattr);
 
 int
-__vfs_removexattr(struct dentry *dentry, const char *name)
+__vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  const char *name)
 {
 	struct inode *inode = d_inode(dentry);
 	const struct xattr_handler *handler;
@@ -451,8 +460,8 @@ __vfs_removexattr(struct dentry *dentry, const char *name)
 		return PTR_ERR(handler);
 	if (!handler->set)
 		return -EOPNOTSUPP;
-	return handler->set(handler, &init_user_ns, dentry, inode, name, NULL,
-			    0, XATTR_REPLACE);
+	return handler->set(handler, mnt_userns, dentry, inode, name, NULL, 0,
+			    XATTR_REPLACE);
 }
 EXPORT_SYMBOL(__vfs_removexattr);
 
@@ -466,13 +475,14 @@ EXPORT_SYMBOL(__vfs_removexattr);
  *  a delegation was broken on, NULL if none.
  */
 int
-__vfs_removexattr_locked(struct dentry *dentry, const char *name,
-		struct inode **delegated_inode)
+__vfs_removexattr_locked(struct user_namespace *mnt_userns,
+			 struct dentry *dentry, const char *name,
+			 struct inode **delegated_inode)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
 
-	error = xattr_permission(inode, name, MAY_WRITE);
+	error = xattr_permission(mnt_userns, inode, name, MAY_WRITE);
 	if (error)
 		return error;
 
@@ -484,7 +494,7 @@ __vfs_removexattr_locked(struct dentry *dentry, const char *name,
 	if (error)
 		goto out;
 
-	error = __vfs_removexattr(dentry, name);
+	error = __vfs_removexattr(mnt_userns, dentry, name);
 
 	if (!error) {
 		fsnotify_xattr(dentry);
@@ -497,7 +507,8 @@ out:
 EXPORT_SYMBOL_GPL(__vfs_removexattr_locked);
 
 int
-vfs_removexattr(struct dentry *dentry, const char *name)
+vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		const char *name)
 {
 	struct inode *inode = dentry->d_inode;
 	struct inode *delegated_inode = NULL;
@@ -505,7 +516,8 @@ vfs_removexattr(struct dentry *dentry, const char *name)
 
 retry_deleg:
 	inode_lock(inode);
-	error = __vfs_removexattr_locked(dentry, name, &delegated_inode);
+	error = __vfs_removexattr_locked(mnt_userns, dentry,
+					 name, &delegated_inode);
 	inode_unlock(inode);
 
 	if (delegated_inode) {
@@ -522,8 +534,9 @@ EXPORT_SYMBOL_GPL(vfs_removexattr);
  * Extended attribute SET operations
  */
 static long
-setxattr(struct dentry *d, const char __user *name, const void __user *value,
-	 size_t size, int flags)
+setxattr(struct user_namespace *mnt_userns, struct dentry *d,
+	 const char __user *name, const void __user *value, size_t size,
+	 int flags)
 {
 	int error;
 	void *kvalue = NULL;
@@ -550,11 +563,10 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 		}
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
-			posix_acl_fix_xattr_from_user(&init_user_ns, kvalue,
-						      size);
+			posix_acl_fix_xattr_from_user(mnt_userns, kvalue, size);
 	}
 
-	error = vfs_setxattr(d, kname, kvalue, size, flags);
+	error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags);
 out:
 	kvfree(kvalue);
 
@@ -567,13 +579,15 @@ static int path_setxattr(const char __user *pathname,
 {
 	struct path path;
 	int error;
+
 retry:
 	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
 	if (error)
 		return error;
 	error = mnt_want_write(path.mnt);
 	if (!error) {
-		error = setxattr(path.dentry, name, value, size, flags);
+		error = setxattr(mnt_user_ns(path.mnt), path.dentry, name,
+				 value, size, flags);
 		mnt_drop_write(path.mnt);
 	}
 	path_put(&path);
@@ -609,7 +623,9 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
 	audit_file(f.file);
 	error = mnt_want_write_file(f.file);
 	if (!error) {
-		error = setxattr(f.file->f_path.dentry, name, value, size, flags);
+		error = setxattr(file_mnt_user_ns(f.file),
+				 f.file->f_path.dentry, name,
+				 value, size, flags);
 		mnt_drop_write_file(f.file);
 	}
 	fdput(f);
@@ -620,8 +636,8 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
  * Extended attribute GET operations
  */
 static ssize_t
-getxattr(struct dentry *d, const char __user *name, void __user *value,
-	 size_t size)
+getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+	 const char __user *name, void __user *value, size_t size)
 {
 	ssize_t error;
 	void *kvalue = NULL;
@@ -641,12 +657,11 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
 			return -ENOMEM;
 	}
 
-	error = vfs_getxattr(d, kname, kvalue, size);
+	error = vfs_getxattr(mnt_userns, d, kname, kvalue, size);
 	if (error > 0) {
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
-			posix_acl_fix_xattr_to_user(&init_user_ns, kvalue,
-						    error);
+			posix_acl_fix_xattr_to_user(mnt_userns, kvalue, error);
 		if (size && copy_to_user(value, kvalue, error))
 			error = -EFAULT;
 	} else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
@@ -670,7 +685,7 @@ retry:
 	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
 	if (error)
 		return error;
-	error = getxattr(path.dentry, name, value, size);
+	error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size);
 	path_put(&path);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -700,7 +715,8 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
 	if (!f.file)
 		return error;
 	audit_file(f.file);
-	error = getxattr(f.file->f_path.dentry, name, value, size);
+	error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry,
+			 name, value, size);
 	fdput(f);
 	return error;
 }
@@ -784,7 +800,8 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
  * Extended attribute REMOVE operations
  */
 static long
-removexattr(struct dentry *d, const char __user *name)
+removexattr(struct user_namespace *mnt_userns, struct dentry *d,
+	    const char __user *name)
 {
 	int error;
 	char kname[XATTR_NAME_MAX + 1];
@@ -795,7 +812,7 @@ removexattr(struct dentry *d, const char __user *name)
 	if (error < 0)
 		return error;
 
-	return vfs_removexattr(d, kname);
+	return vfs_removexattr(mnt_userns, d, kname);
 }
 
 static int path_removexattr(const char __user *pathname,
@@ -809,7 +826,7 @@ retry:
 		return error;
 	error = mnt_want_write(path.mnt);
 	if (!error) {
-		error = removexattr(path.dentry, name);
+		error = removexattr(mnt_user_ns(path.mnt), path.dentry, name);
 		mnt_drop_write(path.mnt);
 	}
 	path_put(&path);
@@ -842,7 +859,8 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 	audit_file(f.file);
 	error = mnt_want_write_file(f.file);
 	if (!error) {
-		error = removexattr(f.file->f_path.dentry, name);
+		error = removexattr(file_mnt_user_ns(f.file),
+				    f.file->f_path.dentry, name);
 		mnt_drop_write_file(f.file);
 	}
 	fdput(f);
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 260c9bcb0edb..4c379d23ec6e 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/spinlock.h>
 #include <linux/mm.h>
+#include <linux/user_namespace.h>
 #include <uapi/linux/xattr.h>
 
 struct inode;
@@ -49,18 +50,26 @@ struct xattr {
 };
 
 ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t);
-ssize_t vfs_getxattr(struct dentry *, const char *, void *, size_t);
+ssize_t vfs_getxattr(struct user_namespace *, struct dentry *, const char *,
+		     void *, size_t);
 ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size);
-int __vfs_setxattr(struct dentry *, struct inode *, const char *, const void *, size_t, int);
-int __vfs_setxattr_noperm(struct dentry *, const char *, const void *, size_t, int);
-int __vfs_setxattr_locked(struct dentry *, const char *, const void *, size_t, int, struct inode **);
-int vfs_setxattr(struct dentry *, const char *, const void *, size_t, int);
-int __vfs_removexattr(struct dentry *, const char *);
-int __vfs_removexattr_locked(struct dentry *, const char *, struct inode **);
-int vfs_removexattr(struct dentry *, const char *);
+int __vfs_setxattr(struct user_namespace *, struct dentry *, struct inode *,
+		   const char *, const void *, size_t, int);
+int __vfs_setxattr_noperm(struct user_namespace *, struct dentry *,
+			  const char *, const void *, size_t, int);
+int __vfs_setxattr_locked(struct user_namespace *, struct dentry *,
+			  const char *, const void *, size_t, int,
+			  struct inode **);
+int vfs_setxattr(struct user_namespace *, struct dentry *, const char *,
+		 const void *, size_t, int);
+int __vfs_removexattr(struct user_namespace *, struct dentry *, const char *);
+int __vfs_removexattr_locked(struct user_namespace *, struct dentry *,
+			     const char *, struct inode **);
+int vfs_removexattr(struct user_namespace *, struct dentry *, const char *);
 
 ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
-ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name,
+ssize_t vfs_getxattr_alloc(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, const char *name,
 			   char **xattr_value, size_t size, gfp_t flags);
 
 int xattr_supported_namespace(struct inode *inode, const char *prefix);
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index f919ebd042fd..16f184bc48de 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -324,8 +324,8 @@ static int aa_xattrs_match(const struct linux_binprm *bprm,
 	d = bprm->file->f_path.dentry;
 
 	for (i = 0; i < profile->xattr_count; i++) {
-		size = vfs_getxattr_alloc(d, profile->xattrs[i], &value,
-					  value_size, GFP_KERNEL);
+		size = vfs_getxattr_alloc(&init_user_ns, d, profile->xattrs[i],
+					  &value, value_size, GFP_KERNEL);
 		if (size >= 0) {
 			u32 perm;
 
diff --git a/security/commoncap.c b/security/commoncap.c
index c3fd9b86ea9a..745dc1f2c97f 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -313,7 +313,7 @@ int cap_inode_killpriv(struct dentry *dentry)
 {
 	int error;
 
-	error = __vfs_removexattr(dentry, XATTR_NAME_CAPS);
+	error = __vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_CAPS);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 	return error;
@@ -386,8 +386,8 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 		return -EINVAL;
 
 	size = sizeof(struct vfs_ns_cap_data);
-	ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
-				 &tmpbuf, size, GFP_NOFS);
+	ret = (int)vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_CAPS,
+				      &tmpbuf, size, GFP_NOFS);
 	dput(dentry);
 
 	if (ret < 0)
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index 168c3b78ac47..f720f78cbbb1 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -222,7 +222,7 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry,
 				ima_present = true;
 			continue;
 		}
-		size = vfs_getxattr_alloc(dentry, xattr->name,
+		size = vfs_getxattr_alloc(&init_user_ns, dentry, xattr->name,
 					  &xattr_value, xattr_size, GFP_NOFS);
 		if (size == -ENOMEM) {
 			error = -ENOMEM;
@@ -275,8 +275,8 @@ static int evm_is_immutable(struct dentry *dentry, struct inode *inode)
 		return 1;
 
 	/* Do this the hard way */
-	rc = vfs_getxattr_alloc(dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0,
-				GFP_NOFS);
+	rc = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_EVM,
+				(char **)&xattr_data, 0, GFP_NOFS);
 	if (rc <= 0) {
 		if (rc == -ENODATA)
 			return 0;
@@ -319,11 +319,12 @@ int evm_update_evmxattr(struct dentry *dentry, const char *xattr_name,
 			   xattr_value_len, &data);
 	if (rc == 0) {
 		data.hdr.xattr.sha1.type = EVM_XATTR_HMAC;
-		rc = __vfs_setxattr_noperm(dentry, XATTR_NAME_EVM,
+		rc = __vfs_setxattr_noperm(&init_user_ns, dentry,
+					   XATTR_NAME_EVM,
 					   &data.hdr.xattr.data[1],
 					   SHA1_DIGEST_SIZE + 1, 0);
 	} else if (rc == -ENODATA && (inode->i_opflags & IOP_XATTR)) {
-		rc = __vfs_removexattr(dentry, XATTR_NAME_EVM);
+		rc = __vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_EVM);
 	}
 	return rc;
 }
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 76d19146d74b..0de367aaa2d3 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -146,8 +146,8 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry,
 	/* if status is not PASS, try to check again - against -ENOMEM */
 
 	/* first need to know the sig type */
-	rc = vfs_getxattr_alloc(dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0,
-				GFP_NOFS);
+	rc = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_EVM,
+				(char **)&xattr_data, 0, GFP_NOFS);
 	if (rc <= 0) {
 		evm_status = INTEGRITY_FAIL;
 		if (rc == -ENODATA) {
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 8361941ee0a1..70b643c41c6b 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -94,7 +94,7 @@ static int ima_fix_xattr(struct dentry *dentry,
 		iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG;
 		iint->ima_hash->xattr.ng.algo = algo;
 	}
-	rc = __vfs_setxattr_noperm(dentry, XATTR_NAME_IMA,
+	rc = __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_IMA,
 				   &iint->ima_hash->xattr.data[offset],
 				   (sizeof(iint->ima_hash->xattr) - offset) +
 				   iint->ima_hash->length, 0);
@@ -215,8 +215,8 @@ int ima_read_xattr(struct dentry *dentry,
 {
 	ssize_t ret;
 
-	ret = vfs_getxattr_alloc(dentry, XATTR_NAME_IMA, (char **)xattr_value,
-				 0, GFP_NOFS);
+	ret = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_IMA,
+				 (char **)xattr_value, 0, GFP_NOFS);
 	if (ret == -EOPNOTSUPP)
 		ret = 0;
 	return ret;
@@ -520,7 +520,7 @@ void ima_inode_post_setattr(struct dentry *dentry)
 
 	action = ima_must_appraise(inode, MAY_ACCESS, POST_SETATTR);
 	if (!action)
-		__vfs_removexattr(dentry, XATTR_NAME_IMA);
+		__vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_IMA);
 	iint = integrity_iint_find(inode);
 	if (iint) {
 		set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9d6d3da2caf2..2efedd7001b2 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6526,7 +6526,8 @@ static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen
  */
 static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
 {
-	return __vfs_setxattr_noperm(dentry, XATTR_NAME_SELINUX, ctx, ctxlen, 0);
+	return __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_SELINUX,
+				     ctx, ctxlen, 0);
 }
 
 static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index f69c3dd9a0c6..746e5743accc 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -3425,7 +3425,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
 			 */
 			if (isp->smk_flags & SMK_INODE_CHANGED) {
 				isp->smk_flags &= ~SMK_INODE_CHANGED;
-				rc = __vfs_setxattr(dp, inode,
+				rc = __vfs_setxattr(&init_user_ns, dp, inode,
 					XATTR_NAME_SMACKTRANSMUTE,
 					TRANS_TRUE, TRANS_TRUE_SIZE,
 					0);
@@ -4597,12 +4597,14 @@ static int smack_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
 
 static int smack_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
 {
-	return smack_inode_setsecurity(inode, XATTR_SMACK_SUFFIX, ctx, ctxlen, 0);
+	return smack_inode_setsecurity(inode, XATTR_SMACK_SUFFIX, ctx,
+				       ctxlen, 0);
 }
 
 static int smack_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
 {
-	return __vfs_setxattr_noperm(dentry, XATTR_NAME_SMACK, ctx, ctxlen, 0);
+	return __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_SMACK,
+				     ctx, ctxlen, 0);
 }
 
 static int smack_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
-- 
cgit v1.2.3


From 71bc356f93a1c589fad13f7487258f89c417976e Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:29 +0100
Subject: commoncap: handle idmapped mounts

When interacting with user namespace and non-user namespace aware
filesystem capabilities the vfs will perform various security checks to
determine whether or not the filesystem capabilities can be used by the
caller, whether they need to be removed and so on. The main
infrastructure for this resides in the capability codepaths but they are
called through the LSM security infrastructure even though they are not
technically an LSM or optional. This extends the existing security hooks
security_inode_removexattr(), security_inode_killpriv(),
security_inode_getsecurity() to pass down the mount's user namespace and
makes them aware of idmapped mounts.

In order to actually get filesystem capabilities from disk the
capability infrastructure exposes the get_vfs_caps_from_disk() helper.
For user namespace aware filesystem capabilities a root uid is stored
alongside the capabilities.

In order to determine whether the caller can make use of the filesystem
capability or whether it needs to be ignored it is translated according
to the superblock's user namespace. If it can be translated to uid 0
according to that id mapping the caller can use the filesystem
capabilities stored on disk. If we are accessing the inode that holds
the filesystem capabilities through an idmapped mount we map the root
uid according to the mount's user namespace. Afterwards the checks are
identical to non-idmapped mounts: reading filesystem caps from disk
enforces that the root uid associated with the filesystem capability
must have a mapping in the superblock's user namespace and that the
caller is either in the same user namespace or is a descendant of the
superblock's user namespace. For filesystems that are mountable inside
user namespace the caller can just mount the filesystem and won't
usually need to idmap it. If they do want to idmap it they can create an
idmapped mount and mark it with a user namespace they created and which
is thus a descendant of s_user_ns. For filesystems that are not
mountable inside user namespaces the descendant rule is trivially true
because the s_user_ns will be the initial user namespace.

If the initial user namespace is passed nothing changes so non-idmapped
mounts will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-11-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/attr.c                     |  2 +-
 fs/xattr.c                    | 18 ++++++++-----
 include/linux/capability.h    |  4 ++-
 include/linux/lsm_hook_defs.h | 15 ++++++-----
 include/linux/lsm_hooks.h     |  1 +
 include/linux/security.h      | 54 +++++++++++++++++++++++--------------
 kernel/auditsc.c              |  5 ++--
 security/commoncap.c          | 62 ++++++++++++++++++++++++++++++++++---------
 security/security.c           | 25 ++++++++++-------
 security/selinux/hooks.c      | 20 ++++++++------
 security/smack/smack_lsm.c    | 14 +++++-----
 11 files changed, 146 insertions(+), 74 deletions(-)

(limited to 'include')

diff --git a/fs/attr.c b/fs/attr.c
index f4543d2abdfb..c2d3bc0869d4 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -143,7 +143,7 @@ kill_priv:
 	if (ia_valid & ATTR_KILL_PRIV) {
 		int error;
 
-		error = security_inode_killpriv(dentry);
+		error = security_inode_killpriv(mnt_userns, dentry);
 		if (error)
 			return error;
 	}
diff --git a/fs/xattr.c b/fs/xattr.c
index a49541713b11..79c439d31eaa 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -262,7 +262,8 @@ __vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (error)
 		return error;
 
-	error = security_inode_setxattr(dentry, name, value, size, flags);
+	error = security_inode_setxattr(mnt_userns, dentry, name, value, size,
+					flags);
 	if (error)
 		goto out;
 
@@ -313,18 +314,20 @@ retry_deleg:
 EXPORT_SYMBOL_GPL(vfs_setxattr);
 
 static ssize_t
-xattr_getsecurity(struct inode *inode, const char *name, void *value,
-			size_t size)
+xattr_getsecurity(struct user_namespace *mnt_userns, struct inode *inode,
+		  const char *name, void *value, size_t size)
 {
 	void *buffer = NULL;
 	ssize_t len;
 
 	if (!value || !size) {
-		len = security_inode_getsecurity(inode, name, &buffer, false);
+		len = security_inode_getsecurity(mnt_userns, inode, name,
+						 &buffer, false);
 		goto out_noalloc;
 	}
 
-	len = security_inode_getsecurity(inode, name, &buffer, true);
+	len = security_inode_getsecurity(mnt_userns, inode, name, &buffer,
+					 true);
 	if (len < 0)
 		return len;
 	if (size < len) {
@@ -414,7 +417,8 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
 				XATTR_SECURITY_PREFIX_LEN)) {
 		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
-		int ret = xattr_getsecurity(inode, suffix, value, size);
+		int ret = xattr_getsecurity(mnt_userns, inode, suffix, value,
+					    size);
 		/*
 		 * Only overwrite the return value if a security module
 		 * is actually active.
@@ -486,7 +490,7 @@ __vfs_removexattr_locked(struct user_namespace *mnt_userns,
 	if (error)
 		return error;
 
-	error = security_inode_removexattr(dentry, name);
+	error = security_inode_removexattr(mnt_userns, dentry, name);
 	if (error)
 		goto out;
 
diff --git a/include/linux/capability.h b/include/linux/capability.h
index da21ef118b04..65efb74c3585 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -271,7 +271,9 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
 }
 
 /* audit system wants to get cap info from files as well */
-extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
+int get_vfs_caps_from_disk(struct user_namespace *mnt_userns,
+			   const struct dentry *dentry,
+			   struct cpu_vfs_cap_data *cpu_caps);
 
 int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
 		      const void **ivalue, size_t size);
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 7aaa753b8608..df4cdad6681e 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -133,17 +133,20 @@ LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode,
 LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask)
 LSM_HOOK(int, 0, inode_setattr, struct dentry *dentry, struct iattr *attr)
 LSM_HOOK(int, 0, inode_getattr, const struct path *path)
-LSM_HOOK(int, 0, inode_setxattr, struct dentry *dentry, const char *name,
-	 const void *value, size_t size, int flags)
+LSM_HOOK(int, 0, inode_setxattr, struct user_namespace *mnt_userns,
+	 struct dentry *dentry, const char *name, const void *value,
+	 size_t size, int flags)
 LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry,
 	 const char *name, const void *value, size_t size, int flags)
 LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name)
 LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry)
-LSM_HOOK(int, 0, inode_removexattr, struct dentry *dentry, const char *name)
+LSM_HOOK(int, 0, inode_removexattr, struct user_namespace *mnt_userns,
+	 struct dentry *dentry, const char *name)
 LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry)
-LSM_HOOK(int, 0, inode_killpriv, struct dentry *dentry)
-LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct inode *inode,
-	 const char *name, void **buffer, bool alloc)
+LSM_HOOK(int, 0, inode_killpriv, struct user_namespace *mnt_userns,
+	 struct dentry *dentry)
+LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct user_namespace *mnt_userns,
+	 struct inode *inode, const char *name, void **buffer, bool alloc)
 LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode,
 	 const char *name, const void *value, size_t size, int flags)
 LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer,
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a19adef1f088..98a5e263aabb 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -444,6 +444,7 @@
  * @inode_killpriv:
  *	The setuid bit is being removed.  Remove similar security labels.
  *	Called with the dentry->d_inode->i_mutex held.
+ *	@mnt_userns: user namespace of the mount
  *	@dentry is the dentry being changed.
  *	Return 0 on success.  If error is returned, then the operation
  *	causing setuid bit removal is failed.
diff --git a/include/linux/security.h b/include/linux/security.h
index c35ea0ffccd9..4e4f6c31e413 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -145,13 +145,16 @@ extern int cap_capset(struct cred *new, const struct cred *old,
 		      const kernel_cap_t *inheritable,
 		      const kernel_cap_t *permitted);
 extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file);
-extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
-			      const void *value, size_t size, int flags);
-extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
-extern int cap_inode_need_killpriv(struct dentry *dentry);
-extern int cap_inode_killpriv(struct dentry *dentry);
-extern int cap_inode_getsecurity(struct inode *inode, const char *name,
-				 void **buffer, bool alloc);
+int cap_inode_setxattr(struct dentry *dentry, const char *name,
+		       const void *value, size_t size, int flags);
+int cap_inode_removexattr(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, const char *name);
+int cap_inode_need_killpriv(struct dentry *dentry);
+int cap_inode_killpriv(struct user_namespace *mnt_userns,
+		       struct dentry *dentry);
+int cap_inode_getsecurity(struct user_namespace *mnt_userns,
+			  struct inode *inode, const char *name, void **buffer,
+			  bool alloc);
 extern int cap_mmap_addr(unsigned long addr);
 extern int cap_mmap_file(struct file *file, unsigned long reqprot,
 			 unsigned long prot, unsigned long flags);
@@ -345,16 +348,21 @@ int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
 int security_inode_permission(struct inode *inode, int mask);
 int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
 int security_inode_getattr(const struct path *path);
-int security_inode_setxattr(struct dentry *dentry, const char *name,
+int security_inode_setxattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, const char *name,
 			    const void *value, size_t size, int flags);
 void security_inode_post_setxattr(struct dentry *dentry, const char *name,
 				  const void *value, size_t size, int flags);
 int security_inode_getxattr(struct dentry *dentry, const char *name);
 int security_inode_listxattr(struct dentry *dentry);
-int security_inode_removexattr(struct dentry *dentry, const char *name);
+int security_inode_removexattr(struct user_namespace *mnt_userns,
+			       struct dentry *dentry, const char *name);
 int security_inode_need_killpriv(struct dentry *dentry);
-int security_inode_killpriv(struct dentry *dentry);
-int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc);
+int security_inode_killpriv(struct user_namespace *mnt_userns,
+			    struct dentry *dentry);
+int security_inode_getsecurity(struct user_namespace *mnt_userns,
+			       struct inode *inode, const char *name,
+			       void **buffer, bool alloc);
 int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags);
 int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size);
 void security_inode_getsecid(struct inode *inode, u32 *secid);
@@ -831,8 +839,9 @@ static inline int security_inode_getattr(const struct path *path)
 	return 0;
 }
 
-static inline int security_inode_setxattr(struct dentry *dentry,
-		const char *name, const void *value, size_t size, int flags)
+static inline int security_inode_setxattr(struct user_namespace *mnt_userns,
+		struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags)
 {
 	return cap_inode_setxattr(dentry, name, value, size, flags);
 }
@@ -852,10 +861,11 @@ static inline int security_inode_listxattr(struct dentry *dentry)
 	return 0;
 }
 
-static inline int security_inode_removexattr(struct dentry *dentry,
-			const char *name)
+static inline int security_inode_removexattr(struct user_namespace *mnt_userns,
+					     struct dentry *dentry,
+					     const char *name)
 {
-	return cap_inode_removexattr(dentry, name);
+	return cap_inode_removexattr(mnt_userns, dentry, name);
 }
 
 static inline int security_inode_need_killpriv(struct dentry *dentry)
@@ -863,14 +873,18 @@ static inline int security_inode_need_killpriv(struct dentry *dentry)
 	return cap_inode_need_killpriv(dentry);
 }
 
-static inline int security_inode_killpriv(struct dentry *dentry)
+static inline int security_inode_killpriv(struct user_namespace *mnt_userns,
+					  struct dentry *dentry)
 {
-	return cap_inode_killpriv(dentry);
+	return cap_inode_killpriv(mnt_userns, dentry);
 }
 
-static inline int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+static inline int security_inode_getsecurity(struct user_namespace *mnt_userns,
+					     struct inode *inode,
+					     const char *name, void **buffer,
+					     bool alloc)
 {
-	return cap_inode_getsecurity(inode, name, buffer, alloc);
+	return cap_inode_getsecurity(mnt_userns, inode, name, buffer, alloc);
 }
 
 static inline int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ce8c9e2279ba..fdfdd71405ff 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1930,7 +1930,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
 	if (!dentry)
 		return 0;
 
-	rc = get_vfs_caps_from_disk(dentry, &caps);
+	rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
 	if (rc)
 		return rc;
 
@@ -2481,7 +2481,8 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 	ax->d.next = context->aux;
 	context->aux = (void *)ax;
 
-	get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+	get_vfs_caps_from_disk(&init_user_ns,
+			       bprm->file->f_path.dentry, &vcaps);
 
 	ax->fcap.permitted = vcaps.permitted;
 	ax->fcap.inheritable = vcaps.inheritable;
diff --git a/security/commoncap.c b/security/commoncap.c
index 745dc1f2c97f..234b074c2c58 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -303,17 +303,25 @@ int cap_inode_need_killpriv(struct dentry *dentry)
 
 /**
  * cap_inode_killpriv - Erase the security markings on an inode
- * @dentry: The inode/dentry to alter
+ *
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dentry:	The inode/dentry to alter
  *
  * Erase the privilege-enhancing security markings on an inode.
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
  * Returns 0 if successful, -ve on error.
  */
-int cap_inode_killpriv(struct dentry *dentry)
+int cap_inode_killpriv(struct user_namespace *mnt_userns, struct dentry *dentry)
 {
 	int error;
 
-	error = __vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_CAPS);
+	error = __vfs_removexattr(mnt_userns, dentry, XATTR_NAME_CAPS);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 	return error;
@@ -366,7 +374,8 @@ static bool is_v3header(size_t size, const struct vfs_cap_data *cap)
  * by the integrity subsystem, which really wants the unconverted values -
  * so that's good.
  */
-int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
+int cap_inode_getsecurity(struct user_namespace *mnt_userns,
+			  struct inode *inode, const char *name, void **buffer,
 			  bool alloc)
 {
 	int size, ret;
@@ -386,7 +395,7 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 		return -EINVAL;
 
 	size = sizeof(struct vfs_ns_cap_data);
-	ret = (int)vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_CAPS,
+	ret = (int)vfs_getxattr_alloc(mnt_userns, dentry, XATTR_NAME_CAPS,
 				      &tmpbuf, size, GFP_NOFS);
 	dput(dentry);
 
@@ -412,6 +421,9 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 	root = le32_to_cpu(nscap->rootid);
 	kroot = make_kuid(fs_ns, root);
 
+	/* If this is an idmapped mount shift the kuid. */
+	kroot = kuid_into_mnt(mnt_userns, kroot);
+
 	/* If the root kuid maps to a valid uid in current ns, then return
 	 * this as a nscap. */
 	mappedroot = from_kuid(current_user_ns(), kroot);
@@ -595,10 +607,24 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
 	return *effective ? ret : 0;
 }
 
-/*
+/**
+ * get_vfs_caps_from_disk - retrieve vfs caps from disk
+ *
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dentry:	dentry from which @inode is retrieved
+ * @cpu_caps:	vfs capabilities
+ *
  * Extract the on-exec-apply capability sets for an executable file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
  */
-int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
+int get_vfs_caps_from_disk(struct user_namespace *mnt_userns,
+			   const struct dentry *dentry,
+			   struct cpu_vfs_cap_data *cpu_caps)
 {
 	struct inode *inode = d_backing_inode(dentry);
 	__u32 magic_etc;
@@ -654,6 +680,7 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
 	/* Limit the caps to the mounter of the filesystem
 	 * or the more limited uid specified in the xattr.
 	 */
+	rootkuid = kuid_into_mnt(mnt_userns, rootkuid);
 	if (!rootid_owns_currentns(rootkuid))
 		return -ENODATA;
 
@@ -699,7 +726,8 @@ static int get_file_caps(struct linux_binprm *bprm, struct file *file,
 	if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
 		return 0;
 
-	rc = get_vfs_caps_from_disk(file->f_path.dentry, &vcaps);
+	rc = get_vfs_caps_from_disk(file_mnt_user_ns(file),
+				    file->f_path.dentry, &vcaps);
 	if (rc < 0) {
 		if (rc == -EINVAL)
 			printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
@@ -964,16 +992,25 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
 
 /**
  * cap_inode_removexattr - Determine whether an xattr may be removed
- * @dentry: The inode/dentry being altered
- * @name: The name of the xattr to be changed
+ *
+ * @mnt_userns:	User namespace of the mount the inode was found from
+ * @dentry:	The inode/dentry being altered
+ * @name:	The name of the xattr to be changed
  *
  * Determine whether an xattr may be removed from an inode, returning 0 if
  * permission is granted, -ve if denied.
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
  * This is used to make sure security xattrs don't get removed by those who
  * aren't privileged to remove them.
  */
-int cap_inode_removexattr(struct dentry *dentry, const char *name)
+int cap_inode_removexattr(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, const char *name)
 {
 	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
 
@@ -987,8 +1024,7 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
 		struct inode *inode = d_backing_inode(dentry);
 		if (!inode)
 			return -EINVAL;
-		if (!capable_wrt_inode_uidgid(&init_user_ns, inode,
-					      CAP_SETFCAP))
+		if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
 			return -EPERM;
 		return 0;
 	}
diff --git a/security/security.c b/security/security.c
index 7b09cfbae94f..698a9f17bad7 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1280,7 +1280,8 @@ int security_inode_getattr(const struct path *path)
 	return call_int_hook(inode_getattr, 0, path);
 }
 
-int security_inode_setxattr(struct dentry *dentry, const char *name,
+int security_inode_setxattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, const char *name,
 			    const void *value, size_t size, int flags)
 {
 	int ret;
@@ -1291,8 +1292,8 @@ int security_inode_setxattr(struct dentry *dentry, const char *name,
 	 * SELinux and Smack integrate the cap call,
 	 * so assume that all LSMs supplying this call do so.
 	 */
-	ret = call_int_hook(inode_setxattr, 1, dentry, name, value, size,
-				flags);
+	ret = call_int_hook(inode_setxattr, 1, mnt_userns, dentry, name, value,
+			    size, flags);
 
 	if (ret == 1)
 		ret = cap_inode_setxattr(dentry, name, value, size, flags);
@@ -1327,7 +1328,8 @@ int security_inode_listxattr(struct dentry *dentry)
 	return call_int_hook(inode_listxattr, 0, dentry);
 }
 
-int security_inode_removexattr(struct dentry *dentry, const char *name)
+int security_inode_removexattr(struct user_namespace *mnt_userns,
+			       struct dentry *dentry, const char *name)
 {
 	int ret;
 
@@ -1337,9 +1339,9 @@ int security_inode_removexattr(struct dentry *dentry, const char *name)
 	 * SELinux and Smack integrate the cap call,
 	 * so assume that all LSMs supplying this call do so.
 	 */
-	ret = call_int_hook(inode_removexattr, 1, dentry, name);
+	ret = call_int_hook(inode_removexattr, 1, mnt_userns, dentry, name);
 	if (ret == 1)
-		ret = cap_inode_removexattr(dentry, name);
+		ret = cap_inode_removexattr(mnt_userns, dentry, name);
 	if (ret)
 		return ret;
 	ret = ima_inode_removexattr(dentry, name);
@@ -1353,12 +1355,15 @@ int security_inode_need_killpriv(struct dentry *dentry)
 	return call_int_hook(inode_need_killpriv, 0, dentry);
 }
 
-int security_inode_killpriv(struct dentry *dentry)
+int security_inode_killpriv(struct user_namespace *mnt_userns,
+			    struct dentry *dentry)
 {
-	return call_int_hook(inode_killpriv, 0, dentry);
+	return call_int_hook(inode_killpriv, 0, mnt_userns, dentry);
 }
 
-int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+int security_inode_getsecurity(struct user_namespace *mnt_userns,
+			       struct inode *inode, const char *name,
+			       void **buffer, bool alloc)
 {
 	struct security_hook_list *hp;
 	int rc;
@@ -1369,7 +1374,7 @@ int security_inode_getsecurity(struct inode *inode, const char *name, void **buf
 	 * Only one module will provide an attribute with a given name.
 	 */
 	hlist_for_each_entry(hp, &security_hook_heads.inode_getsecurity, list) {
-		rc = hp->hook.inode_getsecurity(inode, name, buffer, alloc);
+		rc = hp->hook.inode_getsecurity(mnt_userns, inode, name, buffer, alloc);
 		if (rc != LSM_RET_DEFAULT(inode_getsecurity))
 			return rc;
 	}
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 2efedd7001b2..9719dd124221 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3119,7 +3119,8 @@ static bool has_cap_mac_admin(bool audit)
 	return true;
 }
 
-static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
+static int selinux_inode_setxattr(struct user_namespace *mnt_userns,
+				  struct dentry *dentry, const char *name,
 				  const void *value, size_t size, int flags)
 {
 	struct inode *inode = d_backing_inode(dentry);
@@ -3140,13 +3141,13 @@ static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
 	}
 
 	if (!selinux_initialized(&selinux_state))
-		return (inode_owner_or_capable(&init_user_ns, inode) ? 0 : -EPERM);
+		return (inode_owner_or_capable(mnt_userns, inode) ? 0 : -EPERM);
 
 	sbsec = inode->i_sb->s_security;
 	if (!(sbsec->flags & SBLABEL_MNT))
 		return -EOPNOTSUPP;
 
-	if (!inode_owner_or_capable(&init_user_ns, inode))
+	if (!inode_owner_or_capable(mnt_userns, inode))
 		return -EPERM;
 
 	ad.type = LSM_AUDIT_DATA_DENTRY;
@@ -3267,10 +3268,11 @@ static int selinux_inode_listxattr(struct dentry *dentry)
 	return dentry_has_perm(cred, dentry, FILE__GETATTR);
 }
 
-static int selinux_inode_removexattr(struct dentry *dentry, const char *name)
+static int selinux_inode_removexattr(struct user_namespace *mnt_userns,
+				     struct dentry *dentry, const char *name)
 {
 	if (strcmp(name, XATTR_NAME_SELINUX)) {
-		int rc = cap_inode_removexattr(dentry, name);
+		int rc = cap_inode_removexattr(mnt_userns, dentry, name);
 		if (rc)
 			return rc;
 
@@ -3336,7 +3338,9 @@ static int selinux_path_notify(const struct path *path, u64 mask,
  *
  * Permission check is handled by selinux_inode_getxattr hook.
  */
-static int selinux_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+static int selinux_inode_getsecurity(struct user_namespace *mnt_userns,
+				     struct inode *inode, const char *name,
+				     void **buffer, bool alloc)
 {
 	u32 size;
 	int error;
@@ -6533,8 +6537,8 @@ static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
 static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
 {
 	int len = 0;
-	len = selinux_inode_getsecurity(inode, XATTR_SELINUX_SUFFIX,
-						ctx, true);
+	len = selinux_inode_getsecurity(&init_user_ns, inode,
+					XATTR_SELINUX_SUFFIX, ctx, true);
 	if (len < 0)
 		return len;
 	*ctxlen = len;
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 746e5743accc..12a45e61c1a5 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1240,7 +1240,8 @@ static int smack_inode_getattr(const struct path *path)
  *
  * Returns 0 if access is permitted, an error code otherwise
  */
-static int smack_inode_setxattr(struct dentry *dentry, const char *name,
+static int smack_inode_setxattr(struct user_namespace *mnt_userns,
+				struct dentry *dentry, const char *name,
 				const void *value, size_t size, int flags)
 {
 	struct smk_audit_info ad;
@@ -1362,7 +1363,8 @@ static int smack_inode_getxattr(struct dentry *dentry, const char *name)
  *
  * Returns 0 if access is permitted, an error code otherwise
  */
-static int smack_inode_removexattr(struct dentry *dentry, const char *name)
+static int smack_inode_removexattr(struct user_namespace *mnt_userns,
+				   struct dentry *dentry, const char *name)
 {
 	struct inode_smack *isp;
 	struct smk_audit_info ad;
@@ -1377,7 +1379,7 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name)
 		if (!smack_privileged(CAP_MAC_ADMIN))
 			rc = -EPERM;
 	} else
-		rc = cap_inode_removexattr(dentry, name);
+		rc = cap_inode_removexattr(mnt_userns, dentry, name);
 
 	if (rc != 0)
 		return rc;
@@ -1420,9 +1422,9 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name)
  *
  * Returns the size of the attribute or an error code
  */
-static int smack_inode_getsecurity(struct inode *inode,
-				   const char *name, void **buffer,
-				   bool alloc)
+static int smack_inode_getsecurity(struct user_namespace *mnt_userns,
+				   struct inode *inode, const char *name,
+				   void **buffer, bool alloc)
 {
 	struct socket_smack *ssp;
 	struct socket *sock;
-- 
cgit v1.2.3


From 0d56a4518d5eaf595a24ab2202e171330bb2ed72 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:30 +0100
Subject: stat: handle idmapped mounts

The generic_fillattr() helper fills in the basic attributes associated
with an inode. Enable it to handle idmapped mounts. If the inode is
accessed through an idmapped mount map it into the mount's user
namespace before we store the uid and gid. If the initial user namespace
is passed nothing changes so non-idmapped mounts will see identical
behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-12-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/9p/vfs_inode.c      |  4 ++--
 fs/9p/vfs_inode_dotl.c |  4 ++--
 fs/afs/inode.c         |  2 +-
 fs/btrfs/inode.c       |  2 +-
 fs/ceph/inode.c        |  2 +-
 fs/cifs/inode.c        |  2 +-
 fs/coda/inode.c        |  2 +-
 fs/ecryptfs/inode.c    |  4 ++--
 fs/erofs/inode.c       |  2 +-
 fs/exfat/file.c        |  2 +-
 fs/ext2/inode.c        |  2 +-
 fs/ext4/inode.c        |  2 +-
 fs/f2fs/file.c         |  2 +-
 fs/fat/file.c          |  2 +-
 fs/fuse/dir.c          |  2 +-
 fs/gfs2/inode.c        |  2 +-
 fs/hfsplus/inode.c     |  2 +-
 fs/kernfs/inode.c      |  2 +-
 fs/libfs.c             |  4 ++--
 fs/minix/inode.c       |  2 +-
 fs/nfs/inode.c         |  2 +-
 fs/nfs/namespace.c     |  2 +-
 fs/ocfs2/file.c        |  2 +-
 fs/orangefs/inode.c    |  2 +-
 fs/proc/base.c         |  4 ++--
 fs/proc/generic.c      |  2 +-
 fs/proc/proc_net.c     |  2 +-
 fs/proc/proc_sysctl.c  |  2 +-
 fs/proc/root.c         |  2 +-
 fs/stat.c              | 20 ++++++++++++++------
 fs/sysv/itree.c        |  2 +-
 fs/ubifs/dir.c         |  2 +-
 fs/udf/symlink.c       |  2 +-
 fs/vboxsf/utils.c      |  2 +-
 include/linux/fs.h     |  2 +-
 mm/shmem.c             |  2 +-
 36 files changed, 54 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9c3ff6e9ab82..c21b146c8d91 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1027,7 +1027,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
 	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		generic_fillattr(d_inode(dentry), stat);
+		generic_fillattr(&init_user_ns, d_inode(dentry), stat);
 		return 0;
 	}
 	fid = v9fs_fid_lookup(dentry);
@@ -1040,7 +1040,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
 		return PTR_ERR(st);
 
 	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
-	generic_fillattr(d_inode(dentry), stat);
+	generic_fillattr(&init_user_ns, d_inode(dentry), stat);
 
 	p9stat_free(st);
 	kfree(st);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 302553101fcb..984f28315d2a 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -468,7 +468,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat,
 	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		generic_fillattr(d_inode(dentry), stat);
+		generic_fillattr(&init_user_ns, d_inode(dentry), stat);
 		return 0;
 	}
 	fid = v9fs_fid_lookup(dentry);
@@ -485,7 +485,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat,
 		return PTR_ERR(st);
 
 	v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
-	generic_fillattr(d_inode(dentry), stat);
+	generic_fillattr(&init_user_ns, d_inode(dentry), stat);
 	/* Change block size to what the server returned */
 	stat->blksize = st->st_blksize;
 
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index b0d7b892090d..795ee5cb3817 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -745,7 +745,7 @@ int afs_getattr(const struct path *path, struct kstat *stat,
 
 	do {
 		read_seqbegin_or_lock(&vnode->cb_lock, &seq);
-		generic_fillattr(inode, stat);
+		generic_fillattr(&init_user_ns, inode, stat);
 		if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
 		    stat->nlink > 0)
 			stat->nlink -= 1;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6c18fb1a25af..a63faed171de 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8842,7 +8842,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
 				  STATX_ATTR_IMMUTABLE |
 				  STATX_ATTR_NODUMP);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->dev = BTRFS_I(inode)->root->anon_dev;
 
 	spin_lock(&BTRFS_I(inode)->lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 145e26a4ddbb..179a2bb88538 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2385,7 +2385,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
 			return err;
 	}
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->ino = ceph_present_inode(inode);
 
 	/*
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 27554f71f744..374abce7efaf 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2408,7 +2408,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat,
 			return rc;
 	}
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->blksize = cifs_sb->ctx->bsize;
 	stat->ino = CIFS_I(inode)->uniqueid;
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index b1c70e2b9b1e..4d113e191cb8 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -256,7 +256,7 @@ int coda_getattr(const struct path *path, struct kstat *stat,
 {
 	int err = coda_revalidate_inode(d_inode(path->dentry));
 	if (!err)
-		generic_fillattr(d_inode(path->dentry), stat);
+		generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
 	return err;
 }
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index b9ccc4085d46..385b5e8741c0 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -977,7 +977,7 @@ static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat,
 
 	mount_crypt_stat = &ecryptfs_superblock_to_private(
 						dentry->d_sb)->mount_crypt_stat;
-	generic_fillattr(d_inode(dentry), stat);
+	generic_fillattr(&init_user_ns, d_inode(dentry), stat);
 	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
 		char *target;
 		size_t targetsiz;
@@ -1005,7 +1005,7 @@ static int ecryptfs_getattr(const struct path *path, struct kstat *stat,
 	if (!rc) {
 		fsstack_copy_attr_all(d_inode(dentry),
 				      ecryptfs_inode_to_lower(d_inode(dentry)));
-		generic_fillattr(d_inode(dentry), stat);
+		generic_fillattr(&init_user_ns, d_inode(dentry), stat);
 		stat->blocks = lower_stat.blocks;
 	}
 	return rc;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 3e21c0e8adae..083818063ac6 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -343,7 +343,7 @@ int erofs_getattr(const struct path *path, struct kstat *stat,
 	stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
 				  STATX_ATTR_IMMUTABLE);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index ace35aa8e64b..e9705b3295d3 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -273,7 +273,7 @@ int exfat_getattr(const struct path *path, struct kstat *stat,
 	struct inode *inode = d_backing_inode(path->dentry);
 	struct exfat_inode_info *ei = EXFAT_I(inode);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	exfat_truncate_atime(&stat->atime);
 	stat->result_mask |= STATX_BTIME;
 	stat->btime.tv_sec = ei->i_crtime.tv_sec;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 9de813635d8d..3d8acafca8ce 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1660,7 +1660,7 @@ int ext2_getattr(const struct path *path, struct kstat *stat,
 			STATX_ATTR_IMMUTABLE |
 			STATX_ATTR_NODUMP);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 24ea5851e90a..3a303d3f8423 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5571,7 +5571,7 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
 				  STATX_ATTR_NODUMP |
 				  STATX_ATTR_VERITY);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 6ccdfe0606d9..44cd0dbdbb5d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -820,7 +820,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
 				  STATX_ATTR_NODUMP |
 				  STATX_ATTR_VERITY);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 
 	/* we need to show initial sectors used for inline_data/dentries */
 	if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) ||
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 805b501467e9..f7e04f533d31 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -398,7 +398,7 @@ int fat_getattr(const struct path *path, struct kstat *stat,
 		u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
 
 	if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 74fdb6a7ebb3..d2e318ed9b26 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1087,7 +1087,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
 		forget_all_cached_acls(inode);
 		err = fuse_do_getattr(inode, stat, file);
 	} else if (stat) {
-		generic_fillattr(inode, stat);
+		generic_fillattr(&init_user_ns, inode, stat);
 		stat->mode = fi->orig_i_mode;
 		stat->ino = fi->orig_ino;
 	}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 728405d15a05..226b5b1dc1fa 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -2050,7 +2050,7 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat,
 				  STATX_ATTR_IMMUTABLE |
 				  STATX_ATTR_NODUMP);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 
 	if (gfs2_holder_initialized(&gh))
 		gfs2_glock_dq_uninit(&gh);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index ffa137f8234e..642e067d8fe8 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -286,7 +286,7 @@ int hfsplus_getattr(const struct path *path, struct kstat *stat,
 	stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE |
 				 STATX_ATTR_NODUMP;
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 7e44052b42e1..032d3d7546d8 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -193,7 +193,7 @@ int kernfs_iop_getattr(const struct path *path, struct kstat *stat,
 	kernfs_refresh_inode(kn, inode);
 	mutex_unlock(&kernfs_mutex);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/libfs.c b/fs/libfs.c
index a73fe109403c..508e9ea8e6f3 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -31,7 +31,7 @@ int simple_getattr(const struct path *path, struct kstat *stat,
 		   u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
 	return 0;
 }
@@ -1304,7 +1304,7 @@ static int empty_dir_getattr(const struct path *path, struct kstat *stat,
 			     u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 34f546404aa1..91c81d2fc90d 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -658,7 +658,7 @@ int minix_getattr(const struct path *path, struct kstat *stat,
 	struct super_block *sb = path->dentry->d_sb;
 	struct inode *inode = d_inode(path->dentry);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	if (INODE_VERSION(inode) == MINIX_V1)
 		stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb);
 	else
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 522aa10a1a3e..cab123ec1664 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -857,7 +857,7 @@ out_no_revalidate:
 	/* Only return attributes that were revalidated. */
 	stat->result_mask &= request_mask;
 out_no_update:
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
 	if (S_ISDIR(inode->i_mode))
 		stat->blksize = NFS_SERVER(inode)->dtsize;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 2bcbe38afe2e..55fc711e368b 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -213,7 +213,7 @@ nfs_namespace_getattr(const struct path *path, struct kstat *stat,
 {
 	if (NFS_FH(d_inode(path->dentry))->size != 0)
 		return nfs_getattr(path, stat, request_mask, query_flags);
-	generic_fillattr(d_inode(path->dentry), stat);
+	generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
 	return 0;
 }
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index cabf355b148f..a070d4c9b6ed 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1313,7 +1313,7 @@ int ocfs2_getattr(const struct path *path, struct kstat *stat,
 		goto bail;
 	}
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	/*
 	 * If there is inline data in the inode, the inode will normally not
 	 * have data blocks allocated (it may have an external xattr block).
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 563fe9ab8eb2..b94032f77e61 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -903,7 +903,7 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
 	ret = orangefs_inode_getattr(inode,
 	    request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0);
 	if (ret == 0) {
-		generic_fillattr(inode, stat);
+		generic_fillattr(&init_user_ns, inode, stat);
 
 		/* override block size reported to stat */
 		if (!(request_mask & STATX_SIZE))
diff --git a/fs/proc/base.c b/fs/proc/base.c
index bb4e63a3684f..d45aa68c1f17 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1934,7 +1934,7 @@ int pid_getattr(const struct path *path, struct kstat *stat,
 	struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
 	struct task_struct *task;
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 
 	stat->uid = GLOBAL_ROOT_UID;
 	stat->gid = GLOBAL_ROOT_GID;
@@ -3803,7 +3803,7 @@ static int proc_task_getattr(const struct path *path, struct kstat *stat,
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct task_struct *p = get_proc_task(inode);
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 
 	if (p) {
 		stat->nlink += get_nr_threads(p);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 6d4fabab8aa7..0db96a761149 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -145,7 +145,7 @@ static int proc_getattr(const struct path *path, struct kstat *stat,
 		}
 	}
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 18601042af99..4aef49ccf571 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -297,7 +297,7 @@ static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
 
 	net = get_proc_task_net(inode);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 
 	if (net != NULL) {
 		stat->nlink = net->proc_net->nlink;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ec67dbc1f705..87c828348140 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -840,7 +840,7 @@ static int proc_sys_getattr(const struct path *path, struct kstat *stat,
 	if (IS_ERR(head))
 		return PTR_ERR(head);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	if (table)
 		stat->mode = (stat->mode & S_IFMT) | table->mode;
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5e444d4f9717..244e4b6f15ef 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -311,7 +311,7 @@ void __init proc_root_init(void)
 static int proc_root_getattr(const struct path *path, struct kstat *stat,
 			     u32 request_mask, unsigned int query_flags)
 {
-	generic_fillattr(d_inode(path->dentry), stat);
+	generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
 	stat->nlink = proc_root.nlink + nr_processes();
 	return 0;
 }
diff --git a/fs/stat.c b/fs/stat.c
index dacecdda2e79..2c471c2fd766 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -26,21 +26,29 @@
 
 /**
  * generic_fillattr - Fill in the basic attributes from the inode struct
- * @inode: Inode to use as the source
- * @stat: Where to fill in the attributes
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @inode:	Inode to use as the source
+ * @stat:	Where to fill in the attributes
  *
  * Fill in the basic attributes in the kstat structure from data that's to be
  * found on the VFS inode structure.  This is the default if no getattr inode
  * operation is supplied.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before filling in the
+ * uid and gid filds. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
  */
-void generic_fillattr(struct inode *inode, struct kstat *stat)
+void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode,
+		      struct kstat *stat)
 {
 	stat->dev = inode->i_sb->s_dev;
 	stat->ino = inode->i_ino;
 	stat->mode = inode->i_mode;
 	stat->nlink = inode->i_nlink;
-	stat->uid = inode->i_uid;
-	stat->gid = inode->i_gid;
+	stat->uid = i_uid_into_mnt(mnt_userns, inode);
+	stat->gid = i_gid_into_mnt(mnt_userns, inode);
 	stat->rdev = inode->i_rdev;
 	stat->size = i_size_read(inode);
 	stat->atime = inode->i_atime;
@@ -87,7 +95,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
 		return inode->i_op->getattr(path, stat, request_mask,
 					    query_flags);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(mnt_user_ns(path->mnt), inode, stat);
 	return 0;
 }
 EXPORT_SYMBOL(vfs_getattr_nosec);
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index bcb67b0cabe7..83cffab6955f 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -445,7 +445,7 @@ int sysv_getattr(const struct path *path, struct kstat *stat,
 		 u32 request_mask, unsigned int flags)
 {
 	struct super_block *s = path->dentry->d_sb;
-	generic_fillattr(d_inode(path->dentry), stat);
+	generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
 	stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
 	stat->blksize = s->s_blocksize;
 	return 0;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 694e7714545b..a8881ed61620 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1589,7 +1589,7 @@ int ubifs_getattr(const struct path *path, struct kstat *stat,
 				STATX_ATTR_ENCRYPTED |
 				STATX_ATTR_IMMUTABLE);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->blksize = UBIFS_BLOCK_SIZE;
 	stat->size = ui->ui_size;
 
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index c973db239604..54a44d1f023c 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -159,7 +159,7 @@ static int udf_symlink_getattr(const struct path *path, struct kstat *stat,
 	struct inode *inode = d_backing_inode(dentry);
 	struct page *page;
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	page = read_mapping_page(inode->i_mapping, 0, NULL);
 	if (IS_ERR(page))
 		return PTR_ERR(page);
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 018057546067..d2cd1c99f48e 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -233,7 +233,7 @@ int vboxsf_getattr(const struct path *path, struct kstat *kstat,
 	if (err)
 		return err;
 
-	generic_fillattr(d_inode(dentry), kstat);
+	generic_fillattr(&init_user_ns, d_inode(dentry), kstat);
 	return 0;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e3ea1d7c3367..182641d8322f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3154,7 +3154,7 @@ extern int __page_symlink(struct inode *inode, const char *symname, int len,
 extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
 extern void kfree_link(void *);
-extern void generic_fillattr(struct inode *, struct kstat *);
+void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *);
 extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
 extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
 void __inode_add_bytes(struct inode *inode, loff_t bytes);
diff --git a/mm/shmem.c b/mm/shmem.c
index 23b8e9c15a42..339d5530d3a9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1072,7 +1072,7 @@ static int shmem_getattr(const struct path *path, struct kstat *stat,
 		shmem_recalc_inode(inode);
 		spin_unlock_irq(&info->lock);
 	}
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 
 	if (is_huge_enabled(sb_info))
 		stat->blksize = HPAGE_PMD_SIZE;
-- 
cgit v1.2.3


From ba73d98745be1c10dc3cce68e8d7b95012d07d05 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:31 +0100
Subject: namei: handle idmapped mounts in may_*() helpers

The may_follow_link(), may_linkat(), may_lookup(), may_open(),
may_o_create(), may_create_in_sticky(), may_delete(), and may_create()
helpers determine whether the caller is privileged enough to perform the
associated operations. Let them handle idmapped mounts by mapping the
inode or fsids according to the mount's user namespace. Afterwards the
checks are identical to non-idmapped inodes. The patch takes care to
retrieve the mount's user namespace right before performing permission
checks and passing it down into the fileystem so the user namespace
can't change in between by someone idmapping a mount that is currently
not idmapped. If the initial user namespace is passed nothing changes so
non-idmapped mounts will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-13-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/btrfs/ioctl.c   |   5 +-
 fs/init.c          |   2 +-
 fs/inode.c         |   2 +-
 fs/internal.h      |   2 +-
 fs/namei.c         | 148 +++++++++++++++++++++++++++++++++--------------------
 fs/xattr.c         |   2 +-
 include/linux/fs.h |  14 +++--
 7 files changed, 108 insertions(+), 67 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1f763c60415b..56f53d692fa2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -927,8 +927,9 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 		return error;
 	if (IS_APPEND(dir))
 		return -EPERM;
-	if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) ||
-	    IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim)))
+	if (check_sticky(&init_user_ns, dir, d_inode(victim)) ||
+	    IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
+	    IS_SWAPFILE(d_inode(victim)))
 		return -EPERM;
 	if (isdir) {
 		if (!d_is_dir(victim))
diff --git a/fs/init.c b/fs/init.c
index 02723bea8499..891284f8a443 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -181,7 +181,7 @@ int __init init_link(const char *oldname, const char *newname)
 	error = -EXDEV;
 	if (old_path.mnt != new_path.mnt)
 		goto out_dput;
-	error = may_linkat(&old_path);
+	error = may_linkat(&init_user_ns, &old_path);
 	if (unlikely(error))
 		goto out_dput;
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
diff --git a/fs/inode.c b/fs/inode.c
index 49b512592dcd..46116ef44c9f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1796,7 +1796,7 @@ bool atime_needs_update(const struct path *path, struct inode *inode)
 	/* Atime updates will likely cause i_uid and i_gid to be written
 	 * back improprely if their true value is unknown to the vfs.
 	 */
-	if (HAS_UNMAPPED_ID(inode))
+	if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode))
 		return false;
 
 	if (IS_NOATIME(inode))
diff --git a/fs/internal.h b/fs/internal.h
index 77c50befbfbe..6c8a4eddc7e6 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,7 +73,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct path *);
 long do_rmdir(int dfd, struct filename *name);
 long do_unlinkat(int dfd, struct filename *name);
-int may_linkat(struct path *link);
+int may_linkat(struct user_namespace *mnt_userns, struct path *link);
 int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
 		 struct filename *newname, unsigned int flags);
 
diff --git a/fs/namei.c b/fs/namei.c
index 04b001ddade3..93fa7d803fb2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -506,7 +506,7 @@ int inode_permission(struct user_namespace *mnt_userns,
 		 * written back improperly if their true value is unknown
 		 * to the vfs.
 		 */
-		if (HAS_UNMAPPED_ID(inode))
+		if (HAS_UNMAPPED_ID(mnt_userns, inode))
 			return -EACCES;
 	}
 
@@ -1004,11 +1004,16 @@ int sysctl_protected_regular __read_mostly;
  */
 static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
 {
+	struct user_namespace *mnt_userns;
+	kuid_t i_uid;
+
 	if (!sysctl_protected_symlinks)
 		return 0;
 
+	mnt_userns = mnt_user_ns(nd->path.mnt);
+	i_uid = i_uid_into_mnt(mnt_userns, inode);
 	/* Allowed if owner and follower match. */
-	if (uid_eq(current_cred()->fsuid, inode->i_uid))
+	if (uid_eq(current_cred()->fsuid, i_uid))
 		return 0;
 
 	/* Allowed if parent directory not sticky and world-writable. */
@@ -1016,7 +1021,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod
 		return 0;
 
 	/* Allowed if parent directory and link owner match. */
-	if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid))
+	if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
 		return 0;
 
 	if (nd->flags & LOOKUP_RCU)
@@ -1029,6 +1034,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod
 
 /**
  * safe_hardlink_source - Check for safe hardlink conditions
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode: the source inode to hardlink from
  *
  * Return false if at least one of the following conditions:
@@ -1039,7 +1045,8 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod
  *
  * Otherwise returns true.
  */
-static bool safe_hardlink_source(struct inode *inode)
+static bool safe_hardlink_source(struct user_namespace *mnt_userns,
+				 struct inode *inode)
 {
 	umode_t mode = inode->i_mode;
 
@@ -1056,7 +1063,7 @@ static bool safe_hardlink_source(struct inode *inode)
 		return false;
 
 	/* Hardlinking to unreadable or unwritable sources is dangerous. */
-	if (inode_permission(&init_user_ns, inode, MAY_READ | MAY_WRITE))
+	if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE))
 		return false;
 
 	return true;
@@ -1064,6 +1071,7 @@ static bool safe_hardlink_source(struct inode *inode)
 
 /**
  * may_linkat - Check permissions for creating a hardlink
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @link: the source to hardlink from
  *
  * Block hardlink when all of:
@@ -1072,14 +1080,21 @@ static bool safe_hardlink_source(struct inode *inode)
  *  - hardlink source is unsafe (see safe_hardlink_source() above)
  *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ *
  * Returns 0 if successful, -ve on error.
  */
-int may_linkat(struct path *link)
+int may_linkat(struct user_namespace *mnt_userns, struct path *link)
 {
 	struct inode *inode = link->dentry->d_inode;
 
 	/* Inode writeback is not safe when the uid or gid are invalid. */
-	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+	if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
+	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
 		return -EOVERFLOW;
 
 	if (!sysctl_protected_hardlinks)
@@ -1088,8 +1103,8 @@ int may_linkat(struct path *link)
 	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
 	 * otherwise, it must be a safe source.
 	 */
-	if (safe_hardlink_source(inode) ||
-	    inode_owner_or_capable(&init_user_ns, inode))
+	if (safe_hardlink_source(mnt_userns, inode) ||
+	    inode_owner_or_capable(mnt_userns, inode))
 		return 0;
 
 	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
@@ -1100,6 +1115,7 @@ int may_linkat(struct path *link)
  * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
  *			  should be allowed, or not, on files that already
  *			  exist.
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dir_mode: mode bits of directory
  * @dir_uid: owner of directory
  * @inode: the inode of the file to open
@@ -1115,16 +1131,25 @@ int may_linkat(struct path *link)
  * the directory doesn't have to be world writable: being group writable will
  * be enough.
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ *
  * Returns 0 if the open is allowed, -ve on error.
  */
-static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
-				struct inode * const inode)
+static int may_create_in_sticky(struct user_namespace *mnt_userns,
+				struct nameidata *nd, struct inode *const inode)
 {
+	umode_t dir_mode = nd->dir_mode;
+	kuid_t dir_uid = nd->dir_uid;
+
 	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
 	    (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
 	    likely(!(dir_mode & S_ISVTX)) ||
-	    uid_eq(inode->i_uid, dir_uid) ||
-	    uid_eq(current_fsuid(), inode->i_uid))
+	    uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
+	    uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
 		return 0;
 
 	if (likely(dir_mode & 0002) ||
@@ -1614,17 +1639,18 @@ static struct dentry *lookup_slow(const struct qstr *name,
 	return res;
 }
 
-static inline int may_lookup(struct nameidata *nd)
+static inline int may_lookup(struct user_namespace *mnt_userns,
+			     struct nameidata *nd)
 {
 	if (nd->flags & LOOKUP_RCU) {
-		int err = inode_permission(&init_user_ns, nd->inode,
+		int err = inode_permission(mnt_userns, nd->inode,
 					   MAY_EXEC | MAY_NOT_BLOCK);
 		if (err != -ECHILD)
 			return err;
 		if (unlazy_walk(nd))
 			return -ECHILD;
 	}
-	return inode_permission(&init_user_ns, nd->inode, MAY_EXEC);
+	return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
 }
 
 static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
@@ -2177,7 +2203,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		u64 hash_len;
 		int type;
 
-		err = may_lookup(nd);
+		err = may_lookup(&init_user_ns, nd);
 		if (err)
 			return err;
 
@@ -2225,7 +2251,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 OK:
 			/* pathname or trailing symlink, done */
 			if (!depth) {
-				nd->dir_uid = nd->inode->i_uid;
+				nd->dir_uid = i_uid_into_mnt(&init_user_ns, nd->inode);
 				nd->dir_mode = nd->inode->i_mode;
 				nd->flags &= ~LOOKUP_PARENT;
 				return 0;
@@ -2703,15 +2729,16 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 }
 EXPORT_SYMBOL(user_path_at_empty);
 
-int __check_sticky(struct inode *dir, struct inode *inode)
+int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
+		   struct inode *inode)
 {
 	kuid_t fsuid = current_fsuid();
 
-	if (uid_eq(inode->i_uid, fsuid))
+	if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
 		return 0;
-	if (uid_eq(dir->i_uid, fsuid))
+	if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
 		return 0;
-	return !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FOWNER);
+	return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
 }
 EXPORT_SYMBOL(__check_sticky);
 
@@ -2735,7 +2762,8 @@ EXPORT_SYMBOL(__check_sticky);
  * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
  *     nfs_async_unlink().
  */
-static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
+static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *victim, bool isdir)
 {
 	struct inode *inode = d_backing_inode(victim);
 	int error;
@@ -2747,19 +2775,21 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 	BUG_ON(victim->d_parent->d_inode != dir);
 
 	/* Inode writeback is not safe when the uid or gid are invalid. */
-	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+	if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
+	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
 		return -EOVERFLOW;
 
 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
-	error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 	if (IS_APPEND(dir))
 		return -EPERM;
 
-	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
-	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
+	if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) ||
+	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
+	    HAS_UNMAPPED_ID(mnt_userns, inode))
 		return -EPERM;
 	if (isdir) {
 		if (!d_is_dir(victim))
@@ -2784,7 +2814,8 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
  *  4. We should have write and exec permissions on dir
  *  5. We can't do it if dir is immutable (done in permission())
  */
-static inline int may_create(struct inode *dir, struct dentry *child)
+static inline int may_create(struct user_namespace *mnt_userns,
+			     struct inode *dir, struct dentry *child)
 {
 	struct user_namespace *s_user_ns;
 	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
@@ -2793,10 +2824,10 @@ static inline int may_create(struct inode *dir, struct dentry *child)
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
 	s_user_ns = dir->i_sb->s_user_ns;
-	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
-	    !kgid_has_mapping(s_user_ns, current_fsgid()))
+	if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
+	    !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
 		return -EOVERFLOW;
-	return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+	return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 }
 
 /*
@@ -2846,7 +2877,7 @@ EXPORT_SYMBOL(unlock_rename);
 int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		bool want_excl)
 {
-	int error = may_create(dir, dentry);
+	int error = may_create(&init_user_ns, dir, dentry);
 	if (error)
 		return error;
 
@@ -2869,7 +2900,7 @@ int vfs_mkobj(struct dentry *dentry, umode_t mode,
 		void *arg)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
-	int error = may_create(dir, dentry);
+	int error = may_create(&init_user_ns, dir, dentry);
 	if (error)
 		return error;
 
@@ -2891,7 +2922,8 @@ bool may_open_dev(const struct path *path)
 		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
 }
 
-static int may_open(const struct path *path, int acc_mode, int flag)
+static int may_open(struct user_namespace *mnt_userns, const struct path *path,
+		    int acc_mode, int flag)
 {
 	struct dentry *dentry = path->dentry;
 	struct inode *inode = dentry->d_inode;
@@ -2926,7 +2958,7 @@ static int may_open(const struct path *path, int acc_mode, int flag)
 		break;
 	}
 
-	error = inode_permission(&init_user_ns, inode, MAY_OPEN | acc_mode);
+	error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode);
 	if (error)
 		return error;
 
@@ -2941,7 +2973,7 @@ static int may_open(const struct path *path, int acc_mode, int flag)
 	}
 
 	/* O_NOATIME can only be set by the owner or superuser */
-	if (flag & O_NOATIME && !inode_owner_or_capable(&init_user_ns, inode))
+	if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode))
 		return -EPERM;
 
 	return 0;
@@ -2976,7 +3008,9 @@ static inline int open_to_namei_flags(int flag)
 	return flag;
 }
 
-static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
+static int may_o_create(struct user_namespace *mnt_userns,
+			const struct path *dir, struct dentry *dentry,
+			umode_t mode)
 {
 	struct user_namespace *s_user_ns;
 	int error = security_path_mknod(dir, dentry, mode, 0);
@@ -2984,11 +3018,11 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m
 		return error;
 
 	s_user_ns = dir->dentry->d_sb->s_user_ns;
-	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
-	    !kgid_has_mapping(s_user_ns, current_fsgid()))
+	if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
+	    !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
 		return -EOVERFLOW;
 
-	error = inode_permission(&init_user_ns, dir->dentry->d_inode,
+	error = inode_permission(mnt_userns, dir->dentry->d_inode,
 				 MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
@@ -3121,7 +3155,8 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 		if (!IS_POSIXACL(dir->d_inode))
 			mode &= ~current_umask();
 		if (likely(got_write))
-			create_error = may_o_create(&nd->path, dentry, mode);
+			create_error = may_o_create(&init_user_ns, &nd->path,
+						    dentry, mode);
 		else
 			create_error = -EROFS;
 	}
@@ -3282,7 +3317,7 @@ static int do_open(struct nameidata *nd,
 			return -EEXIST;
 		if (d_is_dir(nd->path.dentry))
 			return -EISDIR;
-		error = may_create_in_sticky(nd->dir_mode, nd->dir_uid,
+		error = may_create_in_sticky(&init_user_ns, nd,
 					     d_backing_inode(nd->path.dentry));
 		if (unlikely(error))
 			return error;
@@ -3302,7 +3337,7 @@ static int do_open(struct nameidata *nd,
 			return error;
 		do_truncate = true;
 	}
-	error = may_open(&nd->path, acc_mode, open_flag);
+	error = may_open(&init_user_ns, &nd->path, acc_mode, open_flag);
 	if (!error && !(file->f_mode & FMODE_OPENED))
 		error = vfs_open(&nd->path, file);
 	if (!error)
@@ -3377,7 +3412,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	path.dentry = child;
 	audit_inode(nd->name, child, 0);
 	/* Don't check for other permissions, the inode was just created */
-	error = may_open(&path, 0, op->open_flag);
+	error = may_open(&init_user_ns, &path, 0, op->open_flag);
 	if (error)
 		goto out2;
 	file->f_path.mnt = path.mnt;
@@ -3584,7 +3619,7 @@ EXPORT_SYMBOL(user_path_create);
 int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
-	int error = may_create(dir, dentry);
+	int error = may_create(&init_user_ns, dir, dentry);
 
 	if (error)
 		return error;
@@ -3685,7 +3720,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
 
 int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-	int error = may_create(dir, dentry);
+	int error = may_create(&init_user_ns, dir, dentry);
 	unsigned max_links = dir->i_sb->s_max_links;
 
 	if (error)
@@ -3746,7 +3781,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
 
 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	int error = may_delete(dir, dentry, 1);
+	int error = may_delete(&init_user_ns, dir, dentry, 1);
 
 	if (error)
 		return error;
@@ -3868,7 +3903,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
 int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
 {
 	struct inode *target = dentry->d_inode;
-	int error = may_delete(dir, dentry, 0);
+	int error = may_delete(&init_user_ns, dir, dentry, 0);
 
 	if (error)
 		return error;
@@ -4000,7 +4035,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
 
 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 {
-	int error = may_create(dir, dentry);
+	int error = may_create(&init_user_ns, dir, dentry);
 
 	if (error)
 		return error;
@@ -4089,7 +4124,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	if (!inode)
 		return -ENOENT;
 
-	error = may_create(dir, new_dentry);
+	error = may_create(&init_user_ns, dir, new_dentry);
 	if (error)
 		return error;
 
@@ -4106,7 +4141,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	 * be writen back improperly if their true value is unknown to
 	 * the vfs.
 	 */
-	if (HAS_UNMAPPED_ID(inode))
+	if (HAS_UNMAPPED_ID(&init_user_ns, inode))
 		return -EPERM;
 	if (!dir->i_op->link)
 		return -EPERM;
@@ -4188,7 +4223,7 @@ retry:
 	error = -EXDEV;
 	if (old_path.mnt != new_path.mnt)
 		goto out_dput;
-	error = may_linkat(&old_path);
+	error = may_linkat(&init_user_ns, &old_path);
 	if (unlikely(error))
 		goto out_dput;
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
@@ -4281,6 +4316,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	       struct inode **delegated_inode, unsigned int flags)
 {
 	int error;
+	struct user_namespace *mnt_userns = &init_user_ns;
 	bool is_dir = d_is_dir(old_dentry);
 	struct inode *source = old_dentry->d_inode;
 	struct inode *target = new_dentry->d_inode;
@@ -4291,19 +4327,19 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (source == target)
 		return 0;
 
-	error = may_delete(old_dir, old_dentry, is_dir);
+	error = may_delete(mnt_userns, old_dir, old_dentry, is_dir);
 	if (error)
 		return error;
 
 	if (!target) {
-		error = may_create(new_dir, new_dentry);
+		error = may_create(mnt_userns, new_dir, new_dentry);
 	} else {
 		new_is_dir = d_is_dir(new_dentry);
 
 		if (!(flags & RENAME_EXCHANGE))
-			error = may_delete(new_dir, new_dentry, is_dir);
+			error = may_delete(mnt_userns, new_dir, new_dentry, is_dir);
 		else
-			error = may_delete(new_dir, new_dentry, new_is_dir);
+			error = may_delete(mnt_userns, new_dir, new_dentry, new_is_dir);
 	}
 	if (error)
 		return error;
diff --git a/fs/xattr.c b/fs/xattr.c
index 79c439d31eaa..b3444e06cded 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -98,7 +98,7 @@ xattr_permission(struct user_namespace *mnt_userns, struct inode *inode,
 		 * to be writen back improperly if their true value is
 		 * unknown to the vfs.
 		 */
-		if (HAS_UNMAPPED_ID(inode))
+		if (HAS_UNMAPPED_ID(mnt_userns, inode))
 			return -EPERM;
 	}
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 182641d8322f..a27884af7222 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2083,9 +2083,11 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
 #define IS_WHITEOUT(inode)	(S_ISCHR(inode->i_mode) && \
 				 (inode)->i_rdev == WHITEOUT_DEV)
 
-static inline bool HAS_UNMAPPED_ID(struct inode *inode)
+static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns,
+				   struct inode *inode)
 {
-	return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid);
+	return !uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
+	       !gid_valid(i_gid_into_mnt(mnt_userns, inode));
 }
 
 static inline enum rw_hint file_write_hint(struct file *file)
@@ -2823,7 +2825,8 @@ static inline int path_permission(const struct path *path, int mask)
 	return inode_permission(mnt_user_ns(path->mnt),
 				d_inode(path->dentry), mask);
 }
-extern int __check_sticky(struct inode *dir, struct inode *inode);
+int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
+		   struct inode *inode);
 
 static inline bool execute_ok(struct inode *inode)
 {
@@ -3442,12 +3445,13 @@ static inline bool is_sxid(umode_t mode)
 	return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP));
 }
 
-static inline int check_sticky(struct inode *dir, struct inode *inode)
+static inline int check_sticky(struct user_namespace *mnt_userns,
+			       struct inode *dir, struct inode *inode)
 {
 	if (!(dir->i_mode & S_ISVTX))
 		return 0;
 
-	return __check_sticky(dir, inode);
+	return __check_sticky(mnt_userns, dir, inode);
 }
 
 static inline void inode_has_no_xattr(struct inode *inode)
-- 
cgit v1.2.3


From 9fe61450972d3900bffb1dc26a17ebb9cdd92db2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:32 +0100
Subject: namei: introduce struct renamedata

In order to handle idmapped mounts we will extend the vfs rename helper
to take two new arguments in follow up patches. Since this operations
already takes a bunch of arguments add a simple struct renamedata and
make the current helper use it before we extend it.

Link: https://lore.kernel.org/r/20210121131959.646623-14-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/cachefiles/namei.c    |  9 +++++++--
 fs/ecryptfs/inode.c      | 10 +++++++---
 fs/namei.c               | 21 +++++++++++++++------
 fs/nfsd/vfs.c            |  8 +++++++-
 fs/overlayfs/overlayfs.h |  9 ++++++++-
 include/linux/fs.h       | 12 +++++++++++-
 6 files changed, 55 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ecc8ecbbfa5a..7b987de0babe 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -412,9 +412,14 @@ try_again:
 	if (ret < 0) {
 		cachefiles_io_error(cache, "Rename security error %d", ret);
 	} else {
+		struct renamedata rd = {
+			.old_dir	= d_inode(dir),
+			.old_dentry	= rep,
+			.new_dir	= d_inode(cache->graveyard),
+			.new_dentry	= grave,
+		};
 		trace_cachefiles_rename(object, rep, grave, why);
-		ret = vfs_rename(d_inode(dir), rep,
-				 d_inode(cache->graveyard), grave, NULL, 0);
+		ret = vfs_rename(&rd);
 		if (ret != 0 && ret != -ENOMEM)
 			cachefiles_io_error(cache,
 					    "Rename failed with error %d", ret);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 385b5e8741c0..ff48abb09679 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -590,6 +590,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *lower_new_dir_dentry;
 	struct dentry *trap;
 	struct inode *target_inode;
+	struct renamedata rd = {};
 
 	if (flags)
 		return -EINVAL;
@@ -619,9 +620,12 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		rc = -ENOTEMPTY;
 		goto out_lock;
 	}
-	rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry,
-			d_inode(lower_new_dir_dentry), lower_new_dentry,
-			NULL, 0);
+
+	rd.old_dir	= d_inode(lower_old_dir_dentry);
+	rd.old_dentry	= lower_old_dentry;
+	rd.new_dir	= d_inode(lower_new_dir_dentry);
+	rd.new_dentry	= lower_new_dentry;
+	rc = vfs_rename(&rd);
 	if (rc)
 		goto out_lock;
 	if (target_inode)
diff --git a/fs/namei.c b/fs/namei.c
index 93fa7d803fb2..38ab51881247 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4311,12 +4311,15 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *	   ->i_mutex on parents, which works but leads to some truly excessive
  *	   locking].
  */
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-	       struct inode *new_dir, struct dentry *new_dentry,
-	       struct inode **delegated_inode, unsigned int flags)
+int vfs_rename(struct renamedata *rd)
 {
 	int error;
 	struct user_namespace *mnt_userns = &init_user_ns;
+	struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
+	struct dentry *old_dentry = rd->old_dentry;
+	struct dentry *new_dentry = rd->new_dentry;
+	struct inode **delegated_inode = rd->delegated_inode;
+	unsigned int flags = rd->flags;
 	bool is_dir = d_is_dir(old_dentry);
 	struct inode *source = old_dentry->d_inode;
 	struct inode *target = new_dentry->d_inode;
@@ -4442,6 +4445,7 @@ EXPORT_SYMBOL(vfs_rename);
 int do_renameat2(int olddfd, struct filename *from, int newdfd,
 		 struct filename *to, unsigned int flags)
 {
+	struct renamedata rd;
 	struct dentry *old_dentry, *new_dentry;
 	struct dentry *trap;
 	struct path old_path, new_path;
@@ -4545,9 +4549,14 @@ retry_deleg:
 				     &new_path, new_dentry, flags);
 	if (error)
 		goto exit5;
-	error = vfs_rename(old_path.dentry->d_inode, old_dentry,
-			   new_path.dentry->d_inode, new_dentry,
-			   &delegated_inode, flags);
+
+	rd.old_dir	   = old_path.dentry->d_inode;
+	rd.old_dentry	   = old_dentry;
+	rd.new_dir	   = new_path.dentry->d_inode;
+	rd.new_dentry	   = new_dentry;
+	rd.delegated_inode = &delegated_inode;
+	rd.flags	   = flags;
+	error = vfs_rename(&rd);
 exit5:
 	dput(new_dentry);
 exit4:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 37d85046b4d6..f7d83ff2b44e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1798,7 +1798,13 @@ retry:
 		close_cached = true;
 		goto out_dput_old;
 	} else {
-		host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
+		struct renamedata rd = {
+			.old_dir	= fdir,
+			.old_dentry	= odentry,
+			.new_dir	= tdir,
+			.new_dentry	= ndentry,
+		};
+		host_err = vfs_rename(&rd);
 		if (!host_err) {
 			host_err = commit_metadata(tfhp);
 			if (!host_err)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0002834f664a..426899681df7 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -214,9 +214,16 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
 				unsigned int flags)
 {
 	int err;
+	struct renamedata rd = {
+		.old_dir 	= olddir,
+		.old_dentry 	= olddentry,
+		.new_dir 	= newdir,
+		.new_dentry 	= newdentry,
+		.flags 		= flags,
+	};
 
 	pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
-	err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
+	err = vfs_rename(&rd);
 	if (err) {
 		pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
 			 olddentry, newdentry, err);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a27884af7222..430e457f67f1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1775,7 +1775,17 @@ extern int vfs_symlink(struct inode *, struct dentry *, const char *);
 extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
 extern int vfs_rmdir(struct inode *, struct dentry *);
 extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
+
+struct renamedata {
+	struct inode *old_dir;
+	struct dentry *old_dentry;
+	struct inode *new_dir;
+	struct dentry *new_dentry;
+	struct inode **delegated_inode;
+	unsigned int flags;
+} __randomize_layout;
+
+int vfs_rename(struct renamedata *);
 
 static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry)
 {
-- 
cgit v1.2.3


From 6521f8917082928a4cb637eb64b77b5f2f5b30fc Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:33 +0100
Subject: namei: prepare for idmapped mounts

The various vfs_*() helpers are called by filesystems or by the vfs
itself to perform core operations such as create, link, mkdir, mknod, rename,
rmdir, tmpfile and unlink. Enable them to handle idmapped mounts. If the
inode is accessed through an idmapped mount map it into the
mount's user namespace and pass it down. Afterwards the checks and
operations are identical to non-idmapped mounts. If the initial user
namespace is passed nothing changes so non-idmapped mounts will see
identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-15-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 drivers/base/devtmpfs.c  |  11 ++-
 fs/cachefiles/namei.c    |  12 ++-
 fs/ecryptfs/inode.c      |  33 ++++---
 fs/init.c                |  14 +--
 fs/namei.c               | 227 ++++++++++++++++++++++++++++++++++++++---------
 fs/nfsd/nfs4recover.c    |   6 +-
 fs/nfsd/vfs.c            |  19 ++--
 fs/overlayfs/dir.c       |   4 +-
 fs/overlayfs/overlayfs.h |  20 +++--
 include/linux/fs.h       |  32 ++++---
 ipc/mqueue.c             |   3 +-
 net/unix/af_unix.c       |   3 +-
 12 files changed, 279 insertions(+), 105 deletions(-)

(limited to 'include')

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 2e0c3cdb4184..653c8c6ac7a7 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -162,7 +162,7 @@ static int dev_mkdir(const char *name, umode_t mode)
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	err = vfs_mkdir(d_inode(path.dentry), dentry, mode);
+	err = vfs_mkdir(&init_user_ns, d_inode(path.dentry), dentry, mode);
 	if (!err)
 		/* mark as kernel-created inode */
 		d_inode(dentry)->i_private = &thread;
@@ -212,7 +212,8 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid,
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	err = vfs_mknod(d_inode(path.dentry), dentry, mode, dev->devt);
+	err = vfs_mknod(&init_user_ns, d_inode(path.dentry), dentry, mode,
+			dev->devt);
 	if (!err) {
 		struct iattr newattrs;
 
@@ -242,7 +243,8 @@ static int dev_rmdir(const char *name)
 		return PTR_ERR(dentry);
 	if (d_really_is_positive(dentry)) {
 		if (d_inode(dentry)->i_private == &thread)
-			err = vfs_rmdir(d_inode(parent.dentry), dentry);
+			err = vfs_rmdir(&init_user_ns, d_inode(parent.dentry),
+					dentry);
 		else
 			err = -EPERM;
 	} else {
@@ -330,7 +332,8 @@ static int handle_remove(const char *nodename, struct device *dev)
 			inode_lock(d_inode(dentry));
 			notify_change(&init_user_ns, dentry, &newattrs, NULL);
 			inode_unlock(d_inode(dentry));
-			err = vfs_unlink(d_inode(parent.dentry), dentry, NULL);
+			err = vfs_unlink(&init_user_ns, d_inode(parent.dentry),
+					 dentry, NULL);
 			if (!err || err == -ENOENT)
 				deleted = 1;
 		}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7b987de0babe..7bf0732ae25c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -311,7 +311,8 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 			cachefiles_io_error(cache, "Unlink security error");
 		} else {
 			trace_cachefiles_unlink(object, rep, why);
-			ret = vfs_unlink(d_inode(dir), rep, NULL);
+			ret = vfs_unlink(&init_user_ns, d_inode(dir), rep,
+					 NULL);
 
 			if (preemptive)
 				cachefiles_mark_object_buried(cache, rep, why);
@@ -413,8 +414,10 @@ try_again:
 		cachefiles_io_error(cache, "Rename security error %d", ret);
 	} else {
 		struct renamedata rd = {
+			.old_mnt_userns	= &init_user_ns,
 			.old_dir	= d_inode(dir),
 			.old_dentry	= rep,
+			.new_mnt_userns	= &init_user_ns,
 			.new_dir	= d_inode(cache->graveyard),
 			.new_dentry	= grave,
 		};
@@ -566,7 +569,7 @@ lookup_again:
 			if (ret < 0)
 				goto create_error;
 			start = jiffies;
-			ret = vfs_mkdir(d_inode(dir), next, 0);
+			ret = vfs_mkdir(&init_user_ns, d_inode(dir), next, 0);
 			cachefiles_hist(cachefiles_mkdir_histogram, start);
 			if (!key)
 				trace_cachefiles_mkdir(object, next, ret);
@@ -602,7 +605,8 @@ lookup_again:
 			if (ret < 0)
 				goto create_error;
 			start = jiffies;
-			ret = vfs_create(d_inode(dir), next, S_IFREG, true);
+			ret = vfs_create(&init_user_ns, d_inode(dir), next,
+					 S_IFREG, true);
 			cachefiles_hist(cachefiles_create_histogram, start);
 			trace_cachefiles_create(object, next, ret);
 			if (ret < 0)
@@ -796,7 +800,7 @@ retry:
 		ret = security_path_mkdir(&path, subdir, 0700);
 		if (ret < 0)
 			goto mkdir_error;
-		ret = vfs_mkdir(d_inode(dir), subdir, 0700);
+		ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700);
 		if (ret < 0)
 			goto mkdir_error;
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ff48abb09679..73e3d47e7b2d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -141,7 +141,8 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
 	else if (d_unhashed(lower_dentry))
 		rc = -EINVAL;
 	else
-		rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
+		rc = vfs_unlink(&init_user_ns, lower_dir_inode, lower_dentry,
+				NULL);
 	if (rc) {
 		printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
 		goto out_unlock;
@@ -180,7 +181,8 @@ ecryptfs_do_create(struct inode *directory_inode,
 
 	lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
-	rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true);
+	rc = vfs_create(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+			mode, true);
 	if (rc) {
 		printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
 		       "rc = [%d]\n", __func__, rc);
@@ -190,7 +192,8 @@ ecryptfs_do_create(struct inode *directory_inode,
 	inode = __ecryptfs_get_inode(d_inode(lower_dentry),
 				     directory_inode->i_sb);
 	if (IS_ERR(inode)) {
-		vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL);
+		vfs_unlink(&init_user_ns, d_inode(lower_dir_dentry),
+			   lower_dentry, NULL);
 		goto out_lock;
 	}
 	fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry));
@@ -436,8 +439,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 	dget(lower_old_dentry);
 	dget(lower_new_dentry);
 	lower_dir_dentry = lock_parent(lower_new_dentry);
-	rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry),
-		      lower_new_dentry, NULL);
+	rc = vfs_link(lower_old_dentry, &init_user_ns,
+		      d_inode(lower_dir_dentry), lower_new_dentry, NULL);
 	if (rc || d_really_is_negative(lower_new_dentry))
 		goto out_lock;
 	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
@@ -481,7 +484,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
 						  strlen(symname));
 	if (rc)
 		goto out_lock;
-	rc = vfs_symlink(d_inode(lower_dir_dentry), lower_dentry,
+	rc = vfs_symlink(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
 			 encoded_symname);
 	kfree(encoded_symname);
 	if (rc || d_really_is_negative(lower_dentry))
@@ -507,7 +510,8 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
-	rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode);
+	rc = vfs_mkdir(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+		       mode);
 	if (rc || d_really_is_negative(lower_dentry))
 		goto out;
 	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
@@ -541,7 +545,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 	else if (d_unhashed(lower_dentry))
 		rc = -EINVAL;
 	else
-		rc = vfs_rmdir(lower_dir_inode, lower_dentry);
+		rc = vfs_rmdir(&init_user_ns, lower_dir_inode, lower_dentry);
 	if (!rc) {
 		clear_nlink(d_inode(dentry));
 		fsstack_copy_attr_times(dir, lower_dir_inode);
@@ -563,7 +567,8 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
-	rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev);
+	rc = vfs_mknod(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+		       mode, dev);
 	if (rc || d_really_is_negative(lower_dentry))
 		goto out;
 	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
@@ -621,10 +626,12 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto out_lock;
 	}
 
-	rd.old_dir	= d_inode(lower_old_dir_dentry);
-	rd.old_dentry	= lower_old_dentry;
-	rd.new_dir	= d_inode(lower_new_dir_dentry);
-	rd.new_dentry	= lower_new_dentry;
+	rd.old_mnt_userns	= &init_user_ns;
+	rd.old_dir		= d_inode(lower_old_dir_dentry);
+	rd.old_dentry		= lower_old_dentry;
+	rd.new_mnt_userns	= &init_user_ns;
+	rd.new_dir		= d_inode(lower_new_dir_dentry);
+	rd.new_dentry		= lower_new_dentry;
 	rc = vfs_rename(&rd);
 	if (rc)
 		goto out_lock;
diff --git a/fs/init.c b/fs/init.c
index 891284f8a443..e65452750fa5 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -157,8 +157,8 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
 		mode &= ~current_umask();
 	error = security_path_mknod(&path, dentry, mode, dev);
 	if (!error)
-		error = vfs_mknod(path.dentry->d_inode, dentry, mode,
-				  new_decode_dev(dev));
+		error = vfs_mknod(&init_user_ns, path.dentry->d_inode, dentry,
+				  mode, new_decode_dev(dev));
 	done_path_create(&path, dentry);
 	return error;
 }
@@ -187,8 +187,8 @@ int __init init_link(const char *oldname, const char *newname)
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
 	if (error)
 		goto out_dput;
-	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry,
-			 NULL);
+	error = vfs_link(old_path.dentry, &init_user_ns,
+			 new_path.dentry->d_inode, new_dentry, NULL);
 out_dput:
 	done_path_create(&new_path, new_dentry);
 out:
@@ -207,7 +207,8 @@ int __init init_symlink(const char *oldname, const char *newname)
 		return PTR_ERR(dentry);
 	error = security_path_symlink(&path, dentry, oldname);
 	if (!error)
-		error = vfs_symlink(path.dentry->d_inode, dentry, oldname);
+		error = vfs_symlink(&init_user_ns, path.dentry->d_inode, dentry,
+				    oldname);
 	done_path_create(&path, dentry);
 	return error;
 }
@@ -230,7 +231,8 @@ int __init init_mkdir(const char *pathname, umode_t mode)
 		mode &= ~current_umask();
 	error = security_path_mkdir(&path, dentry, mode);
 	if (!error)
-		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+		error = vfs_mkdir(&init_user_ns, path.dentry->d_inode, dentry,
+				  mode);
 	done_path_create(&path, dentry);
 	return error;
 }
diff --git a/fs/namei.c b/fs/namei.c
index 38ab51881247..5c9f6f8e90c4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2874,10 +2874,26 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 }
 EXPORT_SYMBOL(unlock_rename);
 
-int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool want_excl)
+/**
+ * vfs_create - create new file
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dir:	inode of @dentry
+ * @dentry:	pointer to dentry of the base directory
+ * @mode:	mode of the new file
+ * @want_excl:	whether the file must not yet exist
+ *
+ * Create a new file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+	       struct dentry *dentry, umode_t mode, bool want_excl)
 {
-	int error = may_create(&init_user_ns, dir, dentry);
+	int error = may_create(mnt_userns, dir, dentry);
 	if (error)
 		return error;
 
@@ -3353,7 +3369,23 @@ static int do_open(struct nameidata *nd,
 	return error;
 }
 
-struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
+/**
+ * vfs_tmpfile - create tmpfile
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dentry:	pointer to dentry of the base directory
+ * @mode:	mode of the new tmpfile
+ * @open_flags:	flags
+ *
+ * Create a temporary file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, umode_t mode, int open_flag)
 {
 	struct dentry *child = NULL;
 	struct inode *dir = dentry->d_inode;
@@ -3361,7 +3393,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
 	int error;
 
 	/* we want directory to be writable */
-	error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		goto out_err;
 	error = -EOPNOTSUPP;
@@ -3396,6 +3428,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 		const struct open_flags *op,
 		struct file *file)
 {
+	struct user_namespace *mnt_userns;
 	struct dentry *child;
 	struct path path;
 	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
@@ -3404,7 +3437,8 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	error = mnt_want_write(path.mnt);
 	if (unlikely(error))
 		goto out;
-	child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
+	mnt_userns = mnt_user_ns(path.mnt);
+	child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
 	error = PTR_ERR(child);
 	if (IS_ERR(child))
 		goto out2;
@@ -3616,10 +3650,27 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname,
 }
 EXPORT_SYMBOL(user_path_create);
 
-int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+/**
+ * vfs_mknod - create device node or file
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dir:	inode of @dentry
+ * @dentry:	pointer to dentry of the base directory
+ * @mode:	mode of the new device node or file
+ * @dev:	device number of device to create
+ *
+ * Create a device node or file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	      struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
-	int error = may_create(&init_user_ns, dir, dentry);
+	int error = may_create(mnt_userns, dir, dentry);
 
 	if (error)
 		return error;
@@ -3666,6 +3717,7 @@ static int may_mknod(umode_t mode)
 static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
 		unsigned int dev)
 {
+	struct user_namespace *mnt_userns;
 	struct dentry *dentry;
 	struct path path;
 	int error;
@@ -3684,18 +3736,22 @@ retry:
 	error = security_path_mknod(&path, dentry, mode, dev);
 	if (error)
 		goto out;
+
+	mnt_userns = mnt_user_ns(path.mnt);
 	switch (mode & S_IFMT) {
 		case 0: case S_IFREG:
-			error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+			error = vfs_create(mnt_userns, path.dentry->d_inode,
+					   dentry, mode, true);
 			if (!error)
 				ima_post_path_mknod(dentry);
 			break;
 		case S_IFCHR: case S_IFBLK:
-			error = vfs_mknod(path.dentry->d_inode,dentry,mode,
-					new_decode_dev(dev));
+			error = vfs_mknod(mnt_userns, path.dentry->d_inode,
+					  dentry, mode, new_decode_dev(dev));
 			break;
 		case S_IFIFO: case S_IFSOCK:
-			error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
+			error = vfs_mknod(mnt_userns, path.dentry->d_inode,
+					  dentry, mode, 0);
 			break;
 	}
 out:
@@ -3718,9 +3774,25 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
 	return do_mknodat(AT_FDCWD, filename, mode, dev);
 }
 
-int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+/**
+ * vfs_mkdir - create directory
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dir:	inode of @dentry
+ * @dentry:	pointer to dentry of the base directory
+ * @mode:	mode of the new directory
+ *
+ * Create a directory.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+	      struct dentry *dentry, umode_t mode)
 {
-	int error = may_create(&init_user_ns, dir, dentry);
+	int error = may_create(mnt_userns, dir, dentry);
 	unsigned max_links = dir->i_sb->s_max_links;
 
 	if (error)
@@ -3759,8 +3831,11 @@ retry:
 	if (!IS_POSIXACL(path.dentry->d_inode))
 		mode &= ~current_umask();
 	error = security_path_mkdir(&path, dentry, mode);
-	if (!error)
-		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+	if (!error) {
+		struct user_namespace *mnt_userns;
+		mnt_userns = mnt_user_ns(path.mnt);
+		error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry, mode);
+	}
 	done_path_create(&path, dentry);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -3779,9 +3854,24 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
 	return do_mkdirat(AT_FDCWD, pathname, mode);
 }
 
-int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+/**
+ * vfs_rmdir - remove directory
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dir:	inode of @dentry
+ * @dentry:	pointer to dentry of the base directory
+ *
+ * Remove a directory.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry)
 {
-	int error = may_delete(&init_user_ns, dir, dentry, 1);
+	int error = may_delete(mnt_userns, dir, dentry, 1);
 
 	if (error)
 		return error;
@@ -3821,6 +3911,7 @@ EXPORT_SYMBOL(vfs_rmdir);
 
 long do_rmdir(int dfd, struct filename *name)
 {
+	struct user_namespace *mnt_userns;
 	int error = 0;
 	struct dentry *dentry;
 	struct path path;
@@ -3861,7 +3952,8 @@ retry:
 	error = security_path_rmdir(&path, dentry);
 	if (error)
 		goto exit3;
-	error = vfs_rmdir(path.dentry->d_inode, dentry);
+	mnt_userns = mnt_user_ns(path.mnt);
+	error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
 exit3:
 	dput(dentry);
 exit2:
@@ -3884,6 +3976,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
 
 /**
  * vfs_unlink - unlink a filesystem object
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dir:	parent directory
  * @dentry:	victim
  * @delegated_inode: returns victim inode, if the inode is delegated.
@@ -3899,11 +3992,18 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
  * Alternatively, a caller may pass NULL for delegated_inode.  This may
  * be appropriate for callers that expect the underlying filesystem not
  * to be NFS exported.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
  */
-int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
+	       struct dentry *dentry, struct inode **delegated_inode)
 {
 	struct inode *target = dentry->d_inode;
-	int error = may_delete(&init_user_ns, dir, dentry, 0);
+	int error = may_delete(mnt_userns, dir, dentry, 0);
 
 	if (error)
 		return error;
@@ -3974,6 +4074,8 @@ retry_deleg:
 	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (!IS_ERR(dentry)) {
+		struct user_namespace *mnt_userns;
+
 		/* Why not before? Because we want correct error value */
 		if (last.name[last.len])
 			goto slashes;
@@ -3984,7 +4086,8 @@ retry_deleg:
 		error = security_path_unlink(&path, dentry);
 		if (error)
 			goto exit2;
-		error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
+		mnt_userns = mnt_user_ns(path.mnt);
+		error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry, &delegated_inode);
 exit2:
 		dput(dentry);
 	}
@@ -4033,9 +4136,25 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
 	return do_unlinkat(AT_FDCWD, getname(pathname));
 }
 
-int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+/**
+ * vfs_symlink - create symlink
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dir:	inode of @dentry
+ * @dentry:	pointer to dentry of the base directory
+ * @oldname:	name of the file to link to
+ *
+ * Create a symlink.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		struct dentry *dentry, const char *oldname)
 {
-	int error = may_create(&init_user_ns, dir, dentry);
+	int error = may_create(mnt_userns, dir, dentry);
 
 	if (error)
 		return error;
@@ -4073,8 +4192,13 @@ retry:
 		goto out_putname;
 
 	error = security_path_symlink(&path, dentry, from->name);
-	if (!error)
-		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
+	if (!error) {
+		struct user_namespace *mnt_userns;
+
+		mnt_userns = mnt_user_ns(path.mnt);
+		error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry,
+				    from->name);
+	}
 	done_path_create(&path, dentry);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -4099,6 +4223,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
 /**
  * vfs_link - create a new link
  * @old_dentry:	object to be linked
+ * @mnt_userns:	the user namespace of the mount
  * @dir:	new parent
  * @new_dentry:	where to create the new link
  * @delegated_inode: returns inode needing a delegation break
@@ -4114,8 +4239,16 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
  * Alternatively, a caller may pass NULL for delegated_inode.  This may
  * be appropriate for callers that expect the underlying filesystem not
  * to be NFS exported.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
  */
-int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns,
+	     struct inode *dir, struct dentry *new_dentry,
+	     struct inode **delegated_inode)
 {
 	struct inode *inode = old_dentry->d_inode;
 	unsigned max_links = dir->i_sb->s_max_links;
@@ -4124,7 +4257,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	if (!inode)
 		return -ENOENT;
 
-	error = may_create(&init_user_ns, dir, new_dentry);
+	error = may_create(mnt_userns, dir, new_dentry);
 	if (error)
 		return error;
 
@@ -4141,7 +4274,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	 * be writen back improperly if their true value is unknown to
 	 * the vfs.
 	 */
-	if (HAS_UNMAPPED_ID(&init_user_ns, inode))
+	if (HAS_UNMAPPED_ID(mnt_userns, inode))
 		return -EPERM;
 	if (!dir->i_op->link)
 		return -EPERM;
@@ -4188,6 +4321,7 @@ EXPORT_SYMBOL(vfs_link);
 static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
 	      const char __user *newname, int flags)
 {
+	struct user_namespace *mnt_userns;
 	struct dentry *new_dentry;
 	struct path old_path, new_path;
 	struct inode *delegated_inode = NULL;
@@ -4229,7 +4363,9 @@ retry:
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
 	if (error)
 		goto out_dput;
-	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
+	mnt_userns = mnt_user_ns(new_path.mnt);
+	error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
+			 new_dentry, &delegated_inode);
 out_dput:
 	done_path_create(&new_path, new_dentry);
 	if (delegated_inode) {
@@ -4263,12 +4399,14 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
 
 /**
  * vfs_rename - rename a filesystem object
- * @old_dir:	parent of source
- * @old_dentry:	source
- * @new_dir:	parent of destination
- * @new_dentry:	destination
- * @delegated_inode: returns an inode needing a delegation break
- * @flags:	rename flags
+ * @old_mnt_userns:	old user namespace of the mount the inode was found from
+ * @old_dir:		parent of source
+ * @old_dentry:		source
+ * @new_mnt_userns:	new user namespace of the mount the inode was found from
+ * @new_dir:		parent of destination
+ * @new_dentry:		destination
+ * @delegated_inode:	returns an inode needing a delegation break
+ * @flags:		rename flags
  *
  * The caller must hold multiple mutexes--see lock_rename()).
  *
@@ -4314,7 +4452,6 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
 int vfs_rename(struct renamedata *rd)
 {
 	int error;
-	struct user_namespace *mnt_userns = &init_user_ns;
 	struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
 	struct dentry *old_dentry = rd->old_dentry;
 	struct dentry *new_dentry = rd->new_dentry;
@@ -4330,19 +4467,21 @@ int vfs_rename(struct renamedata *rd)
 	if (source == target)
 		return 0;
 
-	error = may_delete(mnt_userns, old_dir, old_dentry, is_dir);
+	error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir);
 	if (error)
 		return error;
 
 	if (!target) {
-		error = may_create(mnt_userns, new_dir, new_dentry);
+		error = may_create(rd->new_mnt_userns, new_dir, new_dentry);
 	} else {
 		new_is_dir = d_is_dir(new_dentry);
 
 		if (!(flags & RENAME_EXCHANGE))
-			error = may_delete(mnt_userns, new_dir, new_dentry, is_dir);
+			error = may_delete(rd->new_mnt_userns, new_dir,
+					   new_dentry, is_dir);
 		else
-			error = may_delete(mnt_userns, new_dir, new_dentry, new_is_dir);
+			error = may_delete(rd->new_mnt_userns, new_dir,
+					   new_dentry, new_is_dir);
 	}
 	if (error)
 		return error;
@@ -4356,13 +4495,13 @@ int vfs_rename(struct renamedata *rd)
 	 */
 	if (new_dir != old_dir) {
 		if (is_dir) {
-			error = inode_permission(&init_user_ns, source,
+			error = inode_permission(rd->old_mnt_userns, source,
 						 MAY_WRITE);
 			if (error)
 				return error;
 		}
 		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
-			error = inode_permission(&init_user_ns, target,
+			error = inode_permission(rd->new_mnt_userns, target,
 						 MAY_WRITE);
 			if (error)
 				return error;
@@ -4552,8 +4691,10 @@ retry_deleg:
 
 	rd.old_dir	   = old_path.dentry->d_inode;
 	rd.old_dentry	   = old_dentry;
+	rd.old_mnt_userns  = mnt_user_ns(old_path.mnt);
 	rd.new_dir	   = new_path.dentry->d_inode;
 	rd.new_dentry	   = new_dentry;
+	rd.new_mnt_userns  = mnt_user_ns(new_path.mnt);
 	rd.delegated_inode = &delegated_inode;
 	rd.flags	   = flags;
 	error = vfs_rename(&rd);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 186fa2c2c6ba..891395c6c7d3 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -233,7 +233,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		 * as well be forgiving and just succeed silently.
 		 */
 		goto out_put;
-	status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU);
+	status = vfs_mkdir(&init_user_ns, d_inode(dir), dentry, S_IRWXU);
 out_put:
 	dput(dentry);
 out_unlock:
@@ -353,7 +353,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
 	status = -ENOENT;
 	if (d_really_is_negative(dentry))
 		goto out;
-	status = vfs_rmdir(d_inode(dir), dentry);
+	status = vfs_rmdir(&init_user_ns, d_inode(dir), dentry);
 out:
 	dput(dentry);
 out_unlock:
@@ -443,7 +443,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 	if (nfs4_has_reclaimed_state(name, nn))
 		goto out_free;
 
-	status = vfs_rmdir(d_inode(parent), child);
+	status = vfs_rmdir(&init_user_ns, d_inode(parent), child);
 	if (status)
 		printk("failed to remove client recovery directory %pd\n",
 				child);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f7d83ff2b44e..fab873178140 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1255,12 +1255,12 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	host_err = 0;
 	switch (type) {
 	case S_IFREG:
-		host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+		host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
 		if (!host_err)
 			nfsd_check_ignore_resizing(iap);
 		break;
 	case S_IFDIR:
-		host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
+		host_err = vfs_mkdir(&init_user_ns, dirp, dchild, iap->ia_mode);
 		if (!host_err && unlikely(d_unhashed(dchild))) {
 			struct dentry *d;
 			d = lookup_one_len(dchild->d_name.name,
@@ -1288,7 +1288,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	case S_IFBLK:
 	case S_IFIFO:
 	case S_IFSOCK:
-		host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
+		host_err = vfs_mknod(&init_user_ns, dirp, dchild,
+				     iap->ia_mode, rdev);
 		break;
 	default:
 		printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
@@ -1486,7 +1487,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (!IS_POSIXACL(dirp))
 		iap->ia_mode &= ~current_umask();
 
-	host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+	host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
 	if (host_err < 0) {
 		fh_drop_write(fhp);
 		goto out_nfserr;
@@ -1610,7 +1611,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (IS_ERR(dnew))
 		goto out_nfserr;
 
-	host_err = vfs_symlink(d_inode(dentry), dnew, path);
+	host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path);
 	err = nfserrno(host_err);
 	if (!err)
 		err = nfserrno(commit_metadata(fhp));
@@ -1678,7 +1679,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	err = nfserr_noent;
 	if (d_really_is_negative(dold))
 		goto out_dput;
-	host_err = vfs_link(dold, dirp, dnew, NULL);
+	host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL);
 	if (!host_err) {
 		err = nfserrno(commit_metadata(ffhp));
 		if (!err)
@@ -1799,8 +1800,10 @@ retry:
 		goto out_dput_old;
 	} else {
 		struct renamedata rd = {
+			.old_mnt_userns	= &init_user_ns,
 			.old_dir	= fdir,
 			.old_dentry	= odentry,
+			.new_mnt_userns	= &init_user_ns,
 			.new_dir	= tdir,
 			.new_dentry	= ndentry,
 		};
@@ -1891,9 +1894,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	if (type != S_IFDIR) {
 		if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK)
 			nfsd_close_cached_files(rdentry);
-		host_err = vfs_unlink(dirp, rdentry, NULL);
+		host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL);
 	} else {
-		host_err = vfs_rmdir(dirp, rdentry);
+		host_err = vfs_rmdir(&init_user_ns, dirp, rdentry);
 	}
 
 	if (!host_err)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index d75c96cb18c3..6904cc2ed7bb 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -821,9 +821,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
 		goto out_dput_upper;
 
 	if (is_dir)
-		err = vfs_rmdir(dir, upper);
+		err = vfs_rmdir(&init_user_ns, dir, upper);
 	else
-		err = vfs_unlink(dir, upper, NULL);
+		err = vfs_unlink(&init_user_ns, dir, upper, NULL);
 	ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry));
 
 	/*
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 426899681df7..5e9eb46e741a 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -123,7 +123,7 @@ static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox)
 
 static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	int err = vfs_rmdir(dir, dentry);
+	int err = vfs_rmdir(&init_user_ns, dir, dentry);
 
 	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
 	return err;
@@ -131,7 +131,7 @@ static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
 
 static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
 {
-	int err = vfs_unlink(dir, dentry, NULL);
+	int err = vfs_unlink(&init_user_ns, dir, dentry, NULL);
 
 	pr_debug("unlink(%pd2) = %i\n", dentry, err);
 	return err;
@@ -140,7 +140,7 @@ static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
 static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
 			      struct dentry *new_dentry)
 {
-	int err = vfs_link(old_dentry, dir, new_dentry, NULL);
+	int err = vfs_link(old_dentry, &init_user_ns, dir, new_dentry, NULL);
 
 	pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err);
 	return err;
@@ -149,7 +149,7 @@ static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
 static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
 				umode_t mode)
 {
-	int err = vfs_create(dir, dentry, mode, true);
+	int err = vfs_create(&init_user_ns, dir, dentry, mode, true);
 
 	pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
 	return err;
@@ -158,7 +158,7 @@ static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
 static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
 			       umode_t mode)
 {
-	int err = vfs_mkdir(dir, dentry, mode);
+	int err = vfs_mkdir(&init_user_ns, dir, dentry, mode);
 	pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
 	return err;
 }
@@ -166,7 +166,7 @@ static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
 static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
 			       umode_t mode, dev_t dev)
 {
-	int err = vfs_mknod(dir, dentry, mode, dev);
+	int err = vfs_mknod(&init_user_ns, dir, dentry, mode, dev);
 
 	pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
 	return err;
@@ -175,7 +175,7 @@ static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
 static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
 				 const char *oldname)
 {
-	int err = vfs_symlink(dir, dentry, oldname);
+	int err = vfs_symlink(&init_user_ns, dir, dentry, oldname);
 
 	pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
 	return err;
@@ -215,8 +215,10 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
 {
 	int err;
 	struct renamedata rd = {
+		.old_mnt_userns	= &init_user_ns,
 		.old_dir 	= olddir,
 		.old_dentry 	= olddentry,
+		.new_mnt_userns	= &init_user_ns,
 		.new_dir 	= newdir,
 		.new_dentry 	= newdentry,
 		.flags 		= flags,
@@ -233,14 +235,14 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
 
 static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
 {
-	int err = vfs_whiteout(dir, dentry);
+	int err = vfs_whiteout(&init_user_ns, dir, dentry);
 	pr_debug("whiteout(%pd2) = %i\n", dentry, err);
 	return err;
 }
 
 static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode)
 {
-	struct dentry *ret = vfs_tmpfile(dentry, mode, 0);
+	struct dentry *ret = vfs_tmpfile(&init_user_ns, dentry, mode, 0);
 	int err = PTR_ERR_OR_ZERO(ret);
 
 	pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 430e457f67f1..29d7b2fe7de4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1768,17 +1768,25 @@ bool inode_owner_or_capable(struct user_namespace *mnt_userns,
 /*
  * VFS helper functions..
  */
-extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
-extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
-extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
-extern int vfs_symlink(struct inode *, struct dentry *, const char *);
-extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
-extern int vfs_rmdir(struct inode *, struct dentry *);
-extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
+int vfs_create(struct user_namespace *, struct inode *,
+	       struct dentry *, umode_t, bool);
+int vfs_mkdir(struct user_namespace *, struct inode *,
+	      struct dentry *, umode_t);
+int vfs_mknod(struct user_namespace *, struct inode *, struct dentry *,
+              umode_t, dev_t);
+int vfs_symlink(struct user_namespace *, struct inode *,
+		struct dentry *, const char *);
+int vfs_link(struct dentry *, struct user_namespace *, struct inode *,
+	     struct dentry *, struct inode **);
+int vfs_rmdir(struct user_namespace *, struct inode *, struct dentry *);
+int vfs_unlink(struct user_namespace *, struct inode *, struct dentry *,
+	       struct inode **);
 
 struct renamedata {
+	struct user_namespace *old_mnt_userns;
 	struct inode *old_dir;
 	struct dentry *old_dentry;
+	struct user_namespace *new_mnt_userns;
 	struct inode *new_dir;
 	struct dentry *new_dentry;
 	struct inode **delegated_inode;
@@ -1787,13 +1795,15 @@ struct renamedata {
 
 int vfs_rename(struct renamedata *);
 
-static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry)
+static inline int vfs_whiteout(struct user_namespace *mnt_userns,
+			       struct inode *dir, struct dentry *dentry)
 {
-	return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+	return vfs_mknod(mnt_userns, dir, dentry, S_IFCHR | WHITEOUT_MODE,
+			 WHITEOUT_DEV);
 }
 
-extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode,
-				  int open_flag);
+struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, umode_t mode, int open_flag);
 
 int vfs_mkobj(struct dentry *, umode_t,
 		int (*f)(struct dentry *, umode_t, void *),
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 693f01fe1216..fcd56e077733 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -965,7 +965,8 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
 		err = -ENOENT;
 	} else {
 		ihold(inode);
-		err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL);
+		err = vfs_unlink(&init_user_ns, d_inode(dentry->d_parent),
+				 dentry, NULL);
 	}
 	dput(dentry);
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 18453d15dddf..9a1f3c04402e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -996,7 +996,8 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 	 */
 	err = security_path_mknod(&path, dentry, mode, 0);
 	if (!err) {
-		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
+		err = vfs_mknod(&init_user_ns, d_inode(path.dentry), dentry,
+				mode, 0);
 		if (!err) {
 			res->mnt = mntget(path.mnt);
 			res->dentry = dget(dentry);
-- 
cgit v1.2.3


From 643fe55a0679ae5582a1a2a1df86dc240292cd1b Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:34 +0100
Subject: open: handle idmapped mounts in do_truncate()

When truncating files the vfs will verify that the caller is privileged
over the inode. Extend it to handle idmapped mounts. If the inode is
accessed through an idmapped mount it is mapped according to the mount's
user namespace. Afterwards the permissions checks are identical to
non-idmapped mounts. If the initial user namespace is passed nothing
changes so non-idmapped mounts will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-16-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/coredump.c      | 10 +++++++---
 fs/inode.c         |  7 ++++---
 fs/namei.c         |  2 +-
 fs/open.c          | 16 +++++++++-------
 include/linux/fs.h |  4 ++--
 5 files changed, 23 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/fs/coredump.c b/fs/coredump.c
index a2f6ecc8e345..ae778937a1ff 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -703,6 +703,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 			goto close_fail;
 		}
 	} else {
+		struct user_namespace *mnt_userns;
 		struct inode *inode;
 		int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
 				 O_LARGEFILE | O_EXCL;
@@ -780,13 +781,15 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 		 * a process dumps core while its cwd is e.g. on a vfat
 		 * filesystem.
 		 */
-		if (!uid_eq(inode->i_uid, current_fsuid()))
+		mnt_userns = file_mnt_user_ns(cprm.file);
+		if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), current_fsuid()))
 			goto close_fail;
 		if ((inode->i_mode & 0677) != 0600)
 			goto close_fail;
 		if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
 			goto close_fail;
-		if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+		if (do_truncate(mnt_userns, cprm.file->f_path.dentry,
+				0, 0, cprm.file))
 			goto close_fail;
 	}
 
@@ -931,7 +934,8 @@ void dump_truncate(struct coredump_params *cprm)
 	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
 		offset = file->f_op->llseek(file, 0, SEEK_CUR);
 		if (i_size_read(file->f_mapping->host) < offset)
-			do_truncate(file->f_path.dentry, offset, 0, file);
+			do_truncate(file_mnt_user_ns(file), file->f_path.dentry,
+				    offset, 0, file);
 	}
 }
 EXPORT_SYMBOL(dump_truncate);
diff --git a/fs/inode.c b/fs/inode.c
index 46116ef44c9f..08151968c9ef 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1903,7 +1903,8 @@ int dentry_needs_remove_privs(struct dentry *dentry)
 	return mask;
 }
 
-static int __remove_privs(struct dentry *dentry, int kill)
+static int __remove_privs(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, int kill)
 {
 	struct iattr newattrs;
 
@@ -1912,7 +1913,7 @@ static int __remove_privs(struct dentry *dentry, int kill)
 	 * Note we call this on write, so notify_change will not
 	 * encounter any conflicting delegations:
 	 */
-	return notify_change(&init_user_ns, dentry, &newattrs, NULL);
+	return notify_change(mnt_userns, dentry, &newattrs, NULL);
 }
 
 /*
@@ -1939,7 +1940,7 @@ int file_remove_privs(struct file *file)
 	if (kill < 0)
 		return kill;
 	if (kill)
-		error = __remove_privs(dentry, kill);
+		error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
 	if (!error)
 		inode_has_no_xattr(inode);
 
diff --git a/fs/namei.c b/fs/namei.c
index 5c9f6f8e90c4..c8c083daf368 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3009,7 +3009,7 @@ static int handle_truncate(struct file *filp)
 	if (!error)
 		error = security_path_truncate(path);
 	if (!error) {
-		error = do_truncate(path->dentry, 0,
+		error = do_truncate(&init_user_ns, path->dentry, 0,
 				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
 				    filp);
 	}
diff --git a/fs/open.c b/fs/open.c
index c3e4dc43dd8d..8b3f3eb652d0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -35,8 +35,8 @@
 
 #include "internal.h"
 
-int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
-	struct file *filp)
+int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry,
+		loff_t length, unsigned int time_attrs, struct file *filp)
 {
 	int ret;
 	struct iattr newattrs;
@@ -61,13 +61,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 
 	inode_lock(dentry->d_inode);
 	/* Note any delegations or leases have already been broken: */
-	ret = notify_change(&init_user_ns, dentry, &newattrs, NULL);
+	ret = notify_change(mnt_userns, dentry, &newattrs, NULL);
 	inode_unlock(dentry->d_inode);
 	return ret;
 }
 
 long vfs_truncate(const struct path *path, loff_t length)
 {
+	struct user_namespace *mnt_userns;
 	struct inode *inode;
 	long error;
 
@@ -83,7 +84,8 @@ long vfs_truncate(const struct path *path, loff_t length)
 	if (error)
 		goto out;
 
-	error = inode_permission(&init_user_ns, inode, MAY_WRITE);
+	mnt_userns = mnt_user_ns(path->mnt);
+	error = inode_permission(mnt_userns, inode, MAY_WRITE);
 	if (error)
 		goto mnt_drop_write_and_out;
 
@@ -107,7 +109,7 @@ long vfs_truncate(const struct path *path, loff_t length)
 	if (!error)
 		error = security_path_truncate(path);
 	if (!error)
-		error = do_truncate(path->dentry, length, 0, NULL);
+		error = do_truncate(mnt_userns, path->dentry, length, 0, NULL);
 
 put_write_and_out:
 	put_write_access(inode);
@@ -186,13 +188,13 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 	/* Check IS_APPEND on real upper inode */
 	if (IS_APPEND(file_inode(f.file)))
 		goto out_putf;
-
 	sb_start_write(inode->i_sb);
 	error = locks_verify_truncate(inode, f.file, length);
 	if (!error)
 		error = security_path_truncate(&f.file->f_path);
 	if (!error)
-		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
+		error = do_truncate(file_mnt_user_ns(f.file), dentry, length,
+				    ATTR_MTIME | ATTR_CTIME, f.file);
 	sb_end_write(inode->i_sb);
 out_putf:
 	fdput(f);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29d7b2fe7de4..f0601cca1930 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2593,8 +2593,8 @@ static inline struct user_namespace *file_mnt_user_ns(struct file *file)
 	return mnt_user_ns(file->f_path.mnt);
 }
 extern long vfs_truncate(const struct path *, loff_t);
-extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
-		       struct file *filp);
+int do_truncate(struct user_namespace *, struct dentry *, loff_t start,
+		unsigned int time_attrs, struct file *filp);
 extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
 			loff_t len);
 extern long do_sys_open(int dfd, const char __user *filename, int flags,
-- 
cgit v1.2.3


From 549c7297717c32ee53f156cd949e055e601f67bb Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:43 +0100
Subject: fs: make helpers idmap mount aware

Extend some inode methods with an additional user namespace argument. A
filesystem that is aware of idmapped mounts will receive the user
namespace the mount has been marked with. This can be used for
additional permission checking and also to enable filesystems to
translate between uids and gids if they need to. We have implemented all
relevant helpers in earlier patches.

As requested we simply extend the exisiting inode method instead of
introducing new ones. This is a little more code churn but it's mostly
mechanical and doesnt't leave us with additional inode methods.

Link: https://lore.kernel.org/r/20210121131959.646623-25-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 Documentation/filesystems/vfs.rst         | 19 ++++++-----
 arch/powerpc/platforms/cell/spufs/inode.c |  3 +-
 drivers/android/binderfs.c                |  6 ++--
 fs/9p/acl.c                               |  4 +--
 fs/9p/v9fs.h                              |  3 +-
 fs/9p/v9fs_vfs.h                          |  3 +-
 fs/9p/vfs_inode.c                         | 26 ++++++++-------
 fs/9p/vfs_inode_dotl.c                    | 31 +++++++++--------
 fs/adfs/adfs.h                            |  3 +-
 fs/adfs/inode.c                           |  3 +-
 fs/affs/affs.h                            | 24 +++++++++-----
 fs/affs/inode.c                           |  3 +-
 fs/affs/namei.c                           | 15 +++++----
 fs/afs/dir.c                              | 34 ++++++++++---------
 fs/afs/inode.c                            |  7 ++--
 fs/afs/internal.h                         |  7 ++--
 fs/afs/security.c                         |  3 +-
 fs/attr.c                                 |  4 +--
 fs/autofs/root.c                          | 17 ++++++----
 fs/bad_inode.c                            | 36 ++++++++++++--------
 fs/bfs/dir.c                              | 10 +++---
 fs/btrfs/acl.c                            |  3 +-
 fs/btrfs/ctree.h                          |  3 +-
 fs/btrfs/inode.c                          | 33 +++++++++++--------
 fs/ceph/acl.c                             |  3 +-
 fs/ceph/dir.c                             | 23 ++++++-------
 fs/ceph/inode.c                           | 10 +++---
 fs/ceph/super.h                           | 12 ++++---
 fs/cifs/cifsfs.c                          |  3 +-
 fs/cifs/cifsfs.h                          | 25 ++++++++------
 fs/cifs/dir.c                             |  8 ++---
 fs/cifs/inode.c                           | 16 +++++----
 fs/cifs/link.c                            |  3 +-
 fs/coda/coda_linux.h                      |  8 +++--
 fs/coda/dir.c                             | 18 ++++++----
 fs/coda/inode.c                           |  7 ++--
 fs/coda/pioctl.c                          |  6 ++--
 fs/configfs/configfs_internal.h           |  6 ++--
 fs/configfs/dir.c                         |  3 +-
 fs/configfs/inode.c                       |  5 +--
 fs/configfs/symlink.c                     |  3 +-
 fs/debugfs/inode.c                        |  9 ++---
 fs/ecryptfs/inode.c                       | 30 ++++++++++-------
 fs/efivarfs/inode.c                       |  4 +--
 fs/erofs/inode.c                          |  5 +--
 fs/erofs/internal.h                       |  5 +--
 fs/exfat/exfat_fs.h                       |  8 +++--
 fs/exfat/file.c                           |  8 +++--
 fs/exfat/namei.c                          | 14 ++++----
 fs/ext2/acl.c                             |  3 +-
 fs/ext2/acl.h                             |  3 +-
 fs/ext2/ext2.h                            |  5 +--
 fs/ext2/inode.c                           |  7 ++--
 fs/ext2/namei.c                           | 22 ++++++++-----
 fs/ext4/acl.c                             |  3 +-
 fs/ext4/acl.h                             |  3 +-
 fs/ext4/ext4.h                            |  9 +++--
 fs/ext4/inode.c                           | 12 ++++---
 fs/ext4/namei.c                           | 19 ++++++-----
 fs/f2fs/acl.c                             |  3 +-
 fs/f2fs/acl.h                             |  3 +-
 fs/f2fs/f2fs.h                            |  7 ++--
 fs/f2fs/file.c                            |  7 ++--
 fs/f2fs/namei.c                           | 21 +++++++-----
 fs/fat/fat.h                              |  6 ++--
 fs/fat/file.c                             |  9 ++---
 fs/fat/namei_msdos.c                      | 10 +++---
 fs/fat/namei_vfat.c                       | 13 ++++----
 fs/fuse/acl.c                             |  3 +-
 fs/fuse/dir.c                             | 37 ++++++++++++---------
 fs/fuse/fuse_i.h                          |  4 +--
 fs/gfs2/acl.c                             |  3 +-
 fs/gfs2/acl.h                             |  3 +-
 fs/gfs2/file.c                            |  2 +-
 fs/gfs2/inode.c                           | 53 +++++++++++++++++------------
 fs/gfs2/inode.h                           |  3 +-
 fs/hfs/dir.c                              | 13 ++++----
 fs/hfs/hfs_fs.h                           |  3 +-
 fs/hfs/inode.c                            |  6 ++--
 fs/hfsplus/dir.c                          | 22 +++++++------
 fs/hfsplus/hfsplus_fs.h                   |  5 +--
 fs/hfsplus/inode.c                        |  8 +++--
 fs/hostfs/hostfs_kern.c                   | 23 ++++++++-----
 fs/hpfs/hpfs_fn.h                         |  2 +-
 fs/hpfs/inode.c                           |  3 +-
 fs/hpfs/namei.c                           | 20 ++++++-----
 fs/hugetlbfs/inode.c                      | 29 +++++++++-------
 fs/jffs2/acl.c                            |  3 +-
 fs/jffs2/acl.h                            |  3 +-
 fs/jffs2/dir.c                            | 33 +++++++++++--------
 fs/jffs2/fs.c                             |  3 +-
 fs/jffs2/os-linux.h                       |  2 +-
 fs/jfs/acl.c                              |  3 +-
 fs/jfs/file.c                             |  3 +-
 fs/jfs/jfs_acl.h                          |  3 +-
 fs/jfs/jfs_inode.h                        |  2 +-
 fs/jfs/namei.c                            | 21 ++++++------
 fs/kernfs/dir.c                           |  6 ++--
 fs/kernfs/inode.c                         |  9 +++--
 fs/kernfs/kernfs-internal.h               |  9 +++--
 fs/libfs.c                                | 31 +++++++++--------
 fs/minix/file.c                           |  3 +-
 fs/minix/inode.c                          |  4 +--
 fs/minix/minix.h                          |  3 +-
 fs/minix/namei.c                          | 24 ++++++++------
 fs/namei.c                                | 55 ++++++++++++++++++-------------
 fs/nfs/dir.c                              | 23 ++++++++-----
 fs/nfs/inode.c                            |  7 ++--
 fs/nfs/internal.h                         | 14 +++++---
 fs/nfs/namespace.c                        | 13 +++++---
 fs/nfs/nfs3_fs.h                          |  3 +-
 fs/nfs/nfs3acl.c                          |  3 +-
 fs/nilfs2/inode.c                         |  6 ++--
 fs/nilfs2/namei.c                         | 19 ++++++-----
 fs/nilfs2/nilfs.h                         |  6 ++--
 fs/ntfs/inode.c                           |  4 ++-
 fs/ntfs/inode.h                           |  3 +-
 fs/ocfs2/acl.c                            |  3 +-
 fs/ocfs2/acl.h                            |  3 +-
 fs/ocfs2/dlmfs/dlmfs.c                    |  9 +++--
 fs/ocfs2/file.c                           | 10 +++---
 fs/ocfs2/file.h                           | 11 ++++---
 fs/ocfs2/namei.c                          | 19 +++++++----
 fs/omfs/dir.c                             | 13 ++++----
 fs/omfs/file.c                            |  3 +-
 fs/orangefs/acl.c                         |  3 +-
 fs/orangefs/inode.c                       | 10 +++---
 fs/orangefs/namei.c                       | 12 ++++---
 fs/orangefs/orangefs-kernel.h             | 13 +++++---
 fs/overlayfs/dir.c                        | 21 ++++++------
 fs/overlayfs/inode.c                      | 10 +++---
 fs/overlayfs/overlayfs.h                  | 10 +++---
 fs/overlayfs/super.c                      |  2 +-
 fs/posix_acl.c                            |  9 ++---
 fs/proc/base.c                            | 16 +++++----
 fs/proc/fd.c                              |  3 +-
 fs/proc/fd.h                              |  3 +-
 fs/proc/generic.c                         |  6 ++--
 fs/proc/internal.h                        |  6 ++--
 fs/proc/proc_net.c                        |  3 +-
 fs/proc/proc_sysctl.c                     |  9 +++--
 fs/proc/root.c                            |  3 +-
 fs/ramfs/file-nommu.c                     |  5 +--
 fs/ramfs/inode.c                          | 16 +++++----
 fs/reiserfs/acl.h                         |  3 +-
 fs/reiserfs/inode.c                       |  3 +-
 fs/reiserfs/namei.c                       | 19 ++++++-----
 fs/reiserfs/reiserfs.h                    |  3 +-
 fs/reiserfs/xattr.c                       | 11 ++++---
 fs/reiserfs/xattr.h                       |  3 +-
 fs/reiserfs/xattr_acl.c                   |  3 +-
 fs/stat.c                                 |  8 +++--
 fs/sysv/file.c                            |  3 +-
 fs/sysv/itree.c                           |  4 +--
 fs/sysv/namei.c                           | 21 +++++++-----
 fs/sysv/sysv.h                            |  3 +-
 fs/tracefs/inode.c                        |  4 ++-
 fs/ubifs/dir.c                            | 26 ++++++++-------
 fs/ubifs/file.c                           |  3 +-
 fs/ubifs/ubifs.h                          |  5 +--
 fs/udf/file.c                             |  3 +-
 fs/udf/namei.c                            | 24 +++++++-------
 fs/udf/symlink.c                          |  5 +--
 fs/ufs/inode.c                            |  3 +-
 fs/ufs/namei.c                            | 19 ++++++-----
 fs/ufs/ufs.h                              |  3 +-
 fs/vboxsf/dir.c                           | 12 ++++---
 fs/vboxsf/utils.c                         |  7 ++--
 fs/vboxsf/vfsmod.h                        |  8 +++--
 fs/xfs/xfs_acl.c                          |  3 +-
 fs/xfs/xfs_acl.h                          |  3 +-
 fs/xfs/xfs_iops.c                         | 52 ++++++++++++++++-------------
 fs/zonefs/super.c                         |  3 +-
 include/linux/fs.h                        | 39 ++++++++++++++--------
 include/linux/nfs_fs.h                    |  7 ++--
 include/linux/posix_acl.h                 |  3 +-
 ipc/mqueue.c                              |  4 +--
 kernel/bpf/inode.c                        |  7 ++--
 mm/shmem.c                                | 39 ++++++++++++++--------
 net/socket.c                              |  5 +--
 security/apparmor/apparmorfs.c            |  3 +-
 security/integrity/evm/evm_secfs.c        |  2 +-
 182 files changed, 1121 insertions(+), 756 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index ca52c82e5bb5..98290ef311ca 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -415,28 +415,29 @@ As of kernel 2.6.22, the following members are defined:
 .. code-block:: c
 
 	struct inode_operations {
-		int (*create) (struct inode *,struct dentry *, umode_t, bool);
+		int (*create) (struct user_namespace *, struct inode *,struct dentry *, umode_t, bool);
 		struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
 		int (*link) (struct dentry *,struct inode *,struct dentry *);
 		int (*unlink) (struct inode *,struct dentry *);
-		int (*symlink) (struct inode *,struct dentry *,const char *);
-		int (*mkdir) (struct inode *,struct dentry *,umode_t);
+		int (*symlink) (struct user_namespace *, struct inode *,struct dentry *,const char *);
+		int (*mkdir) (struct user_namespace *, struct inode *,struct dentry *,umode_t);
 		int (*rmdir) (struct inode *,struct dentry *);
-		int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
-		int (*rename) (struct inode *, struct dentry *,
+		int (*mknod) (struct user_namespace *, struct inode *,struct dentry *,umode_t,dev_t);
+		int (*rename) (struct user_namespace *, struct inode *, struct dentry *,
 			       struct inode *, struct dentry *, unsigned int);
 		int (*readlink) (struct dentry *, char __user *,int);
 		const char *(*get_link) (struct dentry *, struct inode *,
 					 struct delayed_call *);
-		int (*permission) (struct inode *, int);
+		int (*permission) (struct user_namespace *, struct inode *, int);
 		int (*get_acl)(struct inode *, int);
-		int (*setattr) (struct dentry *, struct iattr *);
-		int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
+		int (*setattr) (struct user_namespace *, struct dentry *, struct iattr *);
+		int (*getattr) (struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int);
 		ssize_t (*listxattr) (struct dentry *, char *, size_t);
 		void (*update_time)(struct inode *, struct timespec *, int);
 		int (*atomic_open)(struct inode *, struct dentry *, struct file *,
 				   unsigned open_flag, umode_t create_mode);
-		int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+		int (*tmpfile) (struct user_namespace *, struct inode *, struct dentry *, umode_t);
+	        int (*set_acl)(struct user_namespace *, struct inode *, struct posix_acl *, int);
 	};
 
 Again, all methods are called without any locks being held, unless
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 3de526eb2275..b83a3670bd74 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -91,7 +91,8 @@ out:
 }
 
 static int
-spufs_setattr(struct dentry *dentry, struct iattr *attr)
+spufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+	      struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 
diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c
index 7b4f154f07e6..e80ba93c62a9 100644
--- a/drivers/android/binderfs.c
+++ b/drivers/android/binderfs.c
@@ -355,7 +355,8 @@ static inline bool is_binderfs_control_device(const struct dentry *dentry)
 	return info->control_dentry == dentry;
 }
 
-static int binderfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int binderfs_rename(struct user_namespace *mnt_userns,
+			   struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry,
 			   unsigned int flags)
 {
@@ -363,7 +364,8 @@ static int binderfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	    is_binderfs_control_device(new_dentry))
 		return -EPERM;
 
-	return simple_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+	return simple_rename(&init_user_ns, old_dir, old_dentry, new_dir,
+			     new_dentry, flags);
 }
 
 static int binderfs_unlink(struct inode *dir, struct dentry *dentry)
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 1c14f18a6ec9..bb1b286c49ae 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -280,7 +280,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 			struct iattr iattr = { 0 };
 			struct posix_acl *old_acl = acl;
 
-			retval = posix_acl_update_mode(mnt_userns, inode,
+			retval = posix_acl_update_mode(&init_user_ns, inode,
 						       &iattr.ia_mode, &acl);
 			if (retval)
 				goto err_out;
@@ -299,7 +299,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 			 * What is the following setxattr update the
 			 * mode ?
 			 */
-			v9fs_vfs_setattr_dotl(dentry, &iattr);
+			v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr);
 		}
 		break;
 	case ACL_TYPE_DEFAULT:
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 7b763776306e..4ca56c5dd637 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -135,7 +135,8 @@ extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 			unsigned int flags);
 extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
 extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
-extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+extern int v9fs_vfs_rename(struct user_namespace *mnt_userns,
+			   struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry,
 			   unsigned int flags);
 extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index fd2a2b040250..d44ade76966a 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -59,7 +59,8 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
 int v9fs_uflags2omode(int uflags, int extended);
 
 void v9fs_blank_wstat(struct p9_wstat *wstat);
-int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
+int v9fs_vfs_setattr_dotl(struct user_namespace *, struct dentry *,
+			  struct iattr *);
 int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
 			 int datasync);
 int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c21b146c8d91..648eb4c4cf7f 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -676,8 +676,8 @@ error:
  */
 
 static int
-v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool excl)
+v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
 	u32 perm = unixmode2p9mode(v9ses, mode);
@@ -702,7 +702,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
  *
  */
 
-static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int v9fs_vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode)
 {
 	int err;
 	u32 perm;
@@ -907,9 +908,9 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
  */
 
 int
-v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		struct inode *new_dir, struct dentry *new_dentry,
-		unsigned int flags)
+v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		struct dentry *old_dentry, struct inode *new_dir,
+		struct dentry *new_dentry, unsigned int flags)
 {
 	int retval;
 	struct inode *old_inode;
@@ -1016,8 +1017,8 @@ done:
  */
 
 static int
-v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int flags)
+v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
 	struct v9fs_session_info *v9ses;
@@ -1054,7 +1055,8 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
  *
  */
 
-static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
+static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, struct iattr *iattr)
 {
 	int retval, use_dentry = 0;
 	struct v9fs_session_info *v9ses;
@@ -1295,7 +1297,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
  */
 
 static int
-v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+v9fs_vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		 struct dentry *dentry, const char *symname)
 {
 	p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n",
 		 dir->i_ino, dentry, symname);
@@ -1348,7 +1351,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
  */
 
 static int
-v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+v9fs_vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	       struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
 	int retval;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 984f28315d2a..1dc7af046615 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -33,8 +33,8 @@
 #include "acl.h"
 
 static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
-		    dev_t rdev);
+v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
+		    struct dentry *dentry, umode_t omode, dev_t rdev);
 
 /**
  * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
@@ -218,10 +218,10 @@ int v9fs_open_to_dotl_flags(int flags)
  */
 
 static int
-v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
-		bool excl)
+v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t omode, bool excl)
 {
-	return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+	return v9fs_vfs_mknod_dotl(mnt_userns, dir, dentry, omode, 0);
 }
 
 static int
@@ -367,8 +367,9 @@ err_clunk_old_fid:
  *
  */
 
-static int v9fs_vfs_mkdir_dotl(struct inode *dir,
-			       struct dentry *dentry, umode_t omode)
+static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
+			       struct inode *dir, struct dentry *dentry,
+			       umode_t omode)
 {
 	int err;
 	struct v9fs_session_info *v9ses;
@@ -457,8 +458,9 @@ error:
 }
 
 static int
-v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int flags)
+v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns,
+		      const struct path *path, struct kstat *stat,
+		      u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
 	struct v9fs_session_info *v9ses;
@@ -540,7 +542,8 @@ static int v9fs_mapped_iattr_valid(int iattr_valid)
  *
  */
 
-int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
+int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, struct iattr *iattr)
 {
 	int retval, use_dentry = 0;
 	struct p9_fid *fid = NULL;
@@ -684,8 +687,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
 }
 
 static int
-v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
-		const char *symname)
+v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, const char *symname)
 {
 	int err;
 	kgid_t gid;
@@ -824,8 +827,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
  *
  */
 static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
-		dev_t rdev)
+v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
+		    struct dentry *dentry, umode_t omode, dev_t rdev)
 {
 	int err;
 	kgid_t gid;
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 699c4fa8b78b..06b7c92343ad 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -144,7 +144,8 @@ struct adfs_discmap {
 /* Inode stuff */
 struct inode *adfs_iget(struct super_block *sb, struct object_info *obj);
 int adfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
+int adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       struct iattr *attr);
 
 /* map.c */
 int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset);
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 278dcee6ae22..fb7ee026d101 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -292,7 +292,8 @@ out:
  * later.
  */
 int
-adfs_notify_change(struct dentry *dentry, struct iattr *attr)
+adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct super_block *sb = inode->i_sb;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a755bef7c4c7..bfa89e131ead 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -167,27 +167,33 @@ extern const struct export_operations affs_export_ops;
 extern int	affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len);
 extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int);
 extern int	affs_unlink(struct inode *dir, struct dentry *dentry);
-extern int	affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool);
-extern int	affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
+extern int	affs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool);
+extern int	affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode);
 extern int	affs_rmdir(struct inode *dir, struct dentry *dentry);
 extern int	affs_link(struct dentry *olddentry, struct inode *dir,
 			  struct dentry *dentry);
-extern int	affs_symlink(struct inode *dir, struct dentry *dentry,
-			     const char *symname);
-extern int	affs_rename2(struct inode *old_dir, struct dentry *old_dentry,
-			    struct inode *new_dir, struct dentry *new_dentry,
-			    unsigned int flags);
+extern int	affs_symlink(struct user_namespace *mnt_userns,
+			struct inode *dir, struct dentry *dentry,
+			const char *symname);
+extern int	affs_rename2(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry,
+			unsigned int flags);
 
 /* inode.c */
 
 extern struct inode		*affs_new_inode(struct inode *dir);
-extern int			 affs_notify_change(struct dentry *dentry, struct iattr *attr);
+extern int			 affs_notify_change(struct user_namespace *mnt_userns,
+					struct dentry *dentry, struct iattr *attr);
 extern void			 affs_evict_inode(struct inode *inode);
 extern struct inode		*affs_iget(struct super_block *sb,
 					unsigned long ino);
 extern int			 affs_write_inode(struct inode *inode,
 					struct writeback_control *wbc);
-extern int			 affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type);
+extern int			 affs_add_entry(struct inode *dir, struct inode *inode,
+					struct dentry *dentry, s32 type);
 
 /* file.c */
 
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 767e5bdfb703..2352a75bd9d6 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -216,7 +216,8 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
 }
 
 int
-affs_notify_change(struct dentry *dentry, struct iattr *attr)
+affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 41c5749f4db7..9ad22befce28 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -242,7 +242,8 @@ affs_unlink(struct inode *dir, struct dentry *dentry)
 }
 
 int
-affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+affs_create(struct user_namespace *mnt_userns, struct inode *dir,
+	    struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode	*inode;
@@ -273,7 +274,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
 }
 
 int
-affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+	   struct dentry *dentry, umode_t mode)
 {
 	struct inode		*inode;
 	int			 error;
@@ -311,7 +313,8 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
 }
 
 int
-affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+affs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+	     struct dentry *dentry, const char *symname)
 {
 	struct super_block	*sb = dir->i_sb;
 	struct buffer_head	*bh;
@@ -498,9 +501,9 @@ done:
 	return retval;
 }
 
-int affs_rename2(struct inode *old_dir, struct dentry *old_dentry,
-			struct inode *new_dir, struct dentry *new_dentry,
-			unsigned int flags)
+int affs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+		 struct dentry *old_dentry, struct inode *new_dir,
+		 struct dentry *new_dentry, unsigned int flags)
 {
 
 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 7bd659ad959e..714fcca9af99 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -28,18 +28,19 @@ static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int
 				  loff_t fpos, u64 ino, unsigned dtype);
 static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
 			      loff_t fpos, u64 ino, unsigned dtype);
-static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl);
-static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
+static int afs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, bool excl);
+static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode);
 static int afs_rmdir(struct inode *dir, struct dentry *dentry);
 static int afs_unlink(struct inode *dir, struct dentry *dentry);
 static int afs_link(struct dentry *from, struct inode *dir,
 		    struct dentry *dentry);
-static int afs_symlink(struct inode *dir, struct dentry *dentry,
-		       const char *content);
-static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags);
+static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, const char *content);
+static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags);
 static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags);
 static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
 				   unsigned int length);
@@ -1325,7 +1326,8 @@ static const struct afs_operation_ops afs_mkdir_operation = {
 /*
  * create a directory on an AFS filesystem
  */
-static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode)
 {
 	struct afs_operation *op;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
@@ -1619,8 +1621,8 @@ static const struct afs_operation_ops afs_create_operation = {
 /*
  * create a regular file on an AFS filesystem
  */
-static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl)
+static int afs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct afs_operation *op;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
@@ -1741,8 +1743,8 @@ static const struct afs_operation_ops afs_symlink_operation = {
 /*
  * create a symlink in an AFS filesystem
  */
-static int afs_symlink(struct inode *dir, struct dentry *dentry,
-		       const char *content)
+static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, const char *content)
 {
 	struct afs_operation *op;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
@@ -1876,9 +1878,9 @@ static const struct afs_operation_ops afs_rename_operation = {
 /*
  * rename a file in an AFS filesystem and/or move it between directories
  */
-static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags)
+static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags)
 {
 	struct afs_operation *op;
 	struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 795ee5cb3817..1156b2df28d3 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -734,8 +734,8 @@ error_unlock:
 /*
  * read the attributes of an inode
  */
-int afs_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int query_flags)
+int afs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct afs_vnode *vnode = AFS_FS_I(inode);
@@ -857,7 +857,8 @@ static const struct afs_operation_ops afs_setattr_operation = {
 /*
  * set the attributes of an inode
  */
-int afs_setattr(struct dentry *dentry, struct iattr *attr)
+int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct iattr *attr)
 {
 	struct afs_operation *op;
 	struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 0d150a29e39e..b626e38e9ab5 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1149,8 +1149,9 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
 extern struct inode *afs_root_iget(struct super_block *, struct key *);
 extern bool afs_check_validity(struct afs_vnode *);
 extern int afs_validate(struct afs_vnode *, struct key *);
-extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int);
-extern int afs_setattr(struct dentry *, struct iattr *);
+extern int afs_getattr(struct user_namespace *mnt_userns, const struct path *,
+		       struct kstat *, u32, unsigned int);
+extern int afs_setattr(struct user_namespace *mnt_userns, struct dentry *, struct iattr *);
 extern void afs_evict_inode(struct inode *);
 extern int afs_drop_inode(struct inode *);
 
@@ -1361,7 +1362,7 @@ extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
 extern struct key *afs_request_key_rcu(struct afs_cell *);
 extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *);
-extern int afs_permission(struct inode *, int);
+extern int afs_permission(struct user_namespace *, struct inode *, int);
 extern void __exit afs_clean_up_permit_cache(void);
 
 /*
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 9cf3102f370c..3c7a8fc4f93f 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -396,7 +396,8 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key,
  * - AFS ACLs are attached to directories only, and a file is controlled by its
  *   parent directory's ACL
  */
-int afs_permission(struct inode *inode, int mask)
+int afs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		   int mask)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_access_t access;
diff --git a/fs/attr.c b/fs/attr.c
index c2d3bc0869d4..41abd0d973d8 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -395,9 +395,9 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
 		return error;
 
 	if (inode->i_op->setattr)
-		error = inode->i_op->setattr(dentry, attr);
+		error = inode->i_op->setattr(mnt_userns, dentry, attr);
 	else
-		error = simple_setattr(dentry, attr);
+		error = simple_setattr(mnt_userns, dentry, attr);
 
 	if (!error) {
 		fsnotify_change(dentry, ia_valid);
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 5aaa1732bf1e..91fe4548c256 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -10,10 +10,12 @@
 
 #include "autofs_i.h"
 
-static int autofs_dir_symlink(struct inode *, struct dentry *, const char *);
+static int autofs_dir_symlink(struct user_namespace *, struct inode *,
+			      struct dentry *, const char *);
 static int autofs_dir_unlink(struct inode *, struct dentry *);
 static int autofs_dir_rmdir(struct inode *, struct dentry *);
-static int autofs_dir_mkdir(struct inode *, struct dentry *, umode_t);
+static int autofs_dir_mkdir(struct user_namespace *, struct inode *,
+			    struct dentry *, umode_t);
 static long autofs_root_ioctl(struct file *, unsigned int, unsigned long);
 #ifdef CONFIG_COMPAT
 static long autofs_root_compat_ioctl(struct file *,
@@ -524,9 +526,9 @@ static struct dentry *autofs_lookup(struct inode *dir,
 	return NULL;
 }
 
-static int autofs_dir_symlink(struct inode *dir,
-			       struct dentry *dentry,
-			       const char *symname)
+static int autofs_dir_symlink(struct user_namespace *mnt_userns,
+			      struct inode *dir, struct dentry *dentry,
+			      const char *symname)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
@@ -715,8 +717,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	return 0;
 }
 
-static int autofs_dir_mkdir(struct inode *dir,
-			    struct dentry *dentry, umode_t mode)
+static int autofs_dir_mkdir(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *dentry,
+			    umode_t mode)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 54f0ce444272..48e16144c1f7 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -27,8 +27,9 @@ static const struct file_operations bad_file_ops =
 	.open		= bad_file_open,
 };
 
-static int bad_inode_create (struct inode *dir, struct dentry *dentry,
-		umode_t mode, bool excl)
+static int bad_inode_create(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *dentry,
+			    umode_t mode, bool excl)
 {
 	return -EIO;
 }
@@ -50,14 +51,15 @@ static int bad_inode_unlink(struct inode *dir, struct dentry *dentry)
 	return -EIO;
 }
 
-static int bad_inode_symlink (struct inode *dir, struct dentry *dentry,
-		const char *symname)
+static int bad_inode_symlink(struct user_namespace *mnt_userns,
+			     struct inode *dir, struct dentry *dentry,
+			     const char *symname)
 {
 	return -EIO;
 }
 
-static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry,
-			umode_t mode)
+static int bad_inode_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, umode_t mode)
 {
 	return -EIO;
 }
@@ -67,13 +69,14 @@ static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry)
 	return -EIO;
 }
 
-static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
-			umode_t mode, dev_t rdev)
+static int bad_inode_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	return -EIO;
 }
 
-static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry,
+static int bad_inode_rename2(struct user_namespace *mnt_userns,
+			     struct inode *old_dir, struct dentry *old_dentry,
 			     struct inode *new_dir, struct dentry *new_dentry,
 			     unsigned int flags)
 {
@@ -86,18 +89,21 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
 	return -EIO;
 }
 
-static int bad_inode_permission(struct inode *inode, int mask)
+static int bad_inode_permission(struct user_namespace *mnt_userns,
+				struct inode *inode, int mask)
 {
 	return -EIO;
 }
 
-static int bad_inode_getattr(const struct path *path, struct kstat *stat,
+static int bad_inode_getattr(struct user_namespace *mnt_userns,
+			     const struct path *path, struct kstat *stat,
 			     u32 request_mask, unsigned int query_flags)
 {
 	return -EIO;
 }
 
-static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs)
+static int bad_inode_setattr(struct user_namespace *mnt_userns,
+			     struct dentry *direntry, struct iattr *attrs)
 {
 	return -EIO;
 }
@@ -140,13 +146,15 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry,
 	return -EIO;
 }
 
-static int bad_inode_tmpfile(struct inode *inode, struct dentry *dentry,
+static int bad_inode_tmpfile(struct user_namespace *mnt_userns,
+			     struct inode *inode, struct dentry *dentry,
 			     umode_t mode)
 {
 	return -EIO;
 }
 
-static int bad_inode_set_acl(struct inode *inode, struct posix_acl *acl,
+static int bad_inode_set_acl(struct user_namespace *mnt_userns,
+			     struct inode *inode, struct posix_acl *acl,
 			     int type)
 {
 	return -EIO;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index be1335a8d25b..34d4f68f786b 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -75,8 +75,8 @@ const struct file_operations bfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 };
 
-static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-						bool excl)
+static int bfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, bool excl)
 {
 	int err;
 	struct inode *inode;
@@ -199,9 +199,9 @@ out_brelse:
 	return error;
 }
 
-static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags)
+static int bfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *old_inode, *new_inode;
 	struct buffer_head *old_bh, *new_bh;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index d12a5a8730a8..d95eb5c8cb37 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -107,7 +107,8 @@ out:
 	return ret;
 }
 
-int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		  struct posix_acl *acl, int type)
 {
 	int ret;
 	umode_t old_mode = inode->i_mode;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e6e37591f1de..9c0b43853cd2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3625,7 +3625,8 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
 struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
-int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		  struct posix_acl *acl, int type);
 int btrfs_init_acl(struct btrfs_trans_handle *trans,
 		   struct inode *inode, struct inode *dir);
 #else
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a63faed171de..c0b11db98e5e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5045,7 +5045,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 	return ret;
 }
 
-static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+			 struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -6352,8 +6353,8 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 	return err;
 }
 
-static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
-			umode_t mode, dev_t rdev)
+static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct btrfs_trans_handle *trans;
@@ -6416,8 +6417,8 @@ out_unlock:
 	return err;
 }
 
-static int btrfs_create(struct inode *dir, struct dentry *dentry,
-			umode_t mode, bool excl)
+static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct btrfs_trans_handle *trans;
@@ -6561,7 +6562,8 @@ fail:
 	return err;
 }
 
-static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct inode *inode = NULL;
@@ -8816,7 +8818,8 @@ fail:
 	return -ENOMEM;
 }
 
-static int btrfs_getattr(const struct path *path, struct kstat *stat,
+static int btrfs_getattr(struct user_namespace *mnt_userns,
+			 const struct path *path, struct kstat *stat,
 			 u32 request_mask, unsigned int flags)
 {
 	u64 delalloc_bytes;
@@ -9333,9 +9336,9 @@ out_notrans:
 	return ret;
 }
 
-static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
-			 struct inode *new_dir, struct dentry *new_dentry,
-			 unsigned int flags)
+static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+			 struct dentry *old_dentry, struct inode *new_dir,
+			 struct dentry *new_dentry, unsigned int flags)
 {
 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		return -EINVAL;
@@ -9543,8 +9546,8 @@ out:
 	return ret;
 }
 
-static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
-			 const char *symname)
+static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, const char *symname)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct btrfs_trans_handle *trans;
@@ -9878,7 +9881,8 @@ static int btrfs_set_page_dirty(struct page *page)
 	return __set_page_dirty_nobuffers(page);
 }
 
-static int btrfs_permission(struct inode *inode, int mask)
+static int btrfs_permission(struct user_namespace *mnt_userns,
+			    struct inode *inode, int mask)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	umode_t mode = inode->i_mode;
@@ -9893,7 +9897,8 @@ static int btrfs_permission(struct inode *inode, int mask)
 	return generic_permission(&init_user_ns, inode, mask);
 }
 
-static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct btrfs_trans_handle *trans;
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 52a01ddbc4ac..529af59d9fd3 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -82,7 +82,8 @@ retry:
 	return acl;
 }
 
-int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type)
 {
 	int ret = 0, size = 0;
 	const char *name = NULL;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 858ee7362ff5..83d9358854fb 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -823,8 +823,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
 	return PTR_ERR(result);
 }
 
-static int ceph_mknod(struct inode *dir, struct dentry *dentry,
-		      umode_t mode, dev_t rdev)
+static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
 	struct ceph_mds_request *req;
@@ -878,14 +878,14 @@ out:
 	return err;
 }
 
-static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		       bool excl)
+static int ceph_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
-	return ceph_mknod(dir, dentry, mode, 0);
+	return ceph_mknod(mnt_userns, dir, dentry, mode, 0);
 }
 
-static int ceph_symlink(struct inode *dir, struct dentry *dentry,
-			    const char *dest)
+static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, const char *dest)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
 	struct ceph_mds_request *req;
@@ -937,7 +937,8 @@ out:
 	return err;
 }
 
-static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
 	struct ceph_mds_request *req;
@@ -1183,9 +1184,9 @@ out:
 	return err;
 }
 
-static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry,
-		       unsigned int flags)
+static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		       struct dentry *old_dentry, struct inode *new_dir,
+		       struct dentry *new_dentry, unsigned int flags)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
 	struct ceph_mds_request *req;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 179a2bb88538..d6ece56d40e8 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2238,7 +2238,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
 /*
  * setattr
  */
-int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -2321,7 +2322,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
  * Check inode permissions.  We verify we have a valid value for
  * the AUTH cap, then call the generic handler.
  */
-int ceph_permission(struct inode *inode, int mask)
+int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		    int mask)
 {
 	int err;
 
@@ -2368,8 +2370,8 @@ static int statx_to_caps(u32 want, umode_t mode)
  * Get all the attributes. If we have sufficient caps for the requested attrs,
  * then we can avoid talking to the MDS at all.
  */
-int ceph_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int flags)
+int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct ceph_inode_info *ci = ceph_inode(inode);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b62d8fee3b86..1ef0a2a15817 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -973,10 +973,13 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
 {
 	return __ceph_do_getattr(inode, NULL, mask, force);
 }
-extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_permission(struct user_namespace *mnt_userns,
+			   struct inode *inode, int mask);
 extern int __ceph_setattr(struct inode *inode, struct iattr *attr);
-extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
-extern int ceph_getattr(const struct path *path, struct kstat *stat,
+extern int ceph_setattr(struct user_namespace *mnt_userns,
+			struct dentry *dentry, struct iattr *attr);
+extern int ceph_getattr(struct user_namespace *mnt_userns,
+			const struct path *path, struct kstat *stat,
 			u32 request_mask, unsigned int flags);
 
 /* xattr.c */
@@ -1037,7 +1040,8 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx);
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 
 struct posix_acl *ceph_get_acl(struct inode *, int);
-int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ceph_set_acl(struct user_namespace *mnt_userns,
+		 struct inode *inode, struct posix_acl *acl, int type);
 int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
 		       struct ceph_acl_sec_ctx *as_ctx);
 void ceph_init_inode_acls(struct inode *inode,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ce14e6f8adb6..39e51dcf796f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -305,7 +305,8 @@ static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len)
 	return -EOPNOTSUPP;
 }
 
-static int cifs_permission(struct inode *inode, int mask)
+static int cifs_permission(struct user_namespace *mnt_userns,
+			   struct inode *inode, int mask)
 {
 	struct cifs_sb_info *cifs_sb;
 
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2307bb0f6147..71e9c6abb2a6 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -62,19 +62,22 @@ extern void cifs_sb_deactive(struct super_block *sb);
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
 extern struct inode *cifs_root_iget(struct super_block *);
-extern int cifs_create(struct inode *, struct dentry *, umode_t,
-		       bool excl);
+extern int cifs_create(struct user_namespace *, struct inode *,
+		       struct dentry *, umode_t, bool excl);
 extern int cifs_atomic_open(struct inode *, struct dentry *,
 			    struct file *, unsigned, umode_t);
 extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
 				  unsigned int);
 extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
 extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
-extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
-extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
+extern int cifs_mknod(struct user_namespace *, struct inode *, struct dentry *,
+		      umode_t, dev_t);
+extern int cifs_mkdir(struct user_namespace *, struct inode *, struct dentry *,
+		      umode_t);
 extern int cifs_rmdir(struct inode *, struct dentry *);
-extern int cifs_rename2(struct inode *, struct dentry *, struct inode *,
-			struct dentry *, unsigned int);
+extern int cifs_rename2(struct user_namespace *, struct inode *,
+			struct dentry *, struct inode *, struct dentry *,
+			unsigned int);
 extern int cifs_revalidate_file_attr(struct file *filp);
 extern int cifs_revalidate_dentry_attr(struct dentry *);
 extern int cifs_revalidate_file(struct file *filp);
@@ -82,8 +85,10 @@ extern int cifs_revalidate_dentry(struct dentry *);
 extern int cifs_invalidate_mapping(struct inode *inode);
 extern int cifs_revalidate_mapping(struct inode *inode);
 extern int cifs_zap_mapping(struct inode *inode);
-extern int cifs_getattr(const struct path *, struct kstat *, u32, unsigned int);
-extern int cifs_setattr(struct dentry *, struct iattr *);
+extern int cifs_getattr(struct user_namespace *, const struct path *,
+			struct kstat *, u32, unsigned int);
+extern int cifs_setattr(struct user_namespace *, struct dentry *,
+			struct iattr *);
 extern int cifs_fiemap(struct inode *, struct fiemap_extent_info *, u64 start,
 		       u64 len);
 
@@ -132,8 +137,8 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
 /* Functions related to symlinks */
 extern const char *cifs_get_link(struct dentry *, struct inode *,
 			struct delayed_call *);
-extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
-			const char *symname);
+extern int cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
+			struct dentry *direntry, const char *symname);
 
 #ifdef CONFIG_CIFS_XATTR
 extern const struct xattr_handler *cifs_xattr_handlers[];
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 68900f1629bf..68f4f8536e6a 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -567,8 +567,8 @@ out_free_xid:
 	return rc;
 }
 
-int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
-		bool excl)
+int cifs_create(struct user_namespace *mnt_userns, struct inode *inode,
+		struct dentry *direntry, umode_t mode, bool excl)
 {
 	int rc;
 	unsigned int xid = get_xid();
@@ -611,8 +611,8 @@ out_free_xid:
 	return rc;
 }
 
-int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
-		dev_t device_number)
+int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode,
+	       struct dentry *direntry, umode_t mode, dev_t device_number)
 {
 	int rc = -EPERM;
 	unsigned int xid;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 374abce7efaf..3e9c7bb23f26 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1857,7 +1857,8 @@ posix_mkdir_get_info:
 	goto posix_mkdir_out;
 }
 
-int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
+int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode,
+	       struct dentry *direntry, umode_t mode)
 {
 	int rc = 0;
 	unsigned int xid;
@@ -2067,9 +2068,9 @@ do_rename_exit:
 }
 
 int
-cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
-	     struct inode *target_dir, struct dentry *target_dentry,
-	     unsigned int flags)
+cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
+	     struct dentry *source_dentry, struct inode *target_dir,
+	     struct dentry *target_dentry, unsigned int flags)
 {
 	char *from_name = NULL;
 	char *to_name = NULL;
@@ -2370,8 +2371,8 @@ int cifs_revalidate_dentry(struct dentry *dentry)
 	return cifs_revalidate_mapping(inode);
 }
 
-int cifs_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int flags)
+int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
@@ -2923,7 +2924,8 @@ cifs_setattr_exit:
 }
 
 int
-cifs_setattr(struct dentry *direntry, struct iattr *attrs)
+cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry,
+	     struct iattr *attrs)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
 	struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 94dab4309fbb..7c5878a645d9 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -661,7 +661,8 @@ cifs_get_link(struct dentry *direntry, struct inode *inode,
 }
 
 int
-cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
+cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
+	     struct dentry *direntry, const char *symname)
 {
 	int rc = -EOPNOTSUPP;
 	unsigned int xid;
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index d5ebd36fb2cc..e7b27754ce78 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -46,10 +46,12 @@ extern const struct file_operations coda_ioctl_operations;
 /* operations shared over more than one file */
 int coda_open(struct inode *i, struct file *f);
 int coda_release(struct inode *i, struct file *f);
-int coda_permission(struct inode *inode, int mask);
+int coda_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		    int mask);
 int coda_revalidate_inode(struct inode *);
-int coda_getattr(const struct path *, struct kstat *, u32, unsigned int);
-int coda_setattr(struct dentry *, struct iattr *);
+int coda_getattr(struct user_namespace *, const struct path *, struct kstat *,
+		 u32, unsigned int);
+int coda_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 
 /* this file:  heloers */
 char *coda_f2s(struct CodaFid *f);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ca40c2556ba6..d69989c1bac3 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -73,7 +73,8 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig
 }
 
 
-int coda_permission(struct inode *inode, int mask)
+int coda_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		    int mask)
 {
 	int error;
 
@@ -132,7 +133,8 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
 }
 
 /* creation routines: create, mknod, mkdir, link, symlink */
-static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, bool excl)
+static int coda_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *de, umode_t mode, bool excl)
 {
 	int error;
 	const char *name=de->d_name.name;
@@ -164,7 +166,8 @@ err_out:
 	return error;
 }
 
-static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode)
+static int coda_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *de, umode_t mode)
 {
 	struct inode *inode;
 	struct coda_vattr attrs;
@@ -225,7 +228,8 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
 }
 
 
-static int coda_symlink(struct inode *dir_inode, struct dentry *de,
+static int coda_symlink(struct user_namespace *mnt_userns,
+			struct inode *dir_inode, struct dentry *de,
 			const char *symname)
 {
 	const char *name = de->d_name.name;
@@ -291,9 +295,9 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
 }
 
 /* rename */
-static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry,
-		       unsigned int flags)
+static int coda_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		       struct dentry *old_dentry, struct inode *new_dir,
+		       struct dentry *new_dentry, unsigned int flags)
 {
 	const char *old_name = old_dentry->d_name.name;
 	const char *new_name = new_dentry->d_name.name;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 4d113e191cb8..d9f1bd7153df 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -251,8 +251,8 @@ static void coda_evict_inode(struct inode *inode)
 	coda_cache_clear_inode(inode);
 }
 
-int coda_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int flags)
+int coda_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	int err = coda_revalidate_inode(d_inode(path->dentry));
 	if (!err)
@@ -260,7 +260,8 @@ int coda_getattr(const struct path *path, struct kstat *stat,
 	return err;
 }
 
-int coda_setattr(struct dentry *de, struct iattr *iattr)
+int coda_setattr(struct user_namespace *mnt_userns, struct dentry *de,
+		 struct iattr *iattr)
 {
 	struct inode *inode = d_inode(de);
 	struct coda_vattr vattr;
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 3aec27e5eb82..cb9fd59a688c 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -24,7 +24,8 @@
 #include "coda_linux.h"
 
 /* pioctl ops */
-static int coda_ioctl_permission(struct inode *inode, int mask);
+static int coda_ioctl_permission(struct user_namespace *mnt_userns,
+				 struct inode *inode, int mask);
 static long coda_pioctl(struct file *filp, unsigned int cmd,
 			unsigned long user_data);
 
@@ -40,7 +41,8 @@ const struct file_operations coda_ioctl_operations = {
 };
 
 /* the coda pioctl inode ops */
-static int coda_ioctl_permission(struct inode *inode, int mask)
+static int coda_ioctl_permission(struct user_namespace *mnt_userns,
+				 struct inode *inode, int mask)
 {
 	return (mask & MAY_EXEC) ? -EACCES : 0;
 }
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 22dce2d35a4b..9a3aed249692 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -79,7 +79,8 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
 
 extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
 extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
-extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr);
+extern int configfs_setattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, struct iattr *iattr);
 
 extern struct dentry *configfs_pin_fs(void);
 extern void configfs_release_fs(void);
@@ -92,7 +93,8 @@ extern const struct inode_operations configfs_root_inode_operations;
 extern const struct inode_operations configfs_symlink_inode_operations;
 extern const struct dentry_operations configfs_dentry_ops;
 
-extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
+extern int configfs_symlink(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *dentry,
 			    const char *symname);
 extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
 
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index b839dd1b459f..b6098e02e20b 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1268,7 +1268,8 @@ out_root_unlock:
 }
 EXPORT_SYMBOL(configfs_depend_item_unlocked);
 
-static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int configfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode)
 {
 	int ret = 0;
 	int module_got = 0;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 8bd6a883c94c..42c348bb2903 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -40,7 +40,8 @@ static const struct inode_operations configfs_inode_operations ={
 	.setattr	= configfs_setattr,
 };
 
-int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
+int configfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		     struct iattr *iattr)
 {
 	struct inode * inode = d_inode(dentry);
 	struct configfs_dirent * sd = dentry->d_fsdata;
@@ -67,7 +68,7 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 	}
 	/* attributes were changed atleast once in past */
 
-	error = simple_setattr(dentry, iattr);
+	error = simple_setattr(mnt_userns, dentry, iattr);
 	if (error)
 		return error;
 
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 8ca36394fa30..77c854364e60 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -139,7 +139,8 @@ static int get_target(const char *symname, struct path *path,
 }
 
 
-int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+int configfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, const char *symname)
 {
 	int ret;
 	struct path path;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 2fcf66473436..c35249497b9b 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -42,13 +42,14 @@ static unsigned int debugfs_allow = DEFAULT_DEBUGFS_ALLOW_BITS;
  * so that we can use the file mode as part of a heuristic to determine whether
  * to lock down individual files.
  */
-static int debugfs_setattr(struct dentry *dentry, struct iattr *ia)
+static int debugfs_setattr(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, struct iattr *ia)
 {
 	int ret = security_locked_down(LOCKDOWN_DEBUGFS);
 
 	if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
 		return ret;
-	return simple_setattr(dentry, ia);
+	return simple_setattr(&init_user_ns, dentry, ia);
 }
 
 static const struct inode_operations debugfs_file_inode_operations = {
@@ -775,8 +776,8 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
 
 	take_dentry_name_snapshot(&old_name, old_dentry);
 
-	error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir),
-			      dentry, 0);
+	error = simple_rename(&init_user_ns, d_inode(old_dir), old_dentry,
+			      d_inode(new_dir), dentry, 0);
 	if (error) {
 		release_dentry_name_snapshot(&old_name);
 		goto exit;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 73e3d47e7b2d..55da9a91f51a 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -257,7 +257,8 @@ out:
  * Returns zero on success; non-zero on error condition
  */
 static int
-ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
+ecryptfs_create(struct user_namespace *mnt_userns,
+		struct inode *directory_inode, struct dentry *ecryptfs_dentry,
 		umode_t mode, bool excl)
 {
 	struct inode *ecryptfs_inode;
@@ -463,7 +464,8 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
 	return ecryptfs_do_unlink(dir, dentry, d_inode(dentry));
 }
 
-static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
+static int ecryptfs_symlink(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *dentry,
 			    const char *symname)
 {
 	int rc;
@@ -502,7 +504,8 @@ out_lock:
 	return rc;
 }
 
-static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode)
 {
 	int rc;
 	struct dentry *lower_dentry;
@@ -559,7 +562,8 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 }
 
 static int
-ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	       struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	int rc;
 	struct dentry *lower_dentry;
@@ -584,9 +588,9 @@ out:
 }
 
 static int
-ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		struct inode *new_dir, struct dentry *new_dentry,
-		unsigned int flags)
+ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		struct dentry *old_dentry, struct inode *new_dir,
+		struct dentry *new_dentry, unsigned int flags)
 {
 	int rc;
 	struct dentry *lower_old_dentry;
@@ -874,7 +878,8 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 }
 
 static int
-ecryptfs_permission(struct inode *inode, int mask)
+ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		    int mask)
 {
 	return inode_permission(&init_user_ns,
 				ecryptfs_inode_to_lower(inode), mask);
@@ -892,7 +897,8 @@ ecryptfs_permission(struct inode *inode, int mask)
  * All other metadata changes will be passed right to the lower filesystem,
  * and we will just update our inode to look like the lower.
  */
-static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
+static int ecryptfs_setattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, struct iattr *ia)
 {
 	int rc = 0;
 	struct dentry *lower_dentry;
@@ -979,7 +985,8 @@ out:
 	return rc;
 }
 
-static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat,
+static int ecryptfs_getattr_link(struct user_namespace *mnt_userns,
+				 const struct path *path, struct kstat *stat,
 				 u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
@@ -1004,7 +1011,8 @@ static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat,
 	return rc;
 }
 
-static int ecryptfs_getattr(const struct path *path, struct kstat *stat,
+static int ecryptfs_getattr(struct user_namespace *mnt_userns,
+			    const struct path *path, struct kstat *stat,
 			    u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 0297ad95eb5c..14e2947975fd 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -66,8 +66,8 @@ bool efivarfs_valid_name(const char *str, int len)
 	return uuid_is_valid(s);
 }
 
-static int efivarfs_create(struct inode *dir, struct dentry *dentry,
-			  umode_t mode, bool excl)
+static int efivarfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct inode *inode = NULL;
 	struct efivar_entry *var;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 083818063ac6..119fdce1b520 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -331,8 +331,9 @@ struct inode *erofs_iget(struct super_block *sb,
 	return inode;
 }
 
-int erofs_getattr(const struct path *path, struct kstat *stat,
-		  u32 request_mask, unsigned int query_flags)
+int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, u32 request_mask,
+		  unsigned int query_flags)
 {
 	struct inode *const inode = d_inode(path->dentry);
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 67a7ec945686..351dae524a0c 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -373,8 +373,9 @@ extern const struct inode_operations erofs_symlink_iops;
 extern const struct inode_operations erofs_fast_symlink_iops;
 
 struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir);
-int erofs_getattr(const struct path *path, struct kstat *stat,
-		  u32 request_mask, unsigned int query_flags);
+int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, u32 request_mask,
+		  unsigned int query_flags);
 
 /* namei.c */
 extern const struct inode_operations erofs_dir_iops;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index b8f0e829ecbd..d905bb9cd2ca 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -416,9 +416,11 @@ int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count);
 extern const struct file_operations exfat_file_operations;
 int __exfat_truncate(struct inode *inode, loff_t new_size);
 void exfat_truncate(struct inode *inode, loff_t size);
-int exfat_setattr(struct dentry *dentry, struct iattr *attr);
-int exfat_getattr(const struct path *path, struct kstat *stat,
-		unsigned int request_mask, unsigned int query_flags);
+int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr);
+int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, unsigned int request_mask,
+		  unsigned int query_flags);
 int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
 
 /* namei.c */
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index e9705b3295d3..3aa6eb4de5e3 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -267,8 +267,9 @@ write_size:
 	mutex_unlock(&sbi->s_lock);
 }
 
-int exfat_getattr(const struct path *path, struct kstat *stat,
-		unsigned int request_mask, unsigned int query_flags)
+int exfat_getattr(struct user_namespace *mnt_uerns, const struct path *path,
+		  struct kstat *stat, unsigned int request_mask,
+		  unsigned int query_flags)
 {
 	struct inode *inode = d_backing_inode(path->dentry);
 	struct exfat_inode_info *ei = EXFAT_I(inode);
@@ -282,7 +283,8 @@ int exfat_getattr(const struct path *path, struct kstat *stat,
 	return 0;
 }
 
-int exfat_setattr(struct dentry *dentry, struct iattr *attr)
+int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr)
 {
 	struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb);
 	struct inode *inode = dentry->d_inode;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 2932b23a3b6c..d9e8ec689c55 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -541,8 +541,8 @@ out:
 	return ret;
 }
 
-static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool excl)
+static int exfat_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode;
@@ -827,7 +827,8 @@ unlock:
 	return err;
 }
 
-static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int exfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode;
@@ -1318,9 +1319,10 @@ out:
 	return ret;
 }
 
-static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry,
-		struct inode *new_dir, struct dentry *new_dentry,
-		unsigned int flags)
+static int exfat_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry,
+			unsigned int flags)
 {
 	struct inode *old_inode, *new_inode;
 	struct super_block *sb = old_dir->i_sb;
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 9031f7df2d48..b9a9db98e94b 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -216,7 +216,8 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
  * inode->i_mutex: down
  */
 int
-ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+	     struct posix_acl *acl, int type)
 {
 	int error;
 	int update_mode = 0;
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 0f01c759daac..917db5f6630a 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -56,7 +56,8 @@ static inline int ext2_acl_count(size_t size)
 
 /* acl.c */
 extern struct posix_acl *ext2_get_acl(struct inode *inode, int type);
-extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+			struct posix_acl *acl, int type);
 extern int ext2_init_acl (struct inode *, struct inode *);
 
 #else
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 2a4175fbaf5e..3309fb2d327a 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -764,8 +764,9 @@ extern struct inode *ext2_iget (struct super_block *, unsigned long);
 extern int ext2_write_inode (struct inode *, struct writeback_control *);
 extern void ext2_evict_inode(struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
-extern int ext2_setattr (struct dentry *, struct iattr *);
-extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int);
+extern int ext2_setattr (struct user_namespace *, struct dentry *, struct iattr *);
+extern int ext2_getattr (struct user_namespace *, const struct path *,
+			 struct kstat *, u32, unsigned int);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 3d8acafca8ce..68178b2234bd 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1638,8 +1638,8 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
 
-int ext2_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int query_flags)
+int ext2_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1664,7 +1664,8 @@ int ext2_getattr(const struct path *path, struct kstat *stat,
 	return 0;
 }
 
-int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
+int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index ea980f1e2e99..3367384d344d 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -100,7 +100,9 @@ struct dentry *ext2_get_parent(struct dentry *child)
  * If the create succeeds, we fill in the inode information
  * with d_instantiate(). 
  */
-static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl)
+static int ext2_create (struct user_namespace * mnt_userns,
+			struct inode * dir, struct dentry * dentry,
+			umode_t mode, bool excl)
 {
 	struct inode *inode;
 	int err;
@@ -118,7 +120,8 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
 	return ext2_add_nondir(dentry, inode);
 }
 
-static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode = ext2_new_inode(dir, mode, NULL);
 	if (IS_ERR(inode))
@@ -131,7 +134,8 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return 0;
 }
 
-static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir,
+	struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode * inode;
 	int err;
@@ -151,8 +155,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode,
 	return err;
 }
 
-static int ext2_symlink (struct inode * dir, struct dentry * dentry,
-	const char * symname)
+static int ext2_symlink (struct user_namespace * mnt_userns, struct inode * dir,
+	struct dentry * dentry, const char * symname)
 {
 	struct super_block * sb = dir->i_sb;
 	int err = -ENAMETOOLONG;
@@ -225,7 +229,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
 	return err;
 }
 
-static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
+static int ext2_mkdir(struct user_namespace * mnt_userns,
+	struct inode * dir, struct dentry * dentry, umode_t mode)
 {
 	struct inode * inode;
 	int err;
@@ -315,8 +320,9 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry)
 	return err;
 }
 
-static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
-			struct inode * new_dir,	struct dentry * new_dentry,
+static int ext2_rename (struct user_namespace * mnt_userns,
+			struct inode * old_dir, struct dentry * old_dentry,
+			struct inode * new_dir, struct dentry * new_dentry,
 			unsigned int flags)
 {
 	struct inode * old_inode = d_inode(old_dentry);
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 7b0fb66bc04d..059434e0f36c 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -222,7 +222,8 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 }
 
 int
-ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+	     struct posix_acl *acl, int type)
 {
 	handle_t *handle;
 	int error, credits, retries = 0;
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9b63f5416a2f..84b8942a57f2 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -56,7 +56,8 @@ static inline int ext4_acl_count(size_t size)
 
 /* acl.c */
 struct posix_acl *ext4_get_acl(struct inode *inode, int type);
-int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT4_FS_POSIX_ACL */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2866d249f3d2..3c750f5e8ebd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2877,11 +2877,14 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 	__ext4_iget((sb), (ino), (flags), __func__, __LINE__)
 
 extern int  ext4_write_inode(struct inode *, struct writeback_control *);
-extern int  ext4_setattr(struct dentry *, struct iattr *);
-extern int  ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int  ext4_setattr(struct user_namespace *, struct dentry *,
+			 struct iattr *);
+extern int  ext4_getattr(struct user_namespace *, const struct path *,
+			 struct kstat *, u32, unsigned int);
 extern void ext4_evict_inode(struct inode *);
 extern void ext4_clear_inode(struct inode *);
-extern int  ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int  ext4_file_getattr(struct user_namespace *, const struct path *,
+			      struct kstat *, u32, unsigned int);
 extern int  ext4_sync_inode(handle_t *, struct inode *);
 extern void ext4_dirty_inode(struct inode *, int);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3a303d3f8423..ce45535336fa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5319,7 +5319,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
  *
  * Called with inode->i_mutex down.
  */
-int ext4_setattr(struct dentry *dentry, struct iattr *attr)
+int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error, rc = 0;
@@ -5535,8 +5536,8 @@ err_out:
 	return error;
 }
 
-int ext4_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int query_flags)
+int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct ext4_inode *raw_inode;
@@ -5575,13 +5576,14 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
 	return 0;
 }
 
-int ext4_file_getattr(const struct path *path, struct kstat *stat,
+int ext4_file_getattr(struct user_namespace *mnt_userns,
+		      const struct path *path, struct kstat *stat,
 		      u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	u64 delalloc_blocks;
 
-	ext4_getattr(path, stat, request_mask, query_flags);
+	ext4_getattr(&init_user_ns, path, stat, request_mask, query_flags);
 
 	/*
 	 * If there is inline data in the inode, the inode will normally not
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index cf652ba3e74d..13dff80aedcb 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2596,8 +2596,8 @@ static int ext4_add_nondir(handle_t *handle,
  * If the create succeeds, we fill in the inode information
  * with d_instantiate().
  */
-static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		       bool excl)
+static int ext4_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -2631,8 +2631,8 @@ retry:
 	return err;
 }
 
-static int ext4_mknod(struct inode *dir, struct dentry *dentry,
-		      umode_t mode, dev_t rdev)
+static int ext4_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -2665,7 +2665,8 @@ retry:
 	return err;
 }
 
-static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -2774,7 +2775,8 @@ out:
 	return err;
 }
 
-static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -3292,7 +3294,7 @@ out_trace:
 	return retval;
 }
 
-static int ext4_symlink(struct inode *dir,
+static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
 			struct dentry *dentry, const char *symname)
 {
 	handle_t *handle;
@@ -4085,7 +4087,8 @@ end_rename:
 	return retval;
 }
 
-static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
+static int ext4_rename2(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 6a95bf28f602..a19e86c9adac 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -249,7 +249,8 @@ static int __f2fs_set_acl(struct inode *inode, int type,
 	return error;
 }
 
-int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type)
 {
 	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
 		return -EIO;
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 124868c13f80..986fd1bc780b 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -34,7 +34,8 @@ struct f2fs_acl_header {
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
 
 extern struct posix_acl *f2fs_get_acl(struct inode *, int);
-extern int f2fs_set_acl(struct inode *, struct posix_acl *, int);
+extern int f2fs_set_acl(struct user_namespace *, struct inode *,
+			struct posix_acl *, int);
 extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
 							struct page *);
 #else
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index bb11759191dc..c9002b1933f0 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3135,9 +3135,10 @@ void f2fs_truncate_data_blocks(struct dnode_of_data *dn);
 int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock);
 int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock);
 int f2fs_truncate(struct inode *inode);
-int f2fs_getattr(const struct path *path, struct kstat *stat,
-			u32 request_mask, unsigned int flags);
-int f2fs_setattr(struct dentry *dentry, struct iattr *attr);
+int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int flags);
+int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr);
 int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end);
 void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
 int f2fs_precache_extents(struct inode *inode);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 44cd0dbdbb5d..8f1e97e7d242 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -783,8 +783,8 @@ int f2fs_truncate(struct inode *inode)
 	return 0;
 }
 
-int f2fs_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int query_flags)
+int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -859,7 +859,8 @@ static void __setattr_copy(struct user_namespace *mnt_userns,
 #define __setattr_copy setattr_copy
 #endif
 
-int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
+int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int err;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index ad98926bacac..c061a67e43a3 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -314,8 +314,8 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
 	}
 }
 
-static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-						bool excl)
+static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
@@ -637,8 +637,8 @@ static const char *f2fs_get_link(struct dentry *dentry,
 	return link;
 }
 
-static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
-					const char *symname)
+static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, const char *symname)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
@@ -717,7 +717,8 @@ out_free_encrypted_link:
 	return err;
 }
 
-static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
@@ -770,8 +771,8 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
 	return -ENOTEMPTY;
 }
 
-static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
-				umode_t mode, dev_t rdev)
+static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
@@ -874,7 +875,8 @@ out:
 	return err;
 }
 
-static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 
@@ -1247,7 +1249,8 @@ out:
 	return err;
 }
 
-static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+static int f2fs_rename2(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 922a0c6ba46c..02d4d4234956 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -397,9 +397,11 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
 			      unsigned long arg);
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
-extern int fat_setattr(struct dentry *dentry, struct iattr *attr);
+extern int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       struct iattr *attr);
 extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
-extern int fat_getattr(const struct path *path, struct kstat *stat,
+extern int fat_getattr(struct user_namespace *mnt_userns,
+		       const struct path *path, struct kstat *stat,
 		       u32 request_mask, unsigned int flags);
 extern int fat_file_fsync(struct file *file, loff_t start, loff_t end,
 			  int datasync);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f7e04f533d31..dd73d1b70c55 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -95,7 +95,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
 		goto out_unlock_inode;
 
 	/* This MUST be done before doing anything irreversible... */
-	err = fat_setattr(file->f_path.dentry, &ia);
+	err = fat_setattr(file_mnt_user_ns(file), file->f_path.dentry, &ia);
 	if (err)
 		goto out_unlock_inode;
 
@@ -394,8 +394,8 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset)
 	fat_flush_inodes(inode->i_sb, inode, NULL);
 }
 
-int fat_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int flags)
+int fat_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	generic_fillattr(&init_user_ns, inode, stat);
@@ -466,7 +466,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
 /* valid file mode bits */
 #define FAT_VALID_MODE	(S_IFREG | S_IFDIR | S_IRWXUGO)
 
-int fat_setattr(struct dentry *dentry, struct iattr *attr)
+int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct iattr *attr)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
 	struct inode *inode = d_inode(dentry);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 9d062886fbc1..a8f3375d9d10 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -261,8 +261,8 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
 }
 
 /***** Create a file */
-static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			bool excl)
+static int msdos_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode = NULL;
@@ -339,7 +339,8 @@ out:
 }
 
 /***** Make a directory */
-static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int msdos_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct super_block *sb = dir->i_sb;
 	struct fat_slot_info sinfo;
@@ -593,7 +594,8 @@ error_inode:
 }
 
 /***** Rename, a wrapper for rename_same_dir & rename_diff_dir */
-static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int msdos_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 0cdd0fb9f742..23936ecf79a5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -756,8 +756,8 @@ error:
 	return ERR_PTR(err);
 }
 
-static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		       bool excl)
+static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode;
@@ -846,7 +846,8 @@ out:
 	return err;
 }
 
-static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode;
@@ -892,9 +893,9 @@ out:
 	return err;
 }
 
-static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry,
-		       unsigned int flags)
+static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		       struct dentry *old_dentry, struct inode *new_dir,
+		       struct dentry *new_dentry, unsigned int flags)
 {
 	struct buffer_head *dotdot_bh;
 	struct msdos_dir_entry *dotdot_de;
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index f529075a2ce8..e9c0f916349d 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -50,7 +50,8 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type)
 	return acl;
 }
 
-int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	const char *name;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d2e318ed9b26..06a18700a845 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -605,7 +605,8 @@ out_err:
 	return err;
 }
 
-static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+static int fuse_mknod(struct user_namespace *, struct inode *, struct dentry *,
+		      umode_t, dev_t);
 static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
 			    struct file *file, unsigned flags,
 			    umode_t mode)
@@ -645,7 +646,7 @@ out_dput:
 	return err;
 
 mknod:
-	err = fuse_mknod(dir, entry, mode, 0);
+	err = fuse_mknod(&init_user_ns, dir, entry, mode, 0);
 	if (err)
 		goto out_dput;
 no_open:
@@ -715,8 +716,8 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
 	return err;
 }
 
-static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
-		      dev_t rdev)
+static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *entry, umode_t mode, dev_t rdev)
 {
 	struct fuse_mknod_in inarg;
 	struct fuse_mount *fm = get_fuse_mount(dir);
@@ -738,13 +739,14 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
 	return create_new_entry(fm, &args, dir, entry, mode);
 }
 
-static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
-		       bool excl)
+static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *entry, umode_t mode, bool excl)
 {
-	return fuse_mknod(dir, entry, mode, 0);
+	return fuse_mknod(&init_user_ns, dir, entry, mode, 0);
 }
 
-static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
+static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *entry, umode_t mode)
 {
 	struct fuse_mkdir_in inarg;
 	struct fuse_mount *fm = get_fuse_mount(dir);
@@ -765,8 +767,8 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
 	return create_new_entry(fm, &args, dir, entry, S_IFDIR);
 }
 
-static int fuse_symlink(struct inode *dir, struct dentry *entry,
-			const char *link)
+static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *entry, const char *link)
 {
 	struct fuse_mount *fm = get_fuse_mount(dir);
 	unsigned len = strlen(link) + 1;
@@ -908,9 +910,9 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
 	return err;
 }
 
-static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
-			struct inode *newdir, struct dentry *newent,
-			unsigned int flags)
+static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir,
+			struct dentry *oldent, struct inode *newdir,
+			struct dentry *newent, unsigned int flags)
 {
 	struct fuse_conn *fc = get_fuse_conn(olddir);
 	int err;
@@ -1249,7 +1251,8 @@ static int fuse_perm_getattr(struct inode *inode, int mask)
  * access request is sent.  Execute permission is still checked
  * locally based on file mode.
  */
-static int fuse_permission(struct inode *inode, int mask)
+static int fuse_permission(struct user_namespace *mnt_userns,
+			   struct inode *inode, int mask)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	bool refreshed = false;
@@ -1757,7 +1760,8 @@ error:
 	return err;
 }
 
-static int fuse_setattr(struct dentry *entry, struct iattr *attr)
+static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry,
+			struct iattr *attr)
 {
 	struct inode *inode = d_inode(entry);
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1819,7 +1823,8 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 	return ret;
 }
 
-static int fuse_getattr(const struct path *path, struct kstat *stat,
+static int fuse_getattr(struct user_namespace *mnt_userns,
+			const struct path *path, struct kstat *stat,
 			u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7c4b8cb93f9f..68cca8d4db6e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1180,8 +1180,8 @@ extern const struct xattr_handler *fuse_no_acl_xattr_handlers[];
 
 struct posix_acl;
 struct posix_acl *fuse_get_acl(struct inode *inode, int type);
-int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-
+int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type);
 
 /* readdir.c */
 int fuse_readdir(struct file *file, struct dir_context *ctx);
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index ce88ef29eef0..9165d70ead07 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -106,7 +106,8 @@ out:
 	return error;
 }
 
-int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 61353a1501c5..eccc6a43326c 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,6 +13,7 @@
 
 extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
 extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+			struct posix_acl *acl, int type);
 
 #endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 1d994bdfffaa..8f5523822788 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -256,7 +256,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask,
 	    !capable(CAP_LINUX_IMMUTABLE))
 		goto out;
 	if (!IS_IMMUTABLE(inode)) {
-		error = gfs2_permission(inode, MAY_WRITE);
+		error = gfs2_permission(&init_user_ns, inode, MAY_WRITE);
 		if (error)
 			goto out;
 	}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 226b5b1dc1fa..cfac2c1e67fa 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -325,7 +325,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 	}
 
 	if (!is_root) {
-		error = gfs2_permission(dir, MAY_EXEC);
+		error = gfs2_permission(&init_user_ns, dir, MAY_EXEC);
 		if (error)
 			goto out;
 	}
@@ -355,7 +355,8 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 {
 	int error;
 
-	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+	error = gfs2_permission(&init_user_ns, &dip->i_inode,
+				MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 
@@ -843,8 +844,8 @@ fail:
  * Returns: errno
  */
 
-static int gfs2_create(struct inode *dir, struct dentry *dentry,
-		       umode_t mode, bool excl)
+static int gfs2_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
 	return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl);
 }
@@ -951,7 +952,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (inode->i_nlink == 0)
 		goto out_gunlock;
 
-	error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
+	error = gfs2_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		goto out_gunlock;
 
@@ -1068,7 +1069,8 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 	if (IS_APPEND(&dip->i_inode))
 		return -EPERM;
 
-	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+	error = gfs2_permission(&init_user_ns, &dip->i_inode,
+				MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 
@@ -1204,8 +1206,8 @@ out_inodes:
  * Returns: errno
  */
 
-static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
-			const char *symname)
+static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, const char *symname)
 {
 	unsigned int size;
 
@@ -1225,7 +1227,8 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
  * Returns: errno
  */
 
-static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir));
 	return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0);
@@ -1240,8 +1243,8 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
  *
  */
 
-static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      dev_t dev)
+static int gfs2_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0);
 }
@@ -1490,7 +1493,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			}
 		}
 	} else {
-		error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
+		error = gfs2_permission(&init_user_ns, ndir,
+					MAY_WRITE | MAY_EXEC);
 		if (error)
 			goto out_gunlock;
 
@@ -1525,7 +1529,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	/* Check out the dir to be renamed */
 
 	if (dir_rename) {
-		error = gfs2_permission(d_inode(odentry), MAY_WRITE);
+		error = gfs2_permission(&init_user_ns, d_inode(odentry),
+					MAY_WRITE);
 		if (error)
 			goto out_gunlock;
 	}
@@ -1688,12 +1693,14 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry,
 		goto out_gunlock;
 
 	if (S_ISDIR(old_mode)) {
-		error = gfs2_permission(odentry->d_inode, MAY_WRITE);
+		error = gfs2_permission(&init_user_ns, odentry->d_inode,
+					MAY_WRITE);
 		if (error)
 			goto out_gunlock;
 	}
 	if (S_ISDIR(new_mode)) {
-		error = gfs2_permission(ndentry->d_inode, MAY_WRITE);
+		error = gfs2_permission(&init_user_ns, ndentry->d_inode,
+					MAY_WRITE);
 		if (error)
 			goto out_gunlock;
 	}
@@ -1747,9 +1754,9 @@ out:
 	return error;
 }
 
-static int gfs2_rename2(struct inode *odir, struct dentry *odentry,
-			struct inode *ndir, struct dentry *ndentry,
-			unsigned int flags)
+static int gfs2_rename2(struct user_namespace *mnt_userns, struct inode *odir,
+			struct dentry *odentry, struct inode *ndir,
+			struct dentry *ndentry, unsigned int flags)
 {
 	flags &= ~RENAME_NOREPLACE;
 
@@ -1833,7 +1840,8 @@ out:
  * Returns: errno
  */
 
-int gfs2_permission(struct inode *inode, int mask)
+int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		    int mask)
 {
 	struct gfs2_inode *ip;
 	struct gfs2_holder i_gh;
@@ -1963,7 +1971,8 @@ out:
  * Returns: errno
  */
 
-static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
+static int gfs2_setattr(struct user_namespace *mnt_userns,
+			struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -2008,6 +2017,7 @@ out:
 
 /**
  * gfs2_getattr - Read out an inode's attributes
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @path: Object to query
  * @stat: The inode's stats
  * @request_mask: Mask of STATX_xxx flags indicating the caller's interests
@@ -2022,7 +2032,8 @@ out:
  * Returns: errno
  */
 
-static int gfs2_getattr(const struct path *path, struct kstat *stat,
+static int gfs2_getattr(struct user_namespace *mnt_userns,
+			const struct path *path, struct kstat *stat,
 			u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 8073b8d2c7fa..c447bd5b3017 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -99,7 +99,8 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip);
 
 extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 				  int is_root);
-extern int gfs2_permission(struct inode *inode, int mask);
+extern int gfs2_permission(struct user_namespace *mnt_userns,
+			   struct inode *inode, int mask);
 extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 3bf2ae0e467c..527f6e46cbe8 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -189,8 +189,8 @@ static int hfs_dir_release(struct inode *inode, struct file *file)
  * a directory and return a corresponding inode, given the inode for
  * the directory and the name (and its length) of the new file.
  */
-static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl)
+static int hfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct inode *inode;
 	int res;
@@ -219,7 +219,8 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
  * in a directory, given the inode for the parent directory and the
  * name (and its length) of the new directory.
  */
-static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int hfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	int res;
@@ -279,9 +280,9 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
  * new file/directory.
  * XXX: how do you handle must_be dir?
  */
-static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags)
+static int hfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags)
 {
 	int res;
 
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index f71c384064c8..b8eb0322a3e5 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -204,7 +204,8 @@ extern const struct address_space_operations hfs_btree_aops;
 extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t);
 extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
 extern int hfs_write_inode(struct inode *, struct writeback_control *);
-extern int hfs_inode_setattr(struct dentry *, struct iattr *);
+extern int hfs_inode_setattr(struct user_namespace *, struct dentry *,
+			     struct iattr *);
 extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
 			__be32 log_size, __be32 phys_size, u32 clump_size);
 extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index c646218b72bf..3fc5cb346586 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -602,13 +602,15 @@ static int hfs_file_release(struct inode *inode, struct file *file)
  *     correspond to the same HFS file.
  */
 
-int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
+int hfs_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		      struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
 	int error;
 
-	error = setattr_prepare(&init_user_ns, dentry, attr); /* basic permission checks */
+	error = setattr_prepare(&init_user_ns, dentry,
+				attr); /* basic permission checks */
 	if (error)
 		return error;
 
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 29a9dcfbe81f..03e6c046faf4 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -434,8 +434,8 @@ out:
 	return res;
 }
 
-static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
-			   const char *symname)
+static int hfsplus_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, const char *symname)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode;
@@ -476,8 +476,8 @@ out:
 	return res;
 }
 
-static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
-			 umode_t mode, dev_t rdev)
+static int hfsplus_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode;
@@ -517,18 +517,20 @@ out:
 	return res;
 }
 
-static int hfsplus_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			  bool excl)
+static int hfsplus_create(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode, bool excl)
 {
-	return hfsplus_mknod(dir, dentry, mode, 0);
+	return hfsplus_mknod(&init_user_ns, dir, dentry, mode, 0);
 }
 
-static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int hfsplus_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode)
 {
-	return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
+	return hfsplus_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0);
 }
 
-static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int hfsplus_rename(struct user_namespace *mnt_userns,
+			  struct inode *old_dir, struct dentry *old_dentry,
 			  struct inode *new_dir, struct dentry *new_dentry,
 			  unsigned int flags)
 {
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index a92de5199ec3..12b20479ed2b 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -488,8 +488,9 @@ void hfsplus_inode_write_fork(struct inode *inode,
 			      struct hfsplus_fork_raw *fork);
 int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd);
 int hfsplus_cat_write_inode(struct inode *inode);
-int hfsplus_getattr(const struct path *path, struct kstat *stat,
-		    u32 request_mask, unsigned int query_flags);
+int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		    struct kstat *stat, u32 request_mask,
+		    unsigned int query_flags);
 int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 		       int datasync);
 
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 642e067d8fe8..7a937de9b2ad 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -241,7 +241,8 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
+static int hfsplus_setattr(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
@@ -270,8 +271,9 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 	return 0;
 }
 
-int hfsplus_getattr(const struct path *path, struct kstat *stat,
-		    u32 request_mask, unsigned int query_flags)
+int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		    struct kstat *stat, u32 request_mask,
+		    unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 6970e29a5287..7c918cd816a3 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -555,8 +555,8 @@ static int read_name(struct inode *ino, char *name)
 	return 0;
 }
 
-static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			 bool excl)
+static int hostfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct inode *inode;
 	char *name;
@@ -654,8 +654,8 @@ static int hostfs_unlink(struct inode *ino, struct dentry *dentry)
 	return err;
 }
 
-static int hostfs_symlink(struct inode *ino, struct dentry *dentry,
-			  const char *to)
+static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino,
+			  struct dentry *dentry, const char *to)
 {
 	char *file;
 	int err;
@@ -667,7 +667,8 @@ static int hostfs_symlink(struct inode *ino, struct dentry *dentry,
 	return err;
 }
 
-static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
+static int hostfs_mkdir(struct user_namespace *mnt_userns, struct inode *ino,
+			struct dentry *dentry, umode_t mode)
 {
 	char *file;
 	int err;
@@ -691,7 +692,8 @@ static int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
 	return err;
 }
 
-static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	struct inode *inode;
 	char *name;
@@ -729,7 +731,8 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return err;
 }
 
-static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+static int hostfs_rename2(struct user_namespace *mnt_userns,
+			  struct inode *old_dir, struct dentry *old_dentry,
 			  struct inode *new_dir, struct dentry *new_dentry,
 			  unsigned int flags)
 {
@@ -757,7 +760,8 @@ static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
 	return err;
 }
 
-static int hostfs_permission(struct inode *ino, int desired)
+static int hostfs_permission(struct user_namespace *mnt_userns,
+			     struct inode *ino, int desired)
 {
 	char *name;
 	int r = 0, w = 0, x = 0, err;
@@ -783,7 +787,8 @@ static int hostfs_permission(struct inode *ino, int desired)
 	return err;
 }
 
-static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
+static int hostfs_setattr(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct hostfs_iattr attrs;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 1cca83218fb5..167ec6884642 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -280,7 +280,7 @@ void hpfs_init_inode(struct inode *);
 void hpfs_read_inode(struct inode *);
 void hpfs_write_inode(struct inode *);
 void hpfs_write_inode_nolock(struct inode *);
-int hpfs_setattr(struct dentry *, struct iattr *);
+int hpfs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 void hpfs_write_if_changed(struct inode *);
 void hpfs_evict_inode(struct inode *);
 
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 8ba2152a78ba..82208cc28ebd 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -257,7 +257,8 @@ void hpfs_write_inode_nolock(struct inode *i)
 	brelse(bh);
 }
 
-int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
+int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error = -EINVAL;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 1aee39160ac5..d73f8a67168e 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -20,7 +20,8 @@ static void hpfs_update_directory_times(struct inode *dir)
 	hpfs_write_inode_nolock(dir);
 }
 
-static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int hpfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
@@ -128,7 +129,8 @@ bail:
 	return err;
 }
 
-static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+static int hpfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
 	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
@@ -215,7 +217,8 @@ bail:
 	return err;
 }
 
-static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+static int hpfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
@@ -289,7 +292,8 @@ bail:
 	return err;
 }
 
-static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink)
+static int hpfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, const char *symlink)
 {
 	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
@@ -506,10 +510,10 @@ fail:
 const struct address_space_operations hpfs_symlink_aops = {
 	.readpage	= hpfs_symlink_readpage
 };
-	
-static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry,
-		       unsigned int flags)
+
+static int hpfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		       struct dentry *old_dentry, struct inode *new_dir,
+		       struct dentry *new_dentry, unsigned int flags)
 {
 	const unsigned char *old_name = old_dentry->d_name.name;
 	unsigned old_len = old_dentry->d_name.len;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 327e572b4e00..c5c32eb59498 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -751,7 +751,8 @@ out:
 	return error;
 }
 
-static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
+static int hugetlbfs_setattr(struct user_namespace *mnt_userns,
+			     struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct hstate *h = hstate_inode(inode);
@@ -898,33 +899,39 @@ static int do_hugetlbfs_mknod(struct inode *dir,
 	return error;
 }
 
-static int hugetlbfs_mknod(struct inode *dir,
-			struct dentry *dentry, umode_t mode, dev_t dev)
+static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
 }
 
-static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, umode_t mode)
 {
-	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+	int retval = hugetlbfs_mknod(&init_user_ns, dir, dentry,
+				     mode | S_IFDIR, 0);
 	if (!retval)
 		inc_nlink(dir);
 	return retval;
 }
 
-static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+static int hugetlbfs_create(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *dentry,
+			    umode_t mode, bool excl)
 {
-	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
+	return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
 }
 
-static int hugetlbfs_tmpfile(struct inode *dir,
-			struct dentry *dentry, umode_t mode)
+static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns,
+			     struct inode *dir, struct dentry *dentry,
+			     umode_t mode)
 {
 	return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
 }
 
-static int hugetlbfs_symlink(struct inode *dir,
-			struct dentry *dentry, const char *symname)
+static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
+			     struct inode *dir, struct dentry *dentry,
+			     const char *symname)
 {
 	struct inode *inode;
 	int error = -ENOSPC;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 5f27ac593479..55a79df70d24 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -226,7 +226,8 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a
 	return rc;
 }
 
-int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		  struct posix_acl *acl, int type)
 {
 	int rc, xprefix;
 
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 12d0271bdde3..62c50da9d493 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -28,7 +28,8 @@ struct jffs2_acl_header {
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 
 struct posix_acl *jffs2_get_acl(struct inode *inode, int type);
-int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		  struct posix_acl *acl, int type);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
 extern int jffs2_init_acl_post(struct inode *);
 
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 776493713153..c0aabbcbfd58 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -24,18 +24,21 @@
 
 static int jffs2_readdir (struct file *, struct dir_context *);
 
-static int jffs2_create (struct inode *,struct dentry *,umode_t,
-			 bool);
+static int jffs2_create (struct user_namespace *, struct inode *,
+		         struct dentry *, umode_t, bool);
 static struct dentry *jffs2_lookup (struct inode *,struct dentry *,
 				    unsigned int);
 static int jffs2_link (struct dentry *,struct inode *,struct dentry *);
 static int jffs2_unlink (struct inode *,struct dentry *);
-static int jffs2_symlink (struct inode *,struct dentry *,const char *);
-static int jffs2_mkdir (struct inode *,struct dentry *,umode_t);
+static int jffs2_symlink (struct user_namespace *, struct inode *,
+			  struct dentry *, const char *);
+static int jffs2_mkdir (struct user_namespace *, struct inode *,struct dentry *,
+			umode_t);
 static int jffs2_rmdir (struct inode *,struct dentry *);
-static int jffs2_mknod (struct inode *,struct dentry *,umode_t,dev_t);
-static int jffs2_rename (struct inode *, struct dentry *,
-			 struct inode *, struct dentry *,
+static int jffs2_mknod (struct user_namespace *, struct inode *,struct dentry *,
+			umode_t,dev_t);
+static int jffs2_rename (struct user_namespace *, struct inode *,
+			 struct dentry *, struct inode *, struct dentry *,
 			 unsigned int);
 
 const struct file_operations jffs2_dir_operations =
@@ -157,8 +160,8 @@ static int jffs2_readdir(struct file *file, struct dir_context *ctx)
 /***********************************************************************/
 
 
-static int jffs2_create(struct inode *dir_i, struct dentry *dentry,
-			umode_t mode, bool excl)
+static int jffs2_create(struct user_namespace *mnt_userns, struct inode *dir_i,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct jffs2_raw_inode *ri;
 	struct jffs2_inode_info *f, *dir_f;
@@ -276,7 +279,8 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
 
 /***********************************************************************/
 
-static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char *target)
+static int jffs2_symlink (struct user_namespace *mnt_userns, struct inode *dir_i,
+			  struct dentry *dentry, const char *target)
 {
 	struct jffs2_inode_info *f, *dir_f;
 	struct jffs2_sb_info *c;
@@ -438,7 +442,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 }
 
 
-static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode)
+static int jffs2_mkdir (struct user_namespace *mnt_userns, struct inode *dir_i,
+		        struct dentry *dentry, umode_t mode)
 {
 	struct jffs2_inode_info *f, *dir_f;
 	struct jffs2_sb_info *c;
@@ -609,7 +614,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
 	return ret;
 }
 
-static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode, dev_t rdev)
+static int jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i,
+		        struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct jffs2_inode_info *f, *dir_f;
 	struct jffs2_sb_info *c;
@@ -756,7 +762,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode
 	return ret;
 }
 
-static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
+static int jffs2_rename (struct user_namespace *mnt_userns,
+			 struct inode *old_dir_i, struct dentry *old_dentry,
 			 struct inode *new_dir_i, struct dentry *new_dentry,
 			 unsigned int flags)
 {
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index ee9f51bab4c6..2ac410477c4f 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -190,7 +190,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	return 0;
 }
 
-int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
+int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	int rc;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index ef1cfa61549e..173eccac691d 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -164,7 +164,7 @@ long jffs2_ioctl(struct file *, unsigned int, unsigned long);
 extern const struct inode_operations jffs2_symlink_inode_operations;
 
 /* fs.c */
-int jffs2_setattr (struct dentry *, struct iattr *);
+int jffs2_setattr (struct user_namespace *, struct dentry *, struct iattr *);
 int jffs2_do_setattr (struct inode *, struct iattr *);
 struct inode *jffs2_iget(struct super_block *, unsigned long);
 void jffs2_evict_inode (struct inode *);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index cf79a34bfada..43c285c3d2a7 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -91,7 +91,8 @@ out:
 	return rc;
 }
 
-int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		struct posix_acl *acl, int type)
 {
 	int rc;
 	tid_t tid;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 61c3b0c1fbf6..28b70e7c7dd4 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -85,7 +85,8 @@ static int jfs_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
+int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	int rc;
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 9f8f92dd6f84..7ae389a7a366 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -8,7 +8,8 @@
 #ifdef CONFIG_JFS_POSIX_ACL
 
 struct posix_acl *jfs_get_acl(struct inode *inode, int type);
-int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		struct posix_acl *acl, int type);
 int jfs_init_acl(tid_t, struct inode *, struct inode *);
 
 #else
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 70a0d12e427e..01daa0cb0ae5 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -26,7 +26,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
 	int fh_len, int fh_type);
 extern void jfs_set_inode_flags(struct inode *);
 extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
-extern int jfs_setattr(struct dentry *, struct iattr *);
+extern int jfs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 
 extern const struct address_space_operations jfs_aops;
 extern const struct inode_operations jfs_dir_inode_operations;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 7a55d14cc1af..9abed0d750e5 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -59,8 +59,8 @@ static inline void free_ea_wmap(struct inode *inode)
  * RETURN:	Errors from subroutines
  *
  */
-static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
-		bool excl)
+static int jfs_create(struct user_namespace *mnt_userns, struct inode *dip,
+		      struct dentry *dentry, umode_t mode, bool excl)
 {
 	int rc = 0;
 	tid_t tid;		/* transaction id */
@@ -192,7 +192,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
  * note:
  * EACCES: user needs search+write permission on the parent directory
  */
-static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
+static int jfs_mkdir(struct user_namespace *mnt_userns, struct inode *dip,
+		     struct dentry *dentry, umode_t mode)
 {
 	int rc = 0;
 	tid_t tid;		/* transaction id */
@@ -868,8 +869,8 @@ static int jfs_link(struct dentry *old_dentry,
  * an intermediate result whose length exceeds PATH_MAX [XPG4.2]
 */
 
-static int jfs_symlink(struct inode *dip, struct dentry *dentry,
-		const char *name)
+static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip,
+		       struct dentry *dentry, const char *name)
 {
 	int rc;
 	tid_t tid;
@@ -1058,9 +1059,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
  *
  * FUNCTION:	rename a file or directory
  */
-static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags)
+static int jfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags)
 {
 	struct btstack btstack;
 	ino_t ino;
@@ -1344,8 +1345,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
  *
  * FUNCTION:	Create a special file (device)
  */
-static int jfs_mknod(struct inode *dir, struct dentry *dentry,
-		umode_t mode, dev_t rdev)
+static int jfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct jfs_inode_info *jfs_ip;
 	struct btstack btstack;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 7a53eed69fef..7e0e62deab53 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1110,7 +1110,8 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
 	return ret;
 }
 
-static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
+static int kernfs_iop_mkdir(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *dentry,
 			    umode_t mode)
 {
 	struct kernfs_node *parent = dir->i_private;
@@ -1147,7 +1148,8 @@ static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
 	return ret;
 }
 
-static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int kernfs_iop_rename(struct user_namespace *mnt_userns,
+			     struct inode *old_dir, struct dentry *old_dentry,
 			     struct inode *new_dir, struct dentry *new_dentry,
 			     unsigned int flags)
 {
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 032d3d7546d8..d73950fc3d57 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -112,7 +112,8 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 	return ret;
 }
 
-int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
+int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct kernfs_node *kn = inode->i_private;
@@ -183,7 +184,8 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
 		set_nlink(inode, kn->dir.subdirs + 2);
 }
 
-int kernfs_iop_getattr(const struct path *path, struct kstat *stat,
+int kernfs_iop_getattr(struct user_namespace *mnt_userns,
+		       const struct path *path, struct kstat *stat,
 		       u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
@@ -272,7 +274,8 @@ void kernfs_evict_inode(struct inode *inode)
 	kernfs_put(kn);
 }
 
-int kernfs_iop_permission(struct inode *inode, int mask)
+int kernfs_iop_permission(struct user_namespace *mnt_userns,
+			  struct inode *inode, int mask)
 {
 	struct kernfs_node *kn;
 
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 7ee97ef59184..ccc3b44f6306 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -89,9 +89,12 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
  */
 extern const struct xattr_handler *kernfs_xattr_handlers[];
 void kernfs_evict_inode(struct inode *inode);
-int kernfs_iop_permission(struct inode *inode, int mask);
-int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
-int kernfs_iop_getattr(const struct path *path, struct kstat *stat,
+int kernfs_iop_permission(struct user_namespace *mnt_userns,
+			  struct inode *inode, int mask);
+int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       struct iattr *iattr);
+int kernfs_iop_getattr(struct user_namespace *mnt_userns,
+		       const struct path *path, struct kstat *stat,
 		       u32 request_mask, unsigned int query_flags);
 ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
 int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
diff --git a/fs/libfs.c b/fs/libfs.c
index 508e9ea8e6f3..967aefda6ee3 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -27,8 +27,9 @@
 
 #include "internal.h"
 
-int simple_getattr(const struct path *path, struct kstat *stat,
-		   u32 request_mask, unsigned int query_flags)
+int simple_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		   struct kstat *stat, u32 request_mask,
+		   unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	generic_fillattr(&init_user_ns, inode, stat);
@@ -447,9 +448,9 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
 }
 EXPORT_SYMBOL(simple_rmdir);
 
-int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
-		  struct inode *new_dir, struct dentry *new_dentry,
-		  unsigned int flags)
+int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		  struct dentry *old_dentry, struct inode *new_dir,
+		  struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *inode = d_inode(old_dentry);
 	int they_are_dirs = d_is_dir(old_dentry);
@@ -492,18 +493,19 @@ EXPORT_SYMBOL(simple_rename);
  * on simple regular filesystems.  Anything that needs to change on-disk
  * or wire state on size changes needs its own setattr method.
  */
-int simple_setattr(struct dentry *dentry, struct iattr *iattr)
+int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(&init_user_ns, dentry, iattr);
+	error = setattr_prepare(mnt_userns, dentry, iattr);
 	if (error)
 		return error;
 
 	if (iattr->ia_valid & ATTR_SIZE)
 		truncate_setsize(inode, iattr->ia_size);
-	setattr_copy(&init_user_ns, inode, iattr);
+	setattr_copy(mnt_userns, inode, iattr);
 	mark_inode_dirty(inode);
 	return 0;
 }
@@ -1300,7 +1302,8 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry,
 	return ERR_PTR(-ENOENT);
 }
 
-static int empty_dir_getattr(const struct path *path, struct kstat *stat,
+static int empty_dir_getattr(struct user_namespace *mnt_userns,
+			     const struct path *path, struct kstat *stat,
 			     u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
@@ -1308,7 +1311,8 @@ static int empty_dir_getattr(const struct path *path, struct kstat *stat,
 	return 0;
 }
 
-static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
+static int empty_dir_setattr(struct user_namespace *mnt_userns,
+			     struct dentry *dentry, struct iattr *attr)
 {
 	return -EPERM;
 }
@@ -1318,14 +1322,9 @@ static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t siz
 	return -EOPNOTSUPP;
 }
 
-static int empty_dir_permission(struct inode *inode, int mask)
-{
-	return generic_permission(&init_user_ns, inode, mask);
-}
-
 static const struct inode_operations empty_dir_inode_operations = {
 	.lookup		= empty_dir_lookup,
-	.permission	= empty_dir_permission,
+	.permission	= generic_permission,
 	.setattr	= empty_dir_setattr,
 	.getattr	= empty_dir_getattr,
 	.listxattr	= empty_dir_listxattr,
diff --git a/fs/minix/file.c b/fs/minix/file.c
index f07acd268577..6a7bd2d9eec0 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -22,7 +22,8 @@ const struct file_operations minix_file_operations = {
 	.splice_read	= generic_file_splice_read,
 };
 
-static int minix_setattr(struct dentry *dentry, struct iattr *attr)
+static int minix_setattr(struct user_namespace *mnt_userns,
+			 struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 91c81d2fc90d..a532a99bbe81 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -652,8 +652,8 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return err;
 }
 
-int minix_getattr(const struct path *path, struct kstat *stat,
-		  u32 request_mask, unsigned int flags)
+int minix_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct super_block *sb = path->dentry->d_sb;
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 168d45d3de73..202173368025 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -51,7 +51,8 @@ extern unsigned long minix_count_free_inodes(struct super_block *sb);
 extern int minix_new_block(struct inode * inode);
 extern void minix_free_block(struct inode *inode, unsigned long block);
 extern unsigned long minix_count_free_blocks(struct super_block *sb);
-extern int minix_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int minix_getattr(struct user_namespace *, const struct path *,
+			 struct kstat *, u32, unsigned int);
 extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
 
 extern void V1_minix_truncate(struct inode *);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 1a6084d2b02e..937fa5fae2b8 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -33,7 +33,8 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un
 	return d_splice_alias(inode, dentry);
 }
 
-static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	int error;
 	struct inode *inode;
@@ -51,7 +52,8 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode,
 	return error;
 }
 
-static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode)
 {
 	int error;
 	struct inode *inode = minix_new_inode(dir, mode, &error);
@@ -63,14 +65,14 @@ static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return error;
 }
 
-static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool excl)
+static int minix_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
-	return minix_mknod(dir, dentry, mode, 0);
+	return minix_mknod(mnt_userns, dir, dentry, mode, 0);
 }
 
-static int minix_symlink(struct inode * dir, struct dentry *dentry,
-	  const char * symname)
+static int minix_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, const char *symname)
 {
 	int err = -ENAMETOOLONG;
 	int i = strlen(symname)+1;
@@ -109,7 +111,8 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
 	return add_nondir(dentry, inode);
 }
 
-static int minix_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)
+static int minix_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct inode * inode;
 	int err;
@@ -181,8 +184,9 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
 	return err;
 }
 
-static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
-			struct inode * new_dir, struct dentry *new_dentry,
+static int minix_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
 	struct inode * old_inode = d_inode(old_dentry);
diff --git a/fs/namei.c b/fs/namei.c
index c8c083daf368..d9ceb75ac169 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -443,7 +443,7 @@ static inline int do_inode_permission(struct user_namespace *mnt_userns,
 {
 	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
 		if (likely(inode->i_op->permission))
-			return inode->i_op->permission(inode, mask);
+			return inode->i_op->permission(mnt_userns, inode, mask);
 
 		/* This gets set once for the inode lifetime */
 		spin_lock(&inode->i_lock);
@@ -2199,11 +2199,13 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 
 	/* At this point we know we have a real path component. */
 	for(;;) {
+		struct user_namespace *mnt_userns;
 		const char *link;
 		u64 hash_len;
 		int type;
 
-		err = may_lookup(&init_user_ns, nd);
+		mnt_userns = mnt_user_ns(nd->path.mnt);
+		err = may_lookup(mnt_userns, nd);
 		if (err)
 			return err;
 
@@ -2251,7 +2253,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 OK:
 			/* pathname or trailing symlink, done */
 			if (!depth) {
-				nd->dir_uid = i_uid_into_mnt(&init_user_ns, nd->inode);
+				nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
 				nd->dir_mode = nd->inode->i_mode;
 				nd->flags &= ~LOOKUP_PARENT;
 				return 0;
@@ -2904,7 +2906,7 @@ int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
 	error = security_inode_create(dir, dentry, mode);
 	if (error)
 		return error;
-	error = dir->i_op->create(dir, dentry, mode, want_excl);
+	error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl);
 	if (!error)
 		fsnotify_create(dir, dentry);
 	return error;
@@ -2995,7 +2997,7 @@ static int may_open(struct user_namespace *mnt_userns, const struct path *path,
 	return 0;
 }
 
-static int handle_truncate(struct file *filp)
+static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
 {
 	const struct path *path = &filp->f_path;
 	struct inode *inode = path->dentry->d_inode;
@@ -3009,7 +3011,7 @@ static int handle_truncate(struct file *filp)
 	if (!error)
 		error = security_path_truncate(path);
 	if (!error) {
-		error = do_truncate(&init_user_ns, path->dentry, 0,
+		error = do_truncate(mnt_userns, path->dentry, 0,
 				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
 				    filp);
 	}
@@ -3118,6 +3120,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 				  const struct open_flags *op,
 				  bool got_write)
 {
+	struct user_namespace *mnt_userns;
 	struct dentry *dir = nd->path.dentry;
 	struct inode *dir_inode = dir->d_inode;
 	int open_flag = op->open_flag;
@@ -3165,13 +3168,14 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 	 */
 	if (unlikely(!got_write))
 		open_flag &= ~O_TRUNC;
+	mnt_userns = mnt_user_ns(nd->path.mnt);
 	if (open_flag & O_CREAT) {
 		if (open_flag & O_EXCL)
 			open_flag &= ~O_TRUNC;
 		if (!IS_POSIXACL(dir->d_inode))
 			mode &= ~current_umask();
 		if (likely(got_write))
-			create_error = may_o_create(&init_user_ns, &nd->path,
+			create_error = may_o_create(mnt_userns, &nd->path,
 						    dentry, mode);
 		else
 			create_error = -EROFS;
@@ -3207,8 +3211,9 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 			error = -EACCES;
 			goto out_dput;
 		}
-		error = dir_inode->i_op->create(dir_inode, dentry, mode,
-						open_flag & O_EXCL);
+
+		error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry,
+						mode, open_flag & O_EXCL);
 		if (error)
 			goto out_dput;
 	}
@@ -3316,6 +3321,7 @@ finish_lookup:
 static int do_open(struct nameidata *nd,
 		   struct file *file, const struct open_flags *op)
 {
+	struct user_namespace *mnt_userns;
 	int open_flag = op->open_flag;
 	bool do_truncate;
 	int acc_mode;
@@ -3328,12 +3334,13 @@ static int do_open(struct nameidata *nd,
 	}
 	if (!(file->f_mode & FMODE_CREATED))
 		audit_inode(nd->name, nd->path.dentry, 0);
+	mnt_userns = mnt_user_ns(nd->path.mnt);
 	if (open_flag & O_CREAT) {
 		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
 			return -EEXIST;
 		if (d_is_dir(nd->path.dentry))
 			return -EISDIR;
-		error = may_create_in_sticky(&init_user_ns, nd,
+		error = may_create_in_sticky(mnt_userns, nd,
 					     d_backing_inode(nd->path.dentry));
 		if (unlikely(error))
 			return error;
@@ -3353,13 +3360,13 @@ static int do_open(struct nameidata *nd,
 			return error;
 		do_truncate = true;
 	}
-	error = may_open(&init_user_ns, &nd->path, acc_mode, open_flag);
+	error = may_open(mnt_userns, &nd->path, acc_mode, open_flag);
 	if (!error && !(file->f_mode & FMODE_OPENED))
 		error = vfs_open(&nd->path, file);
 	if (!error)
 		error = ima_file_check(file, op->acc_mode);
 	if (!error && do_truncate)
-		error = handle_truncate(file);
+		error = handle_truncate(mnt_userns, file);
 	if (unlikely(error > 0)) {
 		WARN_ON(1);
 		error = -EINVAL;
@@ -3403,7 +3410,7 @@ struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
 	child = d_alloc(dentry, &slash_name);
 	if (unlikely(!child))
 		goto out_err;
-	error = dir->i_op->tmpfile(dir, child, mode);
+	error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
 	if (error)
 		goto out_err;
 	error = -ENOENT;
@@ -3446,7 +3453,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	path.dentry = child;
 	audit_inode(nd->name, child, 0);
 	/* Don't check for other permissions, the inode was just created */
-	error = may_open(&init_user_ns, &path, 0, op->open_flag);
+	error = may_open(mnt_userns, &path, 0, op->open_flag);
 	if (error)
 		goto out2;
 	file->f_path.mnt = path.mnt;
@@ -3690,7 +3697,7 @@ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
 	if (error)
 		return error;
 
-	error = dir->i_op->mknod(dir, dentry, mode, dev);
+	error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev);
 	if (!error)
 		fsnotify_create(dir, dentry);
 	return error;
@@ -3809,7 +3816,7 @@ int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 	if (max_links && dir->i_nlink >= max_links)
 		return -EMLINK;
 
-	error = dir->i_op->mkdir(dir, dentry, mode);
+	error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode);
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
 	return error;
@@ -3834,7 +3841,8 @@ retry:
 	if (!error) {
 		struct user_namespace *mnt_userns;
 		mnt_userns = mnt_user_ns(path.mnt);
-		error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry, mode);
+		error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry,
+				  mode);
 	}
 	done_path_create(&path, dentry);
 	if (retry_estale(error, lookup_flags)) {
@@ -4087,7 +4095,8 @@ retry_deleg:
 		if (error)
 			goto exit2;
 		mnt_userns = mnt_user_ns(path.mnt);
-		error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry, &delegated_inode);
+		error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
+				   &delegated_inode);
 exit2:
 		dput(dentry);
 	}
@@ -4166,7 +4175,7 @@ int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
 	if (error)
 		return error;
 
-	error = dir->i_op->symlink(dir, dentry, oldname);
+	error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname);
 	if (!error)
 		fsnotify_create(dir, dentry);
 	return error;
@@ -4357,13 +4366,13 @@ retry:
 	error = -EXDEV;
 	if (old_path.mnt != new_path.mnt)
 		goto out_dput;
-	error = may_linkat(&init_user_ns, &old_path);
+	mnt_userns = mnt_user_ns(new_path.mnt);
+	error = may_linkat(mnt_userns, &old_path);
 	if (unlikely(error))
 		goto out_dput;
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
 	if (error)
 		goto out_dput;
-	mnt_userns = mnt_user_ns(new_path.mnt);
 	error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
 			 new_dentry, &delegated_inode);
 out_dput:
@@ -4542,8 +4551,8 @@ int vfs_rename(struct renamedata *rd)
 		if (error)
 			goto out;
 	}
-	error = old_dir->i_op->rename(old_dir, old_dentry,
-				       new_dir, new_dentry, flags);
+	error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry,
+				      new_dir, new_dentry, flags);
 	if (error)
 		goto out;
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 727e01a84503..19a9f434442f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2095,8 +2095,8 @@ EXPORT_SYMBOL_GPL(nfs_instantiate);
  * that the operation succeeded on the server, but an error in the
  * reply path made it appear to have failed.
  */
-int nfs_create(struct inode *dir, struct dentry *dentry,
-		umode_t mode, bool excl)
+int nfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+	       struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct iattr attr;
 	int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
@@ -2124,7 +2124,8 @@ EXPORT_SYMBOL_GPL(nfs_create);
  * See comments for nfs_proc_create regarding failed operations.
  */
 int
-nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+nfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	  struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct iattr attr;
 	int status;
@@ -2150,7 +2151,8 @@ EXPORT_SYMBOL_GPL(nfs_mknod);
 /*
  * See comments for nfs_proc_create regarding failed operations.
  */
-int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+int nfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+	      struct dentry *dentry, umode_t mode)
 {
 	struct iattr attr;
 	int error;
@@ -2295,7 +2297,8 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
  * now have a new file handle and can instantiate an in-core NFS inode
  * and move the raw page into its mapping.
  */
-int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+int nfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		struct dentry *dentry, const char *symname)
 {
 	struct page *page;
 	char *kaddr;
@@ -2398,9 +2401,9 @@ EXPORT_SYMBOL_GPL(nfs_link);
  * If these conditions are met, we can drop the dentries before doing
  * the rename.
  */
-int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-	       struct inode *new_dir, struct dentry *new_dentry,
-	       unsigned int flags)
+int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+	       struct dentry *old_dentry, struct inode *new_dir,
+	       struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *old_inode = d_inode(old_dentry);
 	struct inode *new_inode = d_inode(new_dentry);
@@ -2939,7 +2942,9 @@ static int nfs_execute_ok(struct inode *inode, int mask)
 	return ret;
 }
 
-int nfs_permission(struct inode *inode, int mask)
+int nfs_permission(struct user_namespace *mnt_userns,
+		   struct inode *inode,
+		   int mask)
 {
 	const struct cred *cred = current_cred();
 	int res = 0;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index cab123ec1664..447e95974386 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -594,7 +594,8 @@ EXPORT_SYMBOL_GPL(nfs_fhget);
 #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN)
 
 int
-nfs_setattr(struct dentry *dentry, struct iattr *attr)
+nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+	    struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct nfs_fattr *fattr;
@@ -787,8 +788,8 @@ static bool nfs_need_revalidate_inode(struct inode *inode)
 	return false;
 }
 
-int nfs_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int query_flags)
+int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct nfs_server *server = NFS_SERVER(inode);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 62d3189745cd..25fb43b69e5a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -378,14 +378,18 @@ extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
 extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
 					   struct shrink_control *sc);
 struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
-int nfs_create(struct inode *, struct dentry *, umode_t, bool);
-int nfs_mkdir(struct inode *, struct dentry *, umode_t);
+int nfs_create(struct user_namespace *, struct inode *, struct dentry *,
+	       umode_t, bool);
+int nfs_mkdir(struct user_namespace *, struct inode *, struct dentry *,
+	      umode_t);
 int nfs_rmdir(struct inode *, struct dentry *);
 int nfs_unlink(struct inode *, struct dentry *);
-int nfs_symlink(struct inode *, struct dentry *, const char *);
+int nfs_symlink(struct user_namespace *, struct inode *, struct dentry *,
+		const char *);
 int nfs_link(struct dentry *, struct inode *, struct dentry *);
-int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
-int nfs_rename(struct inode *, struct dentry *,
+int nfs_mknod(struct user_namespace *, struct inode *, struct dentry *, umode_t,
+	      dev_t);
+int nfs_rename(struct user_namespace *, struct inode *, struct dentry *,
 	       struct inode *, struct dentry *, unsigned int);
 
 /* file.c */
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 55fc711e368b..93e60e921f92 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -208,20 +208,23 @@ out_fc:
 }
 
 static int
-nfs_namespace_getattr(const struct path *path, struct kstat *stat,
-			u32 request_mask, unsigned int query_flags)
+nfs_namespace_getattr(struct user_namespace *mnt_userns,
+		      const struct path *path, struct kstat *stat,
+		      u32 request_mask, unsigned int query_flags)
 {
 	if (NFS_FH(d_inode(path->dentry))->size != 0)
-		return nfs_getattr(path, stat, request_mask, query_flags);
+		return nfs_getattr(mnt_userns, path, stat, request_mask,
+				   query_flags);
 	generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
 	return 0;
 }
 
 static int
-nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr)
+nfs_namespace_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		      struct iattr *attr)
 {
 	if (NFS_FH(d_inode(dentry))->size != 0)
-		return nfs_setattr(dentry, attr);
+		return nfs_setattr(mnt_userns, dentry, attr);
 	return -EACCES;
 }
 
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index 1b950b66b3bb..c8a192802dda 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -12,7 +12,8 @@
  */
 #ifdef CONFIG_NFS_V3_ACL
 extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
-extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+			struct posix_acl *acl, int type);
 extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 		struct posix_acl *dfacl);
 extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index c6c863382f37..5604e807fc01 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -251,7 +251,8 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 
 }
 
-int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type)
 {
 	struct posix_acl *orig = acl, *dfacl = NULL, *alloc;
 	int status;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 8aad3c48092a..2e8eb263cf0f 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -805,7 +805,8 @@ void nilfs_evict_inode(struct inode *inode)
 	 */
 }
 
-int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
+int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *iattr)
 {
 	struct nilfs_transaction_info ti;
 	struct inode *inode = d_inode(dentry);
@@ -843,7 +844,8 @@ out_err:
 	return err;
 }
 
-int nilfs_permission(struct inode *inode, int mask)
+int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		     int mask)
 {
 	struct nilfs_root *root = NILFS_I(inode)->i_root;
 
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index a6ec7961d4f5..ecace5f96a95 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -72,8 +72,8 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
  * If the create succeeds, we fill in the inode information
  * with d_instantiate().
  */
-static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			bool excl)
+static int nilfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct inode *inode;
 	struct nilfs_transaction_info ti;
@@ -100,7 +100,8 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 }
 
 static int
-nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+nilfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	    struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode *inode;
 	struct nilfs_transaction_info ti;
@@ -124,8 +125,8 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
 	return err;
 }
 
-static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
-			 const char *symname)
+static int nilfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, const char *symname)
 {
 	struct nilfs_transaction_info ti;
 	struct super_block *sb = dir->i_sb;
@@ -201,7 +202,8 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
 	return err;
 }
 
-static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int nilfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	struct nilfs_transaction_info ti;
@@ -338,8 +340,9 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
 	return err;
 }
 
-static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-			struct inode *new_dir,	struct dentry *new_dentry,
+static int nilfs_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
 	struct inode *old_inode = d_inode(old_dentry);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f8450ee3fd06..c4a45a081ade 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -267,9 +267,11 @@ extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
 extern void nilfs_update_inode(struct inode *, struct buffer_head *, int);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
-extern int nilfs_setattr(struct dentry *, struct iattr *);
+extern int nilfs_setattr(struct user_namespace *, struct dentry *,
+			 struct iattr *);
 extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
-int nilfs_permission(struct inode *inode, int mask);
+int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		     int mask);
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
 extern int nilfs_inode_dirty(struct inode *);
 int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 38f4cf1d4497..4435dbbc0b63 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2848,6 +2848,7 @@ void ntfs_truncate_vfs(struct inode *vi) {
 
 /**
  * ntfs_setattr - called from notify_change() when an attribute is being changed
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dentry:	dentry whose attributes to change
  * @attr:	structure describing the attributes and the changes
  *
@@ -2860,7 +2861,8 @@ void ntfs_truncate_vfs(struct inode *vi) {
  *
  * Called with ->i_mutex held.
  */
-int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
+int ntfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr)
 {
 	struct inode *vi = d_inode(dentry);
 	int err;
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 363e4e820673..6f78ee00f57f 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -289,7 +289,8 @@ extern int ntfs_show_options(struct seq_file *sf, struct dentry *root);
 extern int ntfs_truncate(struct inode *vi);
 extern void ntfs_truncate_vfs(struct inode *vi);
 
-extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
+extern int ntfs_setattr(struct user_namespace *mnt_userns,
+			struct dentry *dentry, struct iattr *attr);
 
 extern int __ntfs_write_inode(struct inode *vi, int sync);
 
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 990756cee4bd..5259badabb56 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -262,7 +262,8 @@ static int ocfs2_set_acl(handle_t *handle,
 	return ret;
 }
 
-int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		      struct posix_acl *acl, int type)
 {
 	struct buffer_head *bh = NULL;
 	int status, had_lock;
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 127b13432146..4e86450917b2 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -19,7 +19,8 @@ struct ocfs2_acl_entry {
 };
 
 struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
-int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		      struct posix_acl *acl, int type);
 extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
 extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
 			  struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 9fa66cd1f622..b2870f1a31df 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -190,7 +190,8 @@ static int dlmfs_file_release(struct inode *inode,
  * We do ->setattr() just to override size changes.  Our size is the size
  * of the LVB and nothing else.
  */
-static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
+static int dlmfs_file_setattr(struct user_namespace *mnt_userns,
+			      struct dentry *dentry, struct iattr *attr)
 {
 	int error;
 	struct inode *inode = d_inode(dentry);
@@ -395,7 +396,8 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
  * File creation. Allocate an inode, and we're done..
  */
 /* SMP-safe */
-static int dlmfs_mkdir(struct inode * dir,
+static int dlmfs_mkdir(struct user_namespace * mnt_userns,
+		       struct inode * dir,
 		       struct dentry * dentry,
 		       umode_t mode)
 {
@@ -443,7 +445,8 @@ bail:
 	return status;
 }
 
-static int dlmfs_create(struct inode *dir,
+static int dlmfs_create(struct user_namespace *mnt_userns,
+			struct inode *dir,
 			struct dentry *dentry,
 			umode_t mode,
 			bool excl)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a070d4c9b6ed..e3039d973acd 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1112,7 +1112,8 @@ out:
 	return ret;
 }
 
-int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
+int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr)
 {
 	int status = 0, size_change;
 	int inode_locked = 0;
@@ -1298,8 +1299,8 @@ bail:
 	return status;
 }
 
-int ocfs2_getattr(const struct path *path, struct kstat *stat,
-		  u32 request_mask, unsigned int flags)
+int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct super_block *sb = path->dentry->d_sb;
@@ -1330,7 +1331,8 @@ bail:
 	return err;
 }
 
-int ocfs2_permission(struct inode *inode, int mask)
+int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		     int mask)
 {
 	int ret, had_lock;
 	struct ocfs2_lock_holder oh;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 4832cbceba5b..8536cec5f122 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,10 +51,13 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
 			  u64 new_i_size, u64 zero_to);
 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 		      loff_t zero_to);
-int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
-int ocfs2_getattr(const struct path *path, struct kstat *stat,
-		  u32 request_mask, unsigned int flags);
-int ocfs2_permission(struct inode *inode, int mask);
+int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr);
+int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, u32 request_mask, unsigned int flags);
+int ocfs2_permission(struct user_namespace *mnt_userns,
+		     struct inode *inode,
+		     int mask);
 
 int ocfs2_should_update_atime(struct inode *inode,
 			      struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 908b79e1082b..3abdd36da2e2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -221,7 +221,8 @@ static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb,
 	iput(inode);
 }
 
-static int ocfs2_mknod(struct inode *dir,
+static int ocfs2_mknod(struct user_namespace *mnt_userns,
+		       struct inode *dir,
 		       struct dentry *dentry,
 		       umode_t mode,
 		       dev_t dev)
@@ -645,7 +646,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	return status;
 }
 
-static int ocfs2_mkdir(struct inode *dir,
+static int ocfs2_mkdir(struct user_namespace *mnt_userns,
+		       struct inode *dir,
 		       struct dentry *dentry,
 		       umode_t mode)
 {
@@ -653,14 +655,15 @@ static int ocfs2_mkdir(struct inode *dir,
 
 	trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name,
 			  OCFS2_I(dir)->ip_blkno, mode);
-	ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
+	ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0);
 	if (ret)
 		mlog_errno(ret);
 
 	return ret;
 }
 
-static int ocfs2_create(struct inode *dir,
+static int ocfs2_create(struct user_namespace *mnt_userns,
+			struct inode *dir,
 			struct dentry *dentry,
 			umode_t mode,
 			bool excl)
@@ -669,7 +672,7 @@ static int ocfs2_create(struct inode *dir,
 
 	trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name,
 			   (unsigned long long)OCFS2_I(dir)->ip_blkno, mode);
-	ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
+	ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
 	if (ret)
 		mlog_errno(ret);
 
@@ -1195,7 +1198,8 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
 		ocfs2_inode_unlock(inode2, 1);
 }
 
-static int ocfs2_rename(struct inode *old_dir,
+static int ocfs2_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir,
 			struct dentry *old_dentry,
 			struct inode *new_dir,
 			struct dentry *new_dentry,
@@ -1784,7 +1788,8 @@ bail:
 	return status;
 }
 
-static int ocfs2_symlink(struct inode *dir,
+static int ocfs2_symlink(struct user_namespace *mnt_userns,
+			 struct inode *dir,
 			 struct dentry *dentry,
 			 const char *symname)
 {
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index a0f45651f3b7..c219f91f44e9 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -279,13 +279,14 @@ out_free_inode:
 	return err;
 }
 
-static int omfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int omfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	return omfs_add_node(dir, dentry, mode | S_IFDIR);
 }
 
-static int omfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool excl)
+static int omfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
 	return omfs_add_node(dir, dentry, mode | S_IFREG);
 }
@@ -369,9 +370,9 @@ static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx,
 	return true;
 }
 
-static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry,
-		       unsigned int flags)
+static int omfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		       struct dentry *old_dentry, struct inode *new_dir,
+		       struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *new_inode = d_inode(new_dentry);
 	struct inode *old_inode = d_inode(old_dentry);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 729339cd7902..11e733aab25d 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -343,7 +343,8 @@ const struct file_operations omfs_file_operations = {
 	.splice_read = generic_file_splice_read,
 };
 
-static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
+static int omfs_setattr(struct user_namespace *mnt_userns,
+			struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 628921952d16..18852b9ed82b 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -116,7 +116,8 @@ out:
 	return error;
 }
 
-int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		     struct posix_acl *acl, int type)
 {
 	int error;
 	struct iattr iattr;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index b94032f77e61..5079cfafa8d7 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -871,7 +871,8 @@ out:
 /*
  * Change attributes of an object referenced by dentry.
  */
-int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
+int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		     struct iattr *iattr)
 {
 	int ret;
 	gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n",
@@ -890,8 +891,8 @@ out:
 /*
  * Obtain attributes of an object given a dentry
  */
-int orangefs_getattr(const struct path *path, struct kstat *stat,
-		     u32 request_mask, unsigned int flags)
+int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		     struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	int ret;
 	struct inode *inode = path->dentry->d_inode;
@@ -919,7 +920,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
 	return ret;
 }
 
-int orangefs_permission(struct inode *inode, int mask)
+int orangefs_permission(struct user_namespace *mnt_userns,
+			struct inode *inode, int mask)
 {
 	int ret;
 
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 3e7cf3d0a494..600e8eee541f 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -15,7 +15,8 @@
 /*
  * Get a newly allocated inode to go with a negative dentry.
  */
-static int orangefs_create(struct inode *dir,
+static int orangefs_create(struct user_namespace *mnt_userns,
+			struct inode *dir,
 			struct dentry *dentry,
 			umode_t mode,
 			bool exclusive)
@@ -215,7 +216,8 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
 	return ret;
 }
 
-static int orangefs_symlink(struct inode *dir,
+static int orangefs_symlink(struct user_namespace *mnt_userns,
+		         struct inode *dir,
 			 struct dentry *dentry,
 			 const char *symname)
 {
@@ -303,7 +305,8 @@ out:
 	return ret;
 }
 
-static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int orangefs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode)
 {
 	struct orangefs_inode_s *parent = ORANGEFS_I(dir);
 	struct orangefs_kernel_op_s *new_op;
@@ -372,7 +375,8 @@ out:
 	return ret;
 }
 
-static int orangefs_rename(struct inode *old_dir,
+static int orangefs_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir,
 			struct dentry *old_dentry,
 			struct inode *new_dir,
 			struct dentry *new_dentry,
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index e12aeb9623d6..0e6b97682e41 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -107,7 +107,9 @@ extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
 extern const struct xattr_handler *orangefs_xattr_handlers[];
 
 extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type);
-extern int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int orangefs_set_acl(struct user_namespace *mnt_userns,
+			    struct inode *inode, struct posix_acl *acl,
+			    int type);
 
 /*
  * orangefs data structures
@@ -359,12 +361,13 @@ struct inode *orangefs_new_inode(struct super_block *sb,
 			      struct orangefs_object_kref *ref);
 
 int __orangefs_setattr(struct inode *, struct iattr *);
-int orangefs_setattr(struct dentry *, struct iattr *);
+int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 
-int orangefs_getattr(const struct path *path, struct kstat *stat,
-		     u32 request_mask, unsigned int flags);
+int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		     struct kstat *stat, u32 request_mask, unsigned int flags);
 
-int orangefs_permission(struct inode *inode, int mask);
+int orangefs_permission(struct user_namespace *mnt_userns,
+			struct inode *inode, int mask);
 
 int orangefs_update_time(struct inode *, struct timespec64 *, int);
 
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 6904cc2ed7bb..8b3be7342a8c 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -650,19 +650,20 @@ out:
 	return err;
 }
 
-static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl)
+static int ovl_create(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, bool excl)
 {
 	return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
 }
 
-static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ovl_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode)
 {
 	return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
 }
 
-static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-		     dev_t rdev)
+static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	/* Don't allow creation of "whiteout" on overlay */
 	if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
@@ -671,8 +672,8 @@ static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return ovl_create_object(dentry, mode, rdev, NULL);
 }
 
-static int ovl_symlink(struct inode *dir, struct dentry *dentry,
-		       const char *link)
+static int ovl_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, const char *link)
 {
 	return ovl_create_object(dentry, S_IFLNK, 0, link);
 }
@@ -1069,9 +1070,9 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir)
 	return err;
 }
 
-static int ovl_rename(struct inode *olddir, struct dentry *old,
-		      struct inode *newdir, struct dentry *new,
-		      unsigned int flags)
+static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
+		      struct dentry *old, struct inode *newdir,
+		      struct dentry *new, unsigned int flags)
 {
 	int err;
 	struct dentry *old_upperdir;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 023fde466e3a..e78d45dfeaee 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -14,7 +14,8 @@
 #include "overlayfs.h"
 
 
-int ovl_setattr(struct dentry *dentry, struct iattr *attr)
+int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct iattr *attr)
 {
 	int err;
 	bool full_copy_up = false;
@@ -154,8 +155,8 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
 	return 0;
 }
 
-int ovl_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int flags)
+int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
 	enum ovl_path_type type;
@@ -277,7 +278,8 @@ out:
 	return err;
 }
 
-int ovl_permission(struct inode *inode, int mask)
+int ovl_permission(struct user_namespace *mnt_userns,
+		   struct inode *inode, int mask)
 {
 	struct inode *upperinode = ovl_inode_upper(inode);
 	struct inode *realinode = upperinode ?: ovl_inode_lower(inode);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 5e9eb46e741a..78b9d93a33c9 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -444,10 +444,12 @@ int ovl_set_nlink_lower(struct dentry *dentry);
 unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
 			   struct dentry *upperdentry,
 			   unsigned int fallback);
-int ovl_setattr(struct dentry *dentry, struct iattr *attr);
-int ovl_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int flags);
-int ovl_permission(struct inode *inode, int mask);
+int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct iattr *attr);
+int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		struct kstat *stat, u32 request_mask, unsigned int flags);
+int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		   int mask);
 int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
 		  const void *value, size_t size, int flags);
 int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 8168ab2dda11..c04612b19054 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1023,7 +1023,7 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
 	    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
 		struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
 
-		err = ovl_setattr(dentry, &iattr);
+		err = ovl_setattr(&init_user_ns, dentry, &iattr);
 		if (err)
 			return err;
 	}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index d31b60f5d40d..f3309a7edb49 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -593,7 +593,7 @@ int
 	ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
 	if (ret)
 		return ret;
-	ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS);
+	ret = inode->i_op->set_acl(mnt_userns, inode, acl, ACL_TYPE_ACCESS);
 	posix_acl_release(acl);
 	return ret;
 }
@@ -918,7 +918,7 @@ set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode,
 		if (ret)
 			return ret;
 	}
-	return inode->i_op->set_acl(inode, acl, type);
+	return inode->i_op->set_acl(mnt_userns, inode, acl, type);
 }
 EXPORT_SYMBOL(set_posix_acl);
 
@@ -966,12 +966,13 @@ const struct xattr_handler posix_acl_default_xattr_handler = {
 };
 EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler);
 
-int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		   struct posix_acl *acl, int type)
 {
 	int error;
 
 	if (type == ACL_TYPE_ACCESS) {
-		error = posix_acl_update_mode(&init_user_ns, inode,
+		error = posix_acl_update_mode(mnt_userns, inode,
 				&inode->i_mode, &acl);
 		if (error)
 			return error;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d45aa68c1f17..56bf14316122 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -685,7 +685,8 @@ static int proc_fd_access_allowed(struct inode *inode)
 	return allowed;
 }
 
-int proc_setattr(struct dentry *dentry, struct iattr *attr)
+int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr)
 {
 	int error;
 	struct inode *inode = d_inode(dentry);
@@ -726,7 +727,8 @@ static bool has_pid_permissions(struct proc_fs_info *fs_info,
 }
 
 
-static int proc_pid_permission(struct inode *inode, int mask)
+static int proc_pid_permission(struct user_namespace *mnt_userns,
+			       struct inode *inode, int mask)
 {
 	struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
 	struct task_struct *task;
@@ -1927,8 +1929,8 @@ out_unlock:
 	return NULL;
 }
 
-int pid_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int query_flags)
+int pid_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
@@ -3473,7 +3475,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
  * This function makes sure that the node is always accessible for members of
  * same thread group.
  */
-static int proc_tid_comm_permission(struct inode *inode, int mask)
+static int proc_tid_comm_permission(struct user_namespace *mnt_userns,
+				    struct inode *inode, int mask)
 {
 	bool is_same_tgroup;
 	struct task_struct *task;
@@ -3798,7 +3801,8 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
 	return 0;
 }
 
-static int proc_task_getattr(const struct path *path, struct kstat *stat,
+static int proc_task_getattr(struct user_namespace *mnt_userns,
+			     const struct path *path, struct kstat *stat,
 			     u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d6e76461e135..07fc4fad2602 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -276,7 +276,8 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
  */
-int proc_fd_permission(struct inode *inode, int mask)
+int proc_fd_permission(struct user_namespace *mnt_userns,
+		       struct inode *inode, int mask)
 {
 	struct task_struct *p;
 	int rv;
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
index f371a602bf58..c5a921a06a0b 100644
--- a/fs/proc/fd.h
+++ b/fs/proc/fd.h
@@ -10,7 +10,8 @@ extern const struct inode_operations proc_fd_inode_operations;
 extern const struct file_operations proc_fdinfo_operations;
 extern const struct inode_operations proc_fdinfo_inode_operations;
 
-extern int proc_fd_permission(struct inode *inode, int mask);
+extern int proc_fd_permission(struct user_namespace *mnt_userns,
+			      struct inode *inode, int mask);
 
 static inline unsigned int proc_fd(struct inode *inode)
 {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 0db96a761149..bc86aa87cc41 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -115,7 +115,8 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir,
 	return true;
 }
 
-static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
+static int proc_notify_change(struct user_namespace *mnt_userns,
+			      struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct proc_dir_entry *de = PDE(inode);
@@ -133,7 +134,8 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 	return 0;
 }
 
-static int proc_getattr(const struct path *path, struct kstat *stat,
+static int proc_getattr(struct user_namespace *mnt_userns,
+			const struct path *path, struct kstat *stat,
 			u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index f60b379dcdc7..03415f3fb3a8 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -162,8 +162,10 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
  * base.c
  */
 extern const struct dentry_operations pid_dentry_operations;
-extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
-extern int proc_setattr(struct dentry *, struct iattr *);
+extern int pid_getattr(struct user_namespace *, const struct path *,
+		       struct kstat *, u32, unsigned int);
+extern int proc_setattr(struct user_namespace *, struct dentry *,
+			struct iattr *);
 extern void proc_pid_evict_inode(struct proc_inode *);
 extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
 extern void pid_update_inode(struct task_struct *, struct inode *);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 4aef49ccf571..15c2e55d2ed2 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -289,7 +289,8 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir,
 	return de;
 }
 
-static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
+static int proc_tgid_net_getattr(struct user_namespace *mnt_userns,
+				 const struct path *path, struct kstat *stat,
 				 u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 87c828348140..2daac06727d0 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -785,7 +785,8 @@ out:
 	return 0;
 }
 
-static int proc_sys_permission(struct inode *inode, int mask)
+static int proc_sys_permission(struct user_namespace *mnt_userns,
+			       struct inode *inode, int mask)
 {
 	/*
 	 * sysctl entries that are not writeable,
@@ -813,7 +814,8 @@ static int proc_sys_permission(struct inode *inode, int mask)
 	return error;
 }
 
-static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
+static int proc_sys_setattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
@@ -830,7 +832,8 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 	return 0;
 }
 
-static int proc_sys_getattr(const struct path *path, struct kstat *stat,
+static int proc_sys_getattr(struct user_namespace *mnt_userns,
+			    const struct path *path, struct kstat *stat,
 			    u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 244e4b6f15ef..c7e3b1350ef8 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -308,7 +308,8 @@ void __init proc_root_init(void)
 	register_filesystem(&proc_fs_type);
 }
 
-static int proc_root_getattr(const struct path *path, struct kstat *stat,
+static int proc_root_getattr(struct user_namespace *mnt_userns,
+			     const struct path *path, struct kstat *stat,
 			     u32 request_mask, unsigned int query_flags)
 {
 	generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index f0358fe410d3..ba3525ccc27e 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -22,7 +22,7 @@
 #include <linux/uaccess.h>
 #include "internal.h"
 
-static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
+static int ramfs_nommu_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 						   unsigned long addr,
 						   unsigned long len,
@@ -158,7 +158,8 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
  * handle a change of attributes
  * - we're specifically interested in a change of size
  */
-static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
+static int ramfs_nommu_setattr(struct user_namespace *mnt_userns,
+			       struct dentry *dentry, struct iattr *ia)
 {
 	struct inode *inode = d_inode(dentry);
 	unsigned int old_ia_valid = ia->ia_valid;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 3fd4326f36b5..3c2658c8fde0 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -101,7 +101,8 @@ struct inode *ramfs_get_inode(struct super_block *sb,
  */
 /* SMP-safe */
 static int
-ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ramfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	    struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
 	int error = -ENOSPC;
@@ -115,20 +116,23 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 	return error;
 }
 
-static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
+static int ramfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
-	int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+	int retval = ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0);
 	if (!retval)
 		inc_nlink(dir);
 	return retval;
 }
 
-static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+static int ramfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
-	return ramfs_mknod(dir, dentry, mode | S_IFREG, 0);
+	return ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
 }
 
-static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
+static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, const char *symname)
 {
 	struct inode *inode;
 	int error = -ENOSPC;
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
index 0c1c847f992f..fd58618da360 100644
--- a/fs/reiserfs/acl.h
+++ b/fs/reiserfs/acl.h
@@ -49,7 +49,8 @@ static inline int reiserfs_acl_count(size_t size)
 
 #ifdef CONFIG_REISERFS_FS_POSIX_ACL
 struct posix_acl *reiserfs_get_acl(struct inode *inode, int type);
-int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		     struct posix_acl *acl, int type);
 int reiserfs_acl_chmod(struct inode *inode);
 int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
 				 struct inode *dir, struct dentry *dentry,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 944f2b487cf8..780bb90c1804 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3282,7 +3282,8 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	return ret;
 }
 
-int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
+int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		     struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	unsigned int ia_valid;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index a67a7d371725..e6eb05e2b2f1 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -619,8 +619,8 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
 	return dquot_initialize(inode);
 }
 
-static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			   bool excl)
+static int reiserfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, umode_t mode, bool excl)
 {
 	int retval;
 	struct inode *inode;
@@ -698,8 +698,8 @@ out_failed:
 	return retval;
 }
 
-static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-			  dev_t rdev)
+static int reiserfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	int retval;
 	struct inode *inode;
@@ -781,7 +781,8 @@ out_failed:
 	return retval;
 }
 
-static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int reiserfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode)
 {
 	int retval;
 	struct inode *inode;
@@ -1094,8 +1095,9 @@ out_unlink:
 	return retval;
 }
 
-static int reiserfs_symlink(struct inode *parent_dir,
-			    struct dentry *dentry, const char *symname)
+static int reiserfs_symlink(struct user_namespace *mnt_userns,
+			    struct inode *parent_dir, struct dentry *dentry,
+			    const char *symname)
 {
 	int retval;
 	struct inode *inode;
@@ -1304,7 +1306,8 @@ static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de,
  * one path. If it holds 2 or more, it can get into endless waiting in
  * get_empty_nodes or its clones
  */
-static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int reiserfs_rename(struct user_namespace *mnt_userns,
+			   struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry,
 			   unsigned int flags)
 {
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index f69871516167..0ca2ac62e534 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -3102,7 +3102,8 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
 }
 
 void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
-int reiserfs_setattr(struct dentry *dentry, struct iattr *attr);
+int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		     struct iattr *attr);
 
 int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
 
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ec440d1957a1..bd073836e141 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -66,14 +66,14 @@
 static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
 {
 	BUG_ON(!inode_is_locked(dir));
-	return dir->i_op->create(dir, dentry, mode, true);
+	return dir->i_op->create(&init_user_ns, dir, dentry, mode, true);
 }
 #endif
 
 static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	BUG_ON(!inode_is_locked(dir));
-	return dir->i_op->mkdir(dir, dentry, mode);
+	return dir->i_op->mkdir(&init_user_ns, dir, dentry, mode);
 }
 
 /*
@@ -352,7 +352,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data)
 	 * ATTR_MODE is set.
 	 */
 	attrs->ia_valid &= (ATTR_UID|ATTR_GID);
-	err = reiserfs_setattr(dentry, attrs);
+	err = reiserfs_setattr(&init_user_ns, dentry, attrs);
 	attrs->ia_valid = ia_valid;
 
 	return err;
@@ -604,7 +604,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 		inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR);
 		inode_dio_wait(d_inode(dentry));
 
-		err = reiserfs_setattr(dentry, &newattrs);
+		err = reiserfs_setattr(&init_user_ns, dentry, &newattrs);
 		inode_unlock(d_inode(dentry));
 	} else
 		update_ctime(inode);
@@ -948,7 +948,8 @@ static int xattr_mount_check(struct super_block *s)
 	return 0;
 }
 
-int reiserfs_permission(struct inode *inode, int mask)
+int reiserfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+			int mask)
 {
 	/*
 	 * We don't do permission checks on the internal objects.
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
index c764352447ba..9b3b06da568c 100644
--- a/fs/reiserfs/xattr.h
+++ b/fs/reiserfs/xattr.h
@@ -16,7 +16,8 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
 int reiserfs_lookup_privroot(struct super_block *sb);
 int reiserfs_delete_xattrs(struct inode *inode);
 int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
-int reiserfs_permission(struct inode *inode, int mask);
+int reiserfs_permission(struct user_namespace *mnt_userns,
+			struct inode *inode, int mask);
 
 #ifdef CONFIG_REISERFS_FS_XATTR
 #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 4bf976bc7bad..a9547144a099 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -18,7 +18,8 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
 
 
 int
-reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type)
 {
 	int error, error2;
 	struct reiserfs_transaction_handle th;
diff --git a/fs/stat.c b/fs/stat.c
index 2c471c2fd766..fbc171d038aa 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,6 +75,7 @@ EXPORT_SYMBOL(generic_fillattr);
 int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
 		      u32 request_mask, unsigned int query_flags)
 {
+	struct user_namespace *mnt_userns;
 	struct inode *inode = d_backing_inode(path->dentry);
 
 	memset(stat, 0, sizeof(*stat));
@@ -91,11 +92,12 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
 	if (IS_DAX(inode))
 		stat->attributes |= STATX_ATTR_DAX;
 
+	mnt_userns = mnt_user_ns(path->mnt);
 	if (inode->i_op->getattr)
-		return inode->i_op->getattr(path, stat, request_mask,
-					    query_flags);
+		return inode->i_op->getattr(mnt_userns, path, stat,
+					    request_mask, query_flags);
 
-	generic_fillattr(mnt_user_ns(path->mnt), inode, stat);
+	generic_fillattr(mnt_userns, inode, stat);
 	return 0;
 }
 EXPORT_SYMBOL(vfs_getattr_nosec);
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index ca7e216b7b9e..90e00124ea07 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -29,7 +29,8 @@ const struct file_operations sysv_file_operations = {
 	.splice_read	= generic_file_splice_read,
 };
 
-static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
+static int sysv_setattr(struct user_namespace *mnt_userns,
+			struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 83cffab6955f..8b2e99b7bc9f 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -441,8 +441,8 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size)
 	return blocks;
 }
 
-int sysv_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int flags)
+int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct super_block *s = path->dentry->d_sb;
 	generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index ea2414b385ec..b2e6abc06a2d 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -41,7 +41,8 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, un
 	return d_splice_alias(inode, dentry);
 }
 
-static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev)
+static int sysv_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode * inode;
 	int err;
@@ -60,13 +61,14 @@ static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode,
 	return err;
 }
 
-static int sysv_create(struct inode * dir, struct dentry * dentry, umode_t mode, bool excl)
+static int sysv_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
-	return sysv_mknod(dir, dentry, mode, 0);
+	return sysv_mknod(&init_user_ns, dir, dentry, mode, 0);
 }
 
-static int sysv_symlink(struct inode * dir, struct dentry * dentry, 
-	const char * symname)
+static int sysv_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, const char *symname)
 {
 	int err = -ENAMETOOLONG;
 	int l = strlen(symname)+1;
@@ -108,7 +110,8 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
 	return add_nondir(dentry, inode);
 }
 
-static int sysv_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)
+static int sysv_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	struct inode * inode;
 	int err;
@@ -186,9 +189,9 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
  * Anybody can rename anything with this: the permission checks are left to the
  * higher-level routines.
  */
-static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
-		       struct inode * new_dir, struct dentry * new_dentry,
-		       unsigned int flags)
+static int sysv_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		       struct dentry *old_dentry, struct inode *new_dir,
+		       struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode * old_inode = d_inode(old_dentry);
 	struct inode * new_inode = d_inode(new_dentry);
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 1cff585526b1..99ddf033da4f 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -141,7 +141,8 @@ extern struct inode *sysv_iget(struct super_block *, unsigned int);
 extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
 extern int sysv_sync_inode(struct inode *);
 extern void sysv_set_inode(struct inode *, dev_t);
-extern int sysv_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int sysv_getattr(struct user_namespace *, const struct path *,
+			struct kstat *, u32, unsigned int);
 extern int sysv_init_icache(void);
 extern void sysv_destroy_icache(void);
 
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 0ee8c6dfb036..4b83cbded559 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -67,7 +67,9 @@ static char *get_dname(struct dentry *dentry)
 	return name;
 }
 
-static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode)
+static int tracefs_syscall_mkdir(struct user_namespace *mnt_userns,
+				 struct inode *inode, struct dentry *dentry,
+				 umode_t mode)
 {
 	char *name;
 	int ret;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index a8881ed61620..d9d8d7794eff 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -280,8 +280,8 @@ static int ubifs_prepare_create(struct inode *dir, struct dentry *dentry,
 	return fscrypt_setup_filename(dir, &dentry->d_name, 0, nm);
 }
 
-static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			bool excl)
+static int ubifs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct inode *inode;
 	struct ubifs_info *c = dir->i_sb->s_fs_info;
@@ -441,8 +441,8 @@ out_budg:
 	return err;
 }
 
-static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry,
-			 umode_t mode)
+static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode)
 {
 	return do_tmpfile(dir, dentry, mode, NULL);
 }
@@ -942,7 +942,8 @@ out_fname:
 	return err;
 }
 
-static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	struct ubifs_inode *dir_ui = ubifs_inode(dir);
@@ -1013,8 +1014,8 @@ out_budg:
 	return err;
 }
 
-static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
-		       umode_t mode, dev_t rdev)
+static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode *inode;
 	struct ubifs_inode *ui;
@@ -1102,8 +1103,8 @@ out_budg:
 	return err;
 }
 
-static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
-			 const char *symname)
+static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, const char *symname)
 {
 	struct inode *inode;
 	struct ubifs_inode *ui;
@@ -1542,7 +1543,8 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
 	return err;
 }
 
-static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int ubifs_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
@@ -1566,8 +1568,8 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 }
 
-int ubifs_getattr(const struct path *path, struct kstat *stat,
-		  u32 request_mask, unsigned int flags)
+int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	loff_t size;
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 76ef392b1e41..0e4b4be3aa26 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1257,7 +1257,8 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 	return err;
 }
 
-int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
+int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr)
 {
 	int err;
 	struct inode *inode = d_inode(dentry);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index fc2cdde3b549..7fdfdbda4b8a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1989,13 +1989,14 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc);
 
 /* file.c */
 int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
-int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
+int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr);
 int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags);
 
 /* dir.c */
 struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
 			      umode_t mode);
-int ubifs_getattr(const struct path *path, struct kstat *stat,
+int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat,
 		  u32 request_mask, unsigned int flags);
 int ubifs_check_dir_empty(struct inode *dir);
 
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7c7d161315c2..2846dcd92197 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -253,7 +253,8 @@ const struct file_operations udf_file_operations = {
 	.llseek			= generic_file_llseek,
 };
 
-static int udf_setattr(struct dentry *dentry, struct iattr *attr)
+static int udf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct super_block *sb = inode->i_sb;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index e169d8fe35b5..f146b3089f3d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -604,8 +604,8 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
 	return 0;
 }
 
-static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl)
+static int udf_create(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct inode *inode = udf_new_inode(dir, mode);
 
@@ -623,7 +623,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return udf_add_nondir(dentry, inode);
 }
 
-static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode = udf_new_inode(dir, mode);
 
@@ -642,8 +643,8 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return 0;
 }
 
-static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-		     dev_t rdev)
+static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode *inode;
 
@@ -658,7 +659,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return udf_add_nondir(dentry, inode);
 }
 
-static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int udf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	struct udf_fileident_bh fibh;
@@ -877,8 +879,8 @@ out:
 	return retval;
 }
 
-static int udf_symlink(struct inode *dir, struct dentry *dentry,
-		       const char *symname)
+static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, const char *symname)
 {
 	struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777);
 	struct pathComponent *pc;
@@ -1065,9 +1067,9 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
 /* Anybody can rename anything with this: the permission checks are left to the
  * higher-level routines.
  */
-static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags)
+static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *old_inode = d_inode(old_dentry);
 	struct inode *new_inode = d_inode(new_dentry);
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 54a44d1f023c..9b223421a3c5 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -152,8 +152,9 @@ out_unmap:
 	return err;
 }
 
-static int udf_symlink_getattr(const struct path *path, struct kstat *stat,
-				u32 request_mask, unsigned int flags)
+static int udf_symlink_getattr(struct user_namespace *mnt_userns,
+			       const struct path *path, struct kstat *stat,
+			       u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
 	struct inode *inode = d_backing_inode(dentry);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 6b51f3b20143..debc282c1bb4 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -1211,7 +1211,8 @@ out:
 	return err;
 }
 
-int ufs_setattr(struct dentry *dentry, struct iattr *attr)
+int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	unsigned int ia_valid = attr->ia_valid;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 9ef40f100415..29d5a0e0c8f0 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -69,7 +69,8 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi
  * If the create succeeds, we fill in the inode information
  * with d_instantiate(). 
  */
-static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode,
+static int ufs_create (struct user_namespace * mnt_userns,
+		struct inode * dir, struct dentry * dentry, umode_t mode,
 		bool excl)
 {
 	struct inode *inode;
@@ -85,7 +86,8 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode,
 	return ufs_add_nondir(dentry, inode);
 }
 
-static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+static int ufs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode *inode;
 	int err;
@@ -104,8 +106,8 @@ static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev
 	return err;
 }
 
-static int ufs_symlink (struct inode * dir, struct dentry * dentry,
-	const char * symname)
+static int ufs_symlink (struct user_namespace * mnt_userns, struct inode * dir,
+	struct dentry * dentry, const char * symname)
 {
 	struct super_block * sb = dir->i_sb;
 	int err;
@@ -164,7 +166,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
 	return error;
 }
 
-static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
+static int ufs_mkdir(struct user_namespace * mnt_userns, struct inode * dir,
+	struct dentry * dentry, umode_t mode)
 {
 	struct inode * inode;
 	int err;
@@ -240,9 +243,9 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
 	return err;
 }
 
-static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags)
+static int ufs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *old_inode = d_inode(old_dentry);
 	struct inode *new_inode = d_inode(new_dentry);
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index b49e0efdf3d7..550f7c5a3636 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -123,7 +123,8 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long);
 extern int ufs_write_inode (struct inode *, struct writeback_control *);
 extern int ufs_sync_inode (struct inode *);
 extern void ufs_evict_inode (struct inode *);
-extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
+extern int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       struct iattr *attr);
 
 /* namei.c */
 extern const struct file_operations ufs_dir_operations;
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index 4d569f14a8d8..7aee0ec63ade 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -288,13 +288,15 @@ static int vboxsf_dir_create(struct inode *parent, struct dentry *dentry,
 	return 0;
 }
 
-static int vboxsf_dir_mkfile(struct inode *parent, struct dentry *dentry,
+static int vboxsf_dir_mkfile(struct user_namespace *mnt_userns,
+			     struct inode *parent, struct dentry *dentry,
 			     umode_t mode, bool excl)
 {
 	return vboxsf_dir_create(parent, dentry, mode, 0);
 }
 
-static int vboxsf_dir_mkdir(struct inode *parent, struct dentry *dentry,
+static int vboxsf_dir_mkdir(struct user_namespace *mnt_userns,
+			    struct inode *parent, struct dentry *dentry,
 			    umode_t mode)
 {
 	return vboxsf_dir_create(parent, dentry, mode, 1);
@@ -332,7 +334,8 @@ static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry)
 	return 0;
 }
 
-static int vboxsf_dir_rename(struct inode *old_parent,
+static int vboxsf_dir_rename(struct user_namespace *mnt_userns,
+			     struct inode *old_parent,
 			     struct dentry *old_dentry,
 			     struct inode *new_parent,
 			     struct dentry *new_dentry,
@@ -374,7 +377,8 @@ err_put_old_path:
 	return err;
 }
 
-static int vboxsf_dir_symlink(struct inode *parent, struct dentry *dentry,
+static int vboxsf_dir_symlink(struct user_namespace *mnt_userns,
+			      struct inode *parent, struct dentry *dentry,
 			      const char *symname)
 {
 	struct vboxsf_inode *sf_parent_i = VBOXSF_I(parent);
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index d2cd1c99f48e..3b847e3fba24 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -212,8 +212,8 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
 	return 0;
 }
 
-int vboxsf_getattr(const struct path *path, struct kstat *kstat,
-		   u32 request_mask, unsigned int flags)
+int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		   struct kstat *kstat, u32 request_mask, unsigned int flags)
 {
 	int err;
 	struct dentry *dentry = path->dentry;
@@ -237,7 +237,8 @@ int vboxsf_getattr(const struct path *path, struct kstat *kstat,
 	return 0;
 }
 
-int vboxsf_setattr(struct dentry *dentry, struct iattr *iattr)
+int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   struct iattr *iattr)
 {
 	struct vboxsf_inode *sf_i = VBOXSF_I(d_inode(dentry));
 	struct vboxsf_sbi *sbi = VBOXSF_SBI(dentry->d_sb);
diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h
index 18f95b00fc33..760524e78c88 100644
--- a/fs/vboxsf/vfsmod.h
+++ b/fs/vboxsf/vfsmod.h
@@ -90,9 +90,11 @@ int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path,
 		struct shfl_fsobjinfo *info);
 int vboxsf_stat_dentry(struct dentry *dentry, struct shfl_fsobjinfo *info);
 int vboxsf_inode_revalidate(struct dentry *dentry);
-int vboxsf_getattr(const struct path *path, struct kstat *kstat,
-		   u32 request_mask, unsigned int query_flags);
-int vboxsf_setattr(struct dentry *dentry, struct iattr *iattr);
+int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		   struct kstat *kstat, u32 request_mask,
+		   unsigned int query_flags);
+int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   struct iattr *iattr);
 struct shfl_string *vboxsf_path_from_dentry(struct vboxsf_sbi *sbi,
 					    struct dentry *dentry);
 int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 368351298bd5..332e87153c6c 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -238,7 +238,8 @@ xfs_acl_set_mode(
 }
 
 int
-xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+	    struct posix_acl *acl, int type)
 {
 	umode_t mode;
 	bool set_mode = false;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index c042c0868016..7bdb3a4ed798 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -11,7 +11,8 @@ struct posix_acl;
 
 #ifdef CONFIG_XFS_POSIX_ACL
 extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
-extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		       struct posix_acl *acl, int type);
 extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 void xfs_forget_acl(struct inode *inode, const char *name);
 #else
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 26d22edef741..f5dfa128af64 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -220,29 +220,32 @@ xfs_generic_create(
 
 STATIC int
 xfs_vn_mknod(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	umode_t		mode,
-	dev_t		rdev)
+	struct user_namespace	*mnt_userns,
+	struct inode		*dir,
+	struct dentry		*dentry,
+	umode_t			mode,
+	dev_t			rdev)
 {
 	return xfs_generic_create(dir, dentry, mode, rdev, false);
 }
 
 STATIC int
 xfs_vn_create(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	umode_t		mode,
-	bool		flags)
+	struct user_namespace	*mnt_userns,
+	struct inode		*dir,
+	struct dentry		*dentry,
+	umode_t			mode,
+	bool			flags)
 {
 	return xfs_generic_create(dir, dentry, mode, 0, false);
 }
 
 STATIC int
 xfs_vn_mkdir(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	umode_t		mode)
+	struct user_namespace	*mnt_userns,
+	struct inode		*dir,
+	struct dentry		*dentry,
+	umode_t			mode)
 {
 	return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false);
 }
@@ -361,9 +364,10 @@ xfs_vn_unlink(
 
 STATIC int
 xfs_vn_symlink(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	const char	*symname)
+	struct user_namespace	*mnt_userns,
+	struct inode		*dir,
+	struct dentry		*dentry,
+	const char		*symname)
 {
 	struct inode	*inode;
 	struct xfs_inode *cip = NULL;
@@ -403,11 +407,12 @@ xfs_vn_symlink(
 
 STATIC int
 xfs_vn_rename(
-	struct inode	*odir,
-	struct dentry	*odentry,
-	struct inode	*ndir,
-	struct dentry	*ndentry,
-	unsigned int	flags)
+	struct user_namespace	*mnt_userns,
+	struct inode		*odir,
+	struct dentry		*odentry,
+	struct inode		*ndir,
+	struct dentry		*ndentry,
+	unsigned int		flags)
 {
 	struct inode	*new_inode = d_inode(ndentry);
 	int		omode = 0;
@@ -529,6 +534,7 @@ xfs_stat_blksize(
 
 STATIC int
 xfs_vn_getattr(
+	struct user_namespace	*mnt_userns,
 	const struct path	*path,
 	struct kstat		*stat,
 	u32			request_mask,
@@ -1047,6 +1053,7 @@ xfs_vn_setattr_size(
 
 STATIC int
 xfs_vn_setattr(
+	struct user_namespace	*mnt_userns,
 	struct dentry		*dentry,
 	struct iattr		*iattr)
 {
@@ -1144,9 +1151,10 @@ xfs_vn_fiemap(
 
 STATIC int
 xfs_vn_tmpfile(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	umode_t		mode)
+	struct user_namespace	*mnt_userns,
+	struct inode		*dir,
+	struct dentry		*dentry,
+	umode_t			mode)
 {
 	return xfs_generic_create(dir, dentry, mode, 0, true);
 }
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 8a1f69677784..76e45d66d4ce 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -480,7 +480,8 @@ unlock:
 	return ret;
 }
 
-static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr)
+static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
+				struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	int ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f0601cca1930..7762d3d75230 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1930,22 +1930,28 @@ struct file_operations {
 struct inode_operations {
 	struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
 	const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
-	int (*permission) (struct inode *, int);
+	int (*permission) (struct user_namespace *, struct inode *, int);
 	struct posix_acl * (*get_acl)(struct inode *, int);
 
 	int (*readlink) (struct dentry *, char __user *,int);
 
-	int (*create) (struct inode *,struct dentry *, umode_t, bool);
+	int (*create) (struct user_namespace *, struct inode *,struct dentry *,
+		       umode_t, bool);
 	int (*link) (struct dentry *,struct inode *,struct dentry *);
 	int (*unlink) (struct inode *,struct dentry *);
-	int (*symlink) (struct inode *,struct dentry *,const char *);
-	int (*mkdir) (struct inode *,struct dentry *,umode_t);
+	int (*symlink) (struct user_namespace *, struct inode *,struct dentry *,
+			const char *);
+	int (*mkdir) (struct user_namespace *, struct inode *,struct dentry *,
+		      umode_t);
 	int (*rmdir) (struct inode *,struct dentry *);
-	int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
-	int (*rename) (struct inode *, struct dentry *,
+	int (*mknod) (struct user_namespace *, struct inode *,struct dentry *,
+		      umode_t,dev_t);
+	int (*rename) (struct user_namespace *, struct inode *, struct dentry *,
 			struct inode *, struct dentry *, unsigned int);
-	int (*setattr) (struct dentry *, struct iattr *);
-	int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
+	int (*setattr) (struct user_namespace *, struct dentry *,
+			struct iattr *);
+	int (*getattr) (struct user_namespace *, const struct path *,
+			struct kstat *, u32, unsigned int);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
 		      u64 len);
@@ -1953,8 +1959,10 @@ struct inode_operations {
 	int (*atomic_open)(struct inode *, struct dentry *,
 			   struct file *, unsigned open_flag,
 			   umode_t create_mode);
-	int (*tmpfile) (struct inode *, struct dentry *, umode_t);
-	int (*set_acl)(struct inode *, struct posix_acl *, int);
+	int (*tmpfile) (struct user_namespace *, struct inode *,
+			struct dentry *, umode_t);
+	int (*set_acl)(struct user_namespace *, struct inode *,
+		       struct posix_acl *, int);
 } ____cacheline_aligned;
 
 static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
@@ -3227,15 +3235,18 @@ extern int dcache_dir_open(struct inode *, struct file *);
 extern int dcache_dir_close(struct inode *, struct file *);
 extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
 extern int dcache_readdir(struct file *, struct dir_context *);
-extern int simple_setattr(struct dentry *, struct iattr *);
-extern int simple_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int simple_setattr(struct user_namespace *, struct dentry *,
+			  struct iattr *);
+extern int simple_getattr(struct user_namespace *, const struct path *,
+			  struct kstat *, u32, unsigned int);
 extern int simple_statfs(struct dentry *, struct kstatfs *);
 extern int simple_open(struct inode *inode, struct file *file);
 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
 extern int simple_unlink(struct inode *, struct dentry *);
 extern int simple_rmdir(struct inode *, struct dentry *);
-extern int simple_rename(struct inode *, struct dentry *,
-			 struct inode *, struct dentry *, unsigned int);
+extern int simple_rename(struct user_namespace *, struct inode *,
+			 struct dentry *, struct inode *, struct dentry *,
+			 unsigned int);
 extern void simple_recursive_removal(struct dentry *,
                               void (*callback)(struct dentry *));
 extern int noop_fsync(struct file *, loff_t, loff_t, int);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 681ed98e4ba8..8c6c4e32fc2f 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -379,10 +379,11 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr);
-extern int nfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int nfs_getattr(struct user_namespace *, const struct path *,
+		       struct kstat *, u32, unsigned int);
 extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
 extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
-extern int nfs_permission(struct inode *, int);
+extern int nfs_permission(struct user_namespace *, struct inode *, int);
 extern int nfs_open(struct inode *, struct file *);
 extern int nfs_attribute_cache_expired(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
@@ -390,7 +391,7 @@ extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern bool nfs_mapping_need_revalidate_inode(struct inode *inode);
 extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_revalidate_mapping_rcu(struct inode *inode);
-extern int nfs_setattr(struct dentry *, struct iattr *);
+extern int nfs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *);
 extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
 				struct nfs4_label *label);
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 6dcd8b8f6ab5..307094ebb88c 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -79,7 +79,8 @@ extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
 int posix_acl_update_mode(struct user_namespace *, struct inode *, umode_t *,
 			  struct posix_acl **);
 
-extern int simple_set_acl(struct inode *, struct posix_acl *, int);
+extern int simple_set_acl(struct user_namespace *, struct inode *,
+			  struct posix_acl *, int);
 extern int simple_acl_create(struct inode *, struct inode *);
 
 struct posix_acl *get_cached_acl(struct inode *inode, int type);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index fcd56e077733..8031464ed4ae 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -594,8 +594,8 @@ out_unlock:
 	return error;
 }
 
-static int mqueue_create(struct inode *dir, struct dentry *dentry,
-				umode_t mode, bool excl)
+static int mqueue_create(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode, bool excl)
 {
 	return mqueue_create_attr(dentry, mode, NULL);
 }
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 05b1f51d15e0..1576ff331ee4 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -152,7 +152,8 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
 	dir->i_ctime = dir->i_mtime;
 }
 
-static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 
@@ -381,8 +382,8 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
 	return simple_lookup(dir, dentry, flags);
 }
 
-static int bpf_symlink(struct inode *dir, struct dentry *dentry,
-		       const char *target)
+static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, const char *target)
 {
 	char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
 	struct inode *inode;
diff --git a/mm/shmem.c b/mm/shmem.c
index 339d5530d3a9..facdd1a9c524 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1060,7 +1060,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
-static int shmem_getattr(const struct path *path, struct kstat *stat,
+static int shmem_getattr(struct user_namespace *mnt_userns,
+			 const struct path *path, struct kstat *stat,
 			 u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = path->dentry->d_inode;
@@ -1080,7 +1081,8 @@ static int shmem_getattr(const struct path *path, struct kstat *stat,
 	return 0;
 }
 
-static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
+static int shmem_setattr(struct user_namespace *mnt_userns,
+			 struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2917,7 +2919,8 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
  * File creation. Allocate an inode, and we're done..
  */
 static int
-shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	    struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	struct inode *inode;
 	int error = -ENOSPC;
@@ -2946,7 +2949,8 @@ out_iput:
 }
 
 static int
-shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+	      struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	int error = -ENOSPC;
@@ -2969,20 +2973,22 @@ out_iput:
 	return error;
 }
 
-static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	int error;
 
-	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
+	if ((error = shmem_mknod(&init_user_ns, dir, dentry,
+				 mode | S_IFDIR, 0)))
 		return error;
 	inc_nlink(dir);
 	return 0;
 }
 
-static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool excl)
+static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
-	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
+	return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
 }
 
 /*
@@ -3062,7 +3068,8 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru
 	return 0;
 }
 
-static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
+static int shmem_whiteout(struct user_namespace *mnt_userns,
+			  struct inode *old_dir, struct dentry *old_dentry)
 {
 	struct dentry *whiteout;
 	int error;
@@ -3071,7 +3078,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
 	if (!whiteout)
 		return -ENOMEM;
 
-	error = shmem_mknod(old_dir, whiteout,
+	error = shmem_mknod(&init_user_ns, old_dir, whiteout,
 			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
 	dput(whiteout);
 	if (error)
@@ -3094,7 +3101,10 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
  * it exists so that the VFS layer correctly free's it when it
  * gets overwritten.
  */
-static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
+static int shmem_rename2(struct user_namespace *mnt_userns,
+			 struct inode *old_dir, struct dentry *old_dentry,
+			 struct inode *new_dir, struct dentry *new_dentry,
+			 unsigned int flags)
 {
 	struct inode *inode = d_inode(old_dentry);
 	int they_are_dirs = S_ISDIR(inode->i_mode);
@@ -3111,7 +3121,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
 	if (flags & RENAME_WHITEOUT) {
 		int error;
 
-		error = shmem_whiteout(old_dir, old_dentry);
+		error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
 		if (error)
 			return error;
 	}
@@ -3135,7 +3145,8 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
 	return 0;
 }
 
-static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, const char *symname)
 {
 	int error;
 	int len;
diff --git a/net/socket.c b/net/socket.c
index c76703c6f480..2826698ff97c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -538,9 +538,10 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
 	return used;
 }
 
-static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
+static int sockfs_setattr(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, struct iattr *iattr)
 {
-	int err = simple_setattr(dentry, iattr);
+	int err = simple_setattr(&init_user_ns, dentry, iattr);
 
 	if (!err && (iattr->ia_valid & ATTR_UID)) {
 		struct socket *sock = SOCKET_I(d_inode(dentry));
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index f95c6bfa8b8e..2ee3b3d29f10 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -1773,7 +1773,8 @@ fail2:
 	return error;
 }
 
-static int ns_mkdir_op(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ns_mkdir_op(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct aa_ns *ns, *parent;
 	/* TODO: improve permission check */
diff --git a/security/integrity/evm/evm_secfs.c b/security/integrity/evm/evm_secfs.c
index cfc3075769bb..bbc85637e18b 100644
--- a/security/integrity/evm/evm_secfs.c
+++ b/security/integrity/evm/evm_secfs.c
@@ -219,7 +219,7 @@ static ssize_t evm_write_xattrs(struct file *file, const char __user *buf,
 		newattrs.ia_valid = ATTR_MODE;
 		inode = evm_xattrs->d_inode;
 		inode_lock(inode);
-		err = simple_setattr(evm_xattrs, &newattrs);
+		err = simple_setattr(&init_user_ns, evm_xattrs, &newattrs);
 		inode_unlock(inode);
 		if (!err)
 			err = count;
-- 
cgit v1.2.3


From a2d2329e30e224ea68d575d2525b866df9805ea0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:45 +0100
Subject: ima: handle idmapped mounts

IMA does sometimes access the inode's i_uid and compares it against the
rules' fowner. Enable IMA to handle idmapped mounts by passing down the
mount's user namespace. We simply make use of the helpers we introduced
before. If the initial user namespace is passed nothing changes so
non-idmapped mounts will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-27-christian.brauner@ubuntu.com
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/attr.c                                    |  2 +-
 fs/namei.c                                   |  4 +--
 include/linux/ima.h                          | 18 +++++++++-----
 security/integrity/ima/ima.h                 | 19 ++++++++------
 security/integrity/ima/ima_api.c             | 10 +++++---
 security/integrity/ima/ima_appraise.c        | 15 ++++++-----
 security/integrity/ima/ima_asymmetric_keys.c |  3 ++-
 security/integrity/ima/ima_main.c            | 37 ++++++++++++++++++----------
 security/integrity/ima/ima_policy.c          | 20 +++++++++------
 security/integrity/ima/ima_queue_keys.c      |  4 ++-
 10 files changed, 83 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/fs/attr.c b/fs/attr.c
index 41abd0d973d8..87ef39db1c34 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -401,7 +401,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
 
 	if (!error) {
 		fsnotify_change(dentry, ia_valid);
-		ima_inode_post_setattr(dentry);
+		ima_inode_post_setattr(mnt_userns, dentry);
 		evm_inode_post_setattr(dentry, ia_valid);
 	}
 
diff --git a/fs/namei.c b/fs/namei.c
index d9ceb75ac169..dbf53b325ac9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3422,7 +3422,7 @@ struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
 		inode->i_state |= I_LINKABLE;
 		spin_unlock(&inode->i_lock);
 	}
-	ima_post_create_tmpfile(inode);
+	ima_post_create_tmpfile(mnt_userns, inode);
 	return child;
 
 out_err:
@@ -3750,7 +3750,7 @@ retry:
 			error = vfs_create(mnt_userns, path.dentry->d_inode,
 					   dentry, mode, true);
 			if (!error)
-				ima_post_path_mknod(dentry);
+				ima_post_path_mknod(mnt_userns, dentry);
 			break;
 		case S_IFCHR: case S_IFBLK:
 			error = vfs_mknod(mnt_userns, path.dentry->d_inode,
diff --git a/include/linux/ima.h b/include/linux/ima.h
index 7db9cca1af34..5486312d8ee6 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -16,7 +16,8 @@ struct linux_binprm;
 #ifdef CONFIG_IMA
 extern int ima_bprm_check(struct linux_binprm *bprm);
 extern int ima_file_check(struct file *file, int mask);
-extern void ima_post_create_tmpfile(struct inode *inode);
+extern void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
+				    struct inode *inode);
 extern void ima_file_free(struct file *file);
 extern int ima_file_mmap(struct file *file, unsigned long prot);
 extern int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot);
@@ -27,7 +28,8 @@ extern int ima_read_file(struct file *file, enum kernel_read_file_id id,
 			 bool contents);
 extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
 			      enum kernel_read_file_id id);
-extern void ima_post_path_mknod(struct dentry *dentry);
+extern void ima_post_path_mknod(struct user_namespace *mnt_userns,
+				struct dentry *dentry);
 extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
 extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size);
 extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size);
@@ -68,7 +70,8 @@ static inline int ima_file_check(struct file *file, int mask)
 	return 0;
 }
 
-static inline void ima_post_create_tmpfile(struct inode *inode)
+static inline void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
+					   struct inode *inode)
 {
 }
 
@@ -112,7 +115,8 @@ static inline int ima_post_read_file(struct file *file, void *buf, loff_t size,
 	return 0;
 }
 
-static inline void ima_post_path_mknod(struct dentry *dentry)
+static inline void ima_post_path_mknod(struct user_namespace *mnt_userns,
+				       struct dentry *dentry)
 {
 	return;
 }
@@ -153,7 +157,8 @@ static inline void ima_post_key_create_or_update(struct key *keyring,
 
 #ifdef CONFIG_IMA_APPRAISE
 extern bool is_ima_appraise_enabled(void);
-extern void ima_inode_post_setattr(struct dentry *dentry);
+extern void ima_inode_post_setattr(struct user_namespace *mnt_userns,
+				   struct dentry *dentry);
 extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name,
 		       const void *xattr_value, size_t xattr_value_len);
 extern int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name);
@@ -163,7 +168,8 @@ static inline bool is_ima_appraise_enabled(void)
 	return 0;
 }
 
-static inline void ima_inode_post_setattr(struct dentry *dentry)
+static inline void ima_inode_post_setattr(struct user_namespace *mnt_userns,
+					  struct dentry *dentry)
 {
 	return;
 }
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 8e8b1e3cb847..b87c9006d577 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -253,8 +253,9 @@ static inline void ima_process_queued_keys(void) {}
 #endif /* CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS */
 
 /* LIM API function definitions */
-int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
-		   int mask, enum ima_hooks func, int *pcr,
+int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode,
+		   const struct cred *cred, u32 secid, int mask,
+		   enum ima_hooks func, int *pcr,
 		   struct ima_template_desc **template_desc,
 		   const char *keyring);
 int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func);
@@ -266,7 +267,8 @@ void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file,
 			   struct evm_ima_xattr_data *xattr_value,
 			   int xattr_len, const struct modsig *modsig, int pcr,
 			   struct ima_template_desc *template_desc);
-void process_buffer_measurement(struct inode *inode, const void *buf, int size,
+void process_buffer_measurement(struct user_namespace *mnt_userns,
+				struct inode *inode, const void *buf, int size,
 				const char *eventname, enum ima_hooks func,
 				int pcr, const char *keyring);
 void ima_audit_measurement(struct integrity_iint_cache *iint,
@@ -281,8 +283,9 @@ void ima_free_template_entry(struct ima_template_entry *entry);
 const char *ima_d_path(const struct path *path, char **pathbuf, char *filename);
 
 /* IMA policy related functions */
-int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
-		     enum ima_hooks func, int mask, int flags, int *pcr,
+int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode,
+		     const struct cred *cred, u32 secid, enum ima_hooks func,
+		     int mask, int flags, int *pcr,
 		     struct ima_template_desc **template_desc,
 		     const char *keyring);
 void ima_init_policy(void);
@@ -313,7 +316,8 @@ int ima_appraise_measurement(enum ima_hooks func,
 			     struct file *file, const unsigned char *filename,
 			     struct evm_ima_xattr_data *xattr_value,
 			     int xattr_len, const struct modsig *modsig);
-int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func);
+int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode,
+		      int mask, enum ima_hooks func);
 void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file);
 enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint,
 					   enum ima_hooks func);
@@ -340,7 +344,8 @@ static inline int ima_appraise_measurement(enum ima_hooks func,
 	return INTEGRITY_UNKNOWN;
 }
 
-static inline int ima_must_appraise(struct inode *inode, int mask,
+static inline int ima_must_appraise(struct user_namespace *mnt_userns,
+				    struct inode *inode, int mask,
 				    enum ima_hooks func)
 {
 	return 0;
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index 4f39fb93f278..ed410efb3597 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -162,6 +162,7 @@ err_out:
 
 /**
  * ima_get_action - appraise & measure decision based on policy.
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode: pointer to the inode associated with the object being validated
  * @cred: pointer to credentials structure to validate
  * @secid: secid of the task being validated
@@ -183,8 +184,9 @@ err_out:
  * Returns IMA_MEASURE, IMA_APPRAISE mask.
  *
  */
-int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
-		   int mask, enum ima_hooks func, int *pcr,
+int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode,
+		   const struct cred *cred, u32 secid, int mask,
+		   enum ima_hooks func, int *pcr,
 		   struct ima_template_desc **template_desc,
 		   const char *keyring)
 {
@@ -192,8 +194,8 @@ int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
 
 	flags &= ima_policy_flag;
 
-	return ima_match_policy(inode, cred, secid, func, mask, flags, pcr,
-				template_desc, keyring);
+	return ima_match_policy(mnt_userns, inode, cred, secid, func, mask,
+				flags, pcr, template_desc, keyring);
 }
 
 /*
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 70b643c41c6b..2e64b9f281cc 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -68,7 +68,8 @@ bool is_ima_appraise_enabled(void)
  *
  * Return 1 to appraise or hash
  */
-int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func)
+int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode,
+		      int mask, enum ima_hooks func)
 {
 	u32 secid;
 
@@ -76,8 +77,8 @@ int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func)
 		return 0;
 
 	security_task_getsecid(current, &secid);
-	return ima_match_policy(inode, current_cred(), secid, func, mask,
-				IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL);
+	return ima_match_policy(mnt_userns, inode, current_cred(), secid, func,
+				mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL);
 }
 
 static int ima_fix_xattr(struct dentry *dentry,
@@ -350,7 +351,7 @@ int ima_check_blacklist(struct integrity_iint_cache *iint,
 
 		rc = is_binary_blacklisted(digest, digestsize);
 		if ((rc == -EPERM) && (iint->flags & IMA_MEASURE))
-			process_buffer_measurement(NULL, digest, digestsize,
+			process_buffer_measurement(&init_user_ns, NULL, digest, digestsize,
 						   "blacklisted-hash", NONE,
 						   pcr, NULL);
 	}
@@ -501,6 +502,7 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file)
 
 /**
  * ima_inode_post_setattr - reflect file metadata changes
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dentry: pointer to the affected dentry
  *
  * Changes to a dentry's metadata might result in needing to appraise.
@@ -508,7 +510,8 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file)
  * This function is called from notify_change(), which expects the caller
  * to lock the inode's i_mutex.
  */
-void ima_inode_post_setattr(struct dentry *dentry)
+void ima_inode_post_setattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry)
 {
 	struct inode *inode = d_backing_inode(dentry);
 	struct integrity_iint_cache *iint;
@@ -518,7 +521,7 @@ void ima_inode_post_setattr(struct dentry *dentry)
 	    || !(inode->i_opflags & IOP_XATTR))
 		return;
 
-	action = ima_must_appraise(inode, MAY_ACCESS, POST_SETATTR);
+	action = ima_must_appraise(mnt_userns, inode, MAY_ACCESS, POST_SETATTR);
 	if (!action)
 		__vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_IMA);
 	iint = integrity_iint_find(inode);
diff --git a/security/integrity/ima/ima_asymmetric_keys.c b/security/integrity/ima/ima_asymmetric_keys.c
index 1c68c500c26f..c4ef69100789 100644
--- a/security/integrity/ima/ima_asymmetric_keys.c
+++ b/security/integrity/ima/ima_asymmetric_keys.c
@@ -10,6 +10,7 @@
  */
 
 #include <keys/asymmetric-type.h>
+#include <linux/user_namespace.h>
 #include "ima.h"
 
 /**
@@ -58,7 +59,7 @@ void ima_post_key_create_or_update(struct key *keyring, struct key *key,
 	 * if the IMA policy is configured to measure a key linked
 	 * to the given keyring.
 	 */
-	process_buffer_measurement(NULL, payload, payload_len,
+	process_buffer_measurement(&init_user_ns, NULL, payload, payload_len,
 				   keyring->description, KEY_CHECK, 0,
 				   keyring->description);
 }
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index f87cb29329e9..cb1c56eccd6d 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -218,8 +218,8 @@ static int process_measurement(struct file *file, const struct cred *cred,
 	 * bitmask based on the appraise/audit/measurement policy.
 	 * Included is the appraise submask.
 	 */
-	action = ima_get_action(inode, cred, secid, mask, func, &pcr,
-				&template_desc, NULL);
+	action = ima_get_action(file_mnt_user_ns(file), inode, cred, secid,
+				mask, func, &pcr, &template_desc, NULL);
 	violation_check = ((func == FILE_CHECK || func == MMAP_CHECK) &&
 			   (ima_policy_flag & IMA_MEASURE));
 	if (!action && !violation_check)
@@ -431,8 +431,9 @@ int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot)
 
 	security_task_getsecid(current, &secid);
 	inode = file_inode(vma->vm_file);
-	action = ima_get_action(inode, current_cred(), secid, MAY_EXEC,
-				MMAP_CHECK, &pcr, &template, 0);
+	action = ima_get_action(file_mnt_user_ns(vma->vm_file), inode,
+				current_cred(), secid, MAY_EXEC, MMAP_CHECK,
+				&pcr, &template, 0);
 
 	/* Is the mmap'ed file in policy? */
 	if (!(action & (IMA_MEASURE | IMA_APPRAISE_SUBMASK)))
@@ -592,18 +593,21 @@ EXPORT_SYMBOL_GPL(ima_inode_hash);
 
 /**
  * ima_post_create_tmpfile - mark newly created tmpfile as new
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @file : newly created tmpfile
  *
  * No measuring, appraising or auditing of newly created tmpfiles is needed.
  * Skip calling process_measurement(), but indicate which newly, created
  * tmpfiles are in policy.
  */
-void ima_post_create_tmpfile(struct inode *inode)
+void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
+			     struct inode *inode)
 {
 	struct integrity_iint_cache *iint;
 	int must_appraise;
 
-	must_appraise = ima_must_appraise(inode, MAY_ACCESS, FILE_CHECK);
+	must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS,
+					  FILE_CHECK);
 	if (!must_appraise)
 		return;
 
@@ -619,18 +623,21 @@ void ima_post_create_tmpfile(struct inode *inode)
 
 /**
  * ima_post_path_mknod - mark as a new inode
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dentry: newly created dentry
  *
  * Mark files created via the mknodat syscall as new, so that the
  * file data can be written later.
  */
-void ima_post_path_mknod(struct dentry *dentry)
+void ima_post_path_mknod(struct user_namespace *mnt_userns,
+			 struct dentry *dentry)
 {
 	struct integrity_iint_cache *iint;
 	struct inode *inode = dentry->d_inode;
 	int must_appraise;
 
-	must_appraise = ima_must_appraise(inode, MAY_ACCESS, FILE_CHECK);
+	must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS,
+					  FILE_CHECK);
 	if (!must_appraise)
 		return;
 
@@ -810,6 +817,7 @@ int ima_post_load_data(char *buf, loff_t size,
 
 /*
  * process_buffer_measurement - Measure the buffer to ima log.
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode: inode associated with the object being measured (NULL for KEY_CHECK)
  * @buf: pointer to the buffer that needs to be added to the log.
  * @size: size of buffer(in bytes).
@@ -820,7 +828,8 @@ int ima_post_load_data(char *buf, loff_t size,
  *
  * Based on policy, the buffer is measured into the ima log.
  */
-void process_buffer_measurement(struct inode *inode, const void *buf, int size,
+void process_buffer_measurement(struct user_namespace *mnt_userns,
+				struct inode *inode, const void *buf, int size,
 				const char *eventname, enum ima_hooks func,
 				int pcr, const char *keyring)
 {
@@ -860,8 +869,9 @@ void process_buffer_measurement(struct inode *inode, const void *buf, int size,
 	 */
 	if (func) {
 		security_task_getsecid(current, &secid);
-		action = ima_get_action(inode, current_cred(), secid, 0, func,
-					&pcr, &template, keyring);
+		action = ima_get_action(mnt_userns, inode, current_cred(),
+					secid, 0, func, &pcr, &template,
+					keyring);
 		if (!(action & IMA_MEASURE))
 			return;
 	}
@@ -919,8 +929,9 @@ void ima_kexec_cmdline(int kernel_fd, const void *buf, int size)
 	if (!f.file)
 		return;
 
-	process_buffer_measurement(file_inode(f.file), buf, size,
-				   "kexec-cmdline", KEXEC_CMDLINE, 0, NULL);
+	process_buffer_measurement(file_mnt_user_ns(f.file), file_inode(f.file),
+				   buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0,
+				   NULL);
 	fdput(f);
 }
 
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 823a0c1379cb..e14426c24a19 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -488,6 +488,7 @@ static bool ima_match_keyring(struct ima_rule_entry *rule,
 /**
  * ima_match_rules - determine whether an inode matches the policy rule.
  * @rule: a pointer to a rule
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode: a pointer to an inode
  * @cred: a pointer to a credentials structure for user validation
  * @secid: the secid of the task to be validated
@@ -497,9 +498,10 @@ static bool ima_match_keyring(struct ima_rule_entry *rule,
  *
  * Returns true on rule match, false on failure.
  */
-static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
-			    const struct cred *cred, u32 secid,
-			    enum ima_hooks func, int mask,
+static bool ima_match_rules(struct ima_rule_entry *rule,
+			    struct user_namespace *mnt_userns,
+			    struct inode *inode, const struct cred *cred,
+			    u32 secid, enum ima_hooks func, int mask,
 			    const char *keyring)
 {
 	int i;
@@ -539,7 +541,7 @@ static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
 	}
 
 	if ((rule->flags & IMA_FOWNER) &&
-	    !rule->fowner_op(inode->i_uid, rule->fowner))
+	    !rule->fowner_op(i_uid_into_mnt(mnt_userns, inode), rule->fowner))
 		return false;
 	for (i = 0; i < MAX_LSM_RULES; i++) {
 		int rc = 0;
@@ -602,6 +604,7 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func)
 
 /**
  * ima_match_policy - decision based on LSM and other conditions
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode: pointer to an inode for which the policy decision is being made
  * @cred: pointer to a credentials structure for which the policy decision is
  *        being made
@@ -620,8 +623,9 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func)
  * list when walking it.  Reads are many orders of magnitude more numerous
  * than writes so ima_match_policy() is classical RCU candidate.
  */
-int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
-		     enum ima_hooks func, int mask, int flags, int *pcr,
+int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode,
+		     const struct cred *cred, u32 secid, enum ima_hooks func,
+		     int mask, int flags, int *pcr,
 		     struct ima_template_desc **template_desc,
 		     const char *keyring)
 {
@@ -637,8 +641,8 @@ int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
 		if (!(entry->action & actmask))
 			continue;
 
-		if (!ima_match_rules(entry, inode, cred, secid, func, mask,
-				     keyring))
+		if (!ima_match_rules(entry, mnt_userns, inode, cred, secid,
+				     func, mask, keyring))
 			continue;
 
 		action |= entry->flags & IMA_ACTION_FLAGS;
diff --git a/security/integrity/ima/ima_queue_keys.c b/security/integrity/ima/ima_queue_keys.c
index 69a8626a35c0..ca3dea19da18 100644
--- a/security/integrity/ima/ima_queue_keys.c
+++ b/security/integrity/ima/ima_queue_keys.c
@@ -8,6 +8,7 @@
  *       Enables deferred processing of keys
  */
 
+#include <linux/user_namespace.h>
 #include <linux/workqueue.h>
 #include <keys/asymmetric-type.h>
 #include "ima.h"
@@ -158,7 +159,8 @@ void ima_process_queued_keys(void)
 
 	list_for_each_entry_safe(entry, tmp, &ima_keys, list) {
 		if (!timer_expired)
-			process_buffer_measurement(NULL, entry->payload,
+			process_buffer_measurement(&init_user_ns, NULL,
+						   entry->payload,
 						   entry->payload_len,
 						   entry->keyring_name,
 						   KEY_CHECK, 0,
-- 
cgit v1.2.3


From 2a1867219c7b27f928e2545782b86daaf9ad50bd Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:53 +0100
Subject: fs: add mount_setattr()

This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.

The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:

int mount_setattr(int dfd, const char *path, unsigned flags,
                  struct mount_attr *uattr, size_t usize);

Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.

The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:

struct mount_attr {
	__u64 attr_set;
	__u64 attr_clr;
	__u64 propagation;
	__u64 userns_fd;
};

The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.

Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.

The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.

The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.

[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")

Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 arch/alpha/kernel/syscalls/syscall.tbl      |   1 +
 arch/arm/tools/syscall.tbl                  |   1 +
 arch/arm64/include/asm/unistd.h             |   2 +-
 arch/arm64/include/asm/unistd32.h           |   2 +
 arch/ia64/kernel/syscalls/syscall.tbl       |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   1 +
 arch/s390/kernel/syscalls/syscall.tbl       |   1 +
 arch/sh/kernel/syscalls/syscall.tbl         |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   1 +
 fs/namespace.c                              | 263 ++++++++++++++++++++++++++++
 include/linux/syscalls.h                    |   4 +
 include/uapi/asm-generic/unistd.h           |   4 +-
 include/uapi/linux/mount.h                  |  15 ++
 tools/include/uapi/asm-generic/unistd.h     |   4 +-
 23 files changed, 307 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index a6617067dbe6..02f0244e005c 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -481,3 +481,4 @@
 549	common	faccessat2			sys_faccessat2
 550	common	process_madvise			sys_process_madvise
 551	common	epoll_pwait2			sys_epoll_pwait2
+552	common	mount_setattr			sys_mount_setattr
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 20e1170e2e0a..dcc1191291a2 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -455,3 +455,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 86a9d7b3eabe..949788f5ba40 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		442
+#define __NR_compat_syscalls		443
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index cccfbbefbf95..3d874f624056 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -891,6 +891,8 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
 #define __NR_epoll_pwait2 441
 __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2)
+#define __NR_mount_setattr 442
+__SYSCALL(__NR_mount_setattr, sys_mount_setattr)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index bfc00f2bd437..d89231166e19 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -362,3 +362,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 7fe4e45c864c..72bde6707dd3 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -441,3 +441,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index a522adf194ab..d603a5ec9338 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -447,3 +447,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 0f03ad223f33..8fd8c1790941 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -380,3 +380,4 @@
 439	n32	faccessat2			sys_faccessat2
 440	n32	process_madvise			sys_process_madvise
 441	n32	epoll_pwait2			compat_sys_epoll_pwait2
+442	n32	mount_setattr			sys_mount_setattr
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 91649690b52f..169f21438065 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -356,3 +356,4 @@
 439	n64	faccessat2			sys_faccessat2
 440	n64	process_madvise			sys_process_madvise
 441	n64	epoll_pwait2			sys_epoll_pwait2
+442	n64	mount_setattr			sys_mount_setattr
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 4bad0c40aed6..090d29ca80ff 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -429,3 +429,4 @@
 439	o32	faccessat2			sys_faccessat2
 440	o32	process_madvise			sys_process_madvise
 441	o32	epoll_pwait2			sys_epoll_pwait2		compat_sys_epoll_pwait2
+442	o32	mount_setattr			sys_mount_setattr
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 6bcc31966b44..271a92519683 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -439,3 +439,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2		compat_sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index f744eb5cba88..72e5aa67ab8a 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -531,3 +531,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2		compat_sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index d443423495e5..3abef2144dac 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -444,3 +444,4 @@
 439  common	faccessat2		sys_faccessat2			sys_faccessat2
 440  common	process_madvise		sys_process_madvise		sys_process_madvise
 441  common	epoll_pwait2		sys_epoll_pwait2		compat_sys_epoll_pwait2
+442  common	mount_setattr		sys_mount_setattr		sys_mount_setattr
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 9df40ac0ebc0..d08eebad6b7f 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -444,3 +444,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 40d8c7cd8298..84403a99039c 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -487,3 +487,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2		compat_sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 874aeacde2dd..a1c9f496fca6 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -446,3 +446,4 @@
 439	i386	faccessat2		sys_faccessat2
 440	i386	process_madvise		sys_process_madvise
 441	i386	epoll_pwait2		sys_epoll_pwait2		compat_sys_epoll_pwait2
+442	i386	mount_setattr		sys_mount_setattr
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 78672124d28b..7bf01cbe582f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -363,6 +363,7 @@
 439	common	faccessat2		sys_faccessat2
 440	common	process_madvise		sys_process_madvise
 441	common	epoll_pwait2		sys_epoll_pwait2
+442	common	mount_setattr		sys_mount_setattr
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 46116a28eeed..365a9b849224 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -412,3 +412,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/fs/namespace.c b/fs/namespace.c
index 00ed0d6cb2ee..726a51c8c46e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -73,6 +73,14 @@ static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 
+struct mount_kattr {
+	unsigned int attr_set;
+	unsigned int attr_clr;
+	unsigned int propagation;
+	unsigned int lookup_flags;
+	bool recurse;
+};
+
 /* /sys/fs */
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);
@@ -3469,6 +3477,11 @@ out_type:
 	(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
 	 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME)
 
+#define MOUNT_SETATTR_VALID_FLAGS FSMOUNT_VALID_FLAGS
+
+#define MOUNT_SETATTR_PROPAGATION_FLAGS \
+	(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
+
 static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
 {
 	unsigned int mnt_flags = 0;
@@ -3820,6 +3833,256 @@ out0:
 	return error;
 }
 
+static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
+{
+	unsigned int flags = mnt->mnt.mnt_flags;
+
+	/*  flags to clear */
+	flags &= ~kattr->attr_clr;
+	/* flags to raise */
+	flags |= kattr->attr_set;
+
+	return flags;
+}
+
+static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
+					   struct mount *mnt, int *err)
+{
+	struct mount *m = mnt, *last = NULL;
+
+	if (!is_mounted(&m->mnt)) {
+		*err = -EINVAL;
+		goto out;
+	}
+
+	if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) {
+		*err = -EINVAL;
+		goto out;
+	}
+
+	do {
+		unsigned int flags;
+
+		flags = recalc_flags(kattr, m);
+		if (!can_change_locked_flags(m, flags)) {
+			*err = -EPERM;
+			goto out;
+		}
+
+		last = m;
+
+		if ((kattr->attr_set & MNT_READONLY) &&
+		    !(m->mnt.mnt_flags & MNT_READONLY)) {
+			*err = mnt_hold_writers(m);
+			if (*err)
+				goto out;
+		}
+	} while (kattr->recurse && (m = next_mnt(m, mnt)));
+
+out:
+	return last;
+}
+
+static void mount_setattr_commit(struct mount_kattr *kattr,
+				 struct mount *mnt, struct mount *last,
+				 int err)
+{
+	struct mount *m = mnt;
+
+	do {
+		if (!err) {
+			unsigned int flags;
+
+			flags = recalc_flags(kattr, m);
+			WRITE_ONCE(m->mnt.mnt_flags, flags);
+		}
+
+		/*
+		 * We either set MNT_READONLY above so make it visible
+		 * before ~MNT_WRITE_HOLD or we failed to recursively
+		 * apply mount options.
+		 */
+		if ((kattr->attr_set & MNT_READONLY) &&
+		    (m->mnt.mnt_flags & MNT_WRITE_HOLD))
+			mnt_unhold_writers(m);
+
+		if (!err && kattr->propagation)
+			change_mnt_propagation(m, kattr->propagation);
+
+		/*
+		 * On failure, only cleanup until we found the first mount
+		 * we failed to handle.
+		 */
+		if (err && m == last)
+			break;
+	} while (kattr->recurse && (m = next_mnt(m, mnt)));
+
+	if (!err)
+		touch_mnt_namespace(mnt->mnt_ns);
+}
+
+static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
+{
+	struct mount *mnt = real_mount(path->mnt), *last = NULL;
+	int err = 0;
+
+	if (path->dentry != mnt->mnt.mnt_root)
+		return -EINVAL;
+
+	if (kattr->propagation) {
+		/*
+		 * Only take namespace_lock() if we're actually changing
+		 * propagation.
+		 */
+		namespace_lock();
+		if (kattr->propagation == MS_SHARED) {
+			err = invent_group_ids(mnt, kattr->recurse);
+			if (err) {
+				namespace_unlock();
+				return err;
+			}
+		}
+	}
+
+	lock_mount_hash();
+
+	/*
+	 * Get the mount tree in a shape where we can change mount
+	 * properties without failure.
+	 */
+	last = mount_setattr_prepare(kattr, mnt, &err);
+	if (last) /* Commit all changes or revert to the old state. */
+		mount_setattr_commit(kattr, mnt, last, err);
+
+	unlock_mount_hash();
+
+	if (kattr->propagation) {
+		namespace_unlock();
+		if (err)
+			cleanup_group_ids(mnt, NULL);
+	}
+
+	return err;
+}
+
+static int build_mount_kattr(const struct mount_attr *attr,
+			     struct mount_kattr *kattr, unsigned int flags)
+{
+	unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+
+	if (flags & AT_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (flags & AT_SYMLINK_NOFOLLOW)
+		lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+
+	*kattr = (struct mount_kattr) {
+		.lookup_flags	= lookup_flags,
+		.recurse	= !!(flags & AT_RECURSIVE),
+	};
+
+	if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
+		return -EINVAL;
+	if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
+		return -EINVAL;
+	kattr->propagation = attr->propagation;
+
+	if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
+		return -EINVAL;
+
+	if (attr->userns_fd)
+		return -EINVAL;
+
+	kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
+	kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
+
+	/*
+	 * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
+	 * users wanting to transition to a different atime setting cannot
+	 * simply specify the atime setting in @attr_set, but must also
+	 * specify MOUNT_ATTR__ATIME in the @attr_clr field.
+	 * So ensure that MOUNT_ATTR__ATIME can't be partially set in
+	 * @attr_clr and that @attr_set can't have any atime bits set if
+	 * MOUNT_ATTR__ATIME isn't set in @attr_clr.
+	 */
+	if (attr->attr_clr & MOUNT_ATTR__ATIME) {
+		if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
+			return -EINVAL;
+
+		/*
+		 * Clear all previous time settings as they are mutually
+		 * exclusive.
+		 */
+		kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
+		switch (attr->attr_set & MOUNT_ATTR__ATIME) {
+		case MOUNT_ATTR_RELATIME:
+			kattr->attr_set |= MNT_RELATIME;
+			break;
+		case MOUNT_ATTR_NOATIME:
+			kattr->attr_set |= MNT_NOATIME;
+			break;
+		case MOUNT_ATTR_STRICTATIME:
+			break;
+		default:
+			return -EINVAL;
+		}
+	} else {
+		if (attr->attr_set & MOUNT_ATTR__ATIME)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
+		unsigned int, flags, struct mount_attr __user *, uattr,
+		size_t, usize)
+{
+	int err;
+	struct path target;
+	struct mount_attr attr;
+	struct mount_kattr kattr;
+
+	BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
+
+	if (flags & ~(AT_EMPTY_PATH |
+		      AT_RECURSIVE |
+		      AT_SYMLINK_NOFOLLOW |
+		      AT_NO_AUTOMOUNT))
+		return -EINVAL;
+
+	if (unlikely(usize > PAGE_SIZE))
+		return -E2BIG;
+	if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
+		return -EINVAL;
+
+	if (!may_mount())
+		return -EPERM;
+
+	err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
+	if (err)
+		return err;
+
+	/* Don't bother walking through the mounts if this is a nop. */
+	if (attr.attr_set == 0 &&
+	    attr.attr_clr == 0 &&
+	    attr.propagation == 0)
+		return 0;
+
+	err = build_mount_kattr(&attr, &kattr, flags);
+	if (err)
+		return err;
+
+	err = user_path_at(dfd, path, kattr.lookup_flags, &target);
+	if (err)
+		return err;
+
+	err = do_mount_setattr(&target, &kattr);
+	path_put(&target);
+	return err;
+}
+
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7688bc983de5..cd7b5c817ba2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -68,6 +68,7 @@ union bpf_attr;
 struct io_uring_params;
 struct clone_args;
 struct open_how;
+struct mount_attr;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -1028,6 +1029,9 @@ asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
 asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
 			       int to_dfd, const char __user *to_path,
 			       unsigned int ms_flags);
+asmlinkage long sys_mount_setattr(int dfd, const char __user *path,
+				  unsigned int flags,
+				  struct mount_attr __user *uattr, size_t usize);
 asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
 asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key,
 			     const void __user *value, int aux);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 728752917785..ce58cff99b66 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -861,9 +861,11 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
 #define __NR_epoll_pwait2 441
 __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
+#define __NR_mount_setattr 442
+__SYSCALL(__NR_mount_setattr, sys_mount_setattr)
 
 #undef __NR_syscalls
-#define __NR_syscalls 442
+#define __NR_syscalls 443
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index dd8306ea336c..d3bcefdfa73d 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -1,6 +1,8 @@
 #ifndef _UAPI_LINUX_MOUNT_H
 #define _UAPI_LINUX_MOUNT_H
 
+#include <linux/types.h>
+
 /*
  * These are the fs-independent mount-flags: up to 32 flags are supported
  *
@@ -118,4 +120,17 @@ enum fsconfig_command {
 #define MOUNT_ATTR_STRICTATIME	0x00000020 /* - Always perform atime updates */
 #define MOUNT_ATTR_NODIRATIME	0x00000080 /* Do not update directory access times */
 
+/*
+ * mount_setattr()
+ */
+struct mount_attr {
+	__u64 attr_set;
+	__u64 attr_clr;
+	__u64 propagation;
+	__u64 userns_fd;
+};
+
+/* List of all mount_attr versions. */
+#define MOUNT_ATTR_SIZE_VER0	32 /* sizeof first published struct */
+
 #endif /* _UAPI_LINUX_MOUNT_H */
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index 728752917785..ce58cff99b66 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -861,9 +861,11 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
 #define __NR_epoll_pwait2 441
 __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
+#define __NR_mount_setattr 442
+__SYSCALL(__NR_mount_setattr, sys_mount_setattr)
 
 #undef __NR_syscalls
-#define __NR_syscalls 442
+#define __NR_syscalls 443
 
 /*
  * 32 bit systems traditionally used different
-- 
cgit v1.2.3


From 9caccd41541a6f7d6279928d9f971f6642c361af Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:54 +0100
Subject: fs: introduce MOUNT_ATTR_IDMAP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a new mount bind mount property to allow idmapping mounts. The
MOUNT_ATTR_IDMAP flag can be set via the new mount_setattr() syscall
together with a file descriptor referring to a user namespace.

The user namespace referenced by the namespace file descriptor will be
attached to the bind mount. All interactions with the filesystem going
through that mount will be mapped according to the mapping specified in
the user namespace attached to it.

Using user namespaces to mark mounts means we can reuse all the existing
infrastructure in the kernel that already exists to handle idmappings
and can also use this for permission checking to allow unprivileged user
to create idmapped mounts in the future.

Idmapping a mount is decoupled from the caller's user and mount
namespace. This means idmapped mounts can be created in the initial
user namespace which is an important use-case for systemd-homed,
portable usb-sticks between systems, sharing data between the initial
user namespace and unprivileged containers, and other use-cases that
have been brought up. For example, assume a home directory where all
files are owned by uid and gid 1000 and the home directory is brought to
a new laptop where the user has id 12345. The system administrator can
simply create a mount of this home directory with a mapping of
1000:12345:1 and other mappings to indicate the ids should be kept.
(With this it is e.g. also possible to create idmapped mounts on the
host with an identity mapping 1:1:100000 where the root user is not
mapped. A user with root access that e.g. has been pivot rooted into
such a mount on the host will be not be able to execute, read, write, or
create files as root.)

Given that mapping a mount is decoupled from the caller's user namespace
a sufficiently privileged process such as a container manager can set up
an idmapped mount for the container and the container can simply pivot
root to it. There's no need for the container to do anything. The mount
will appear correctly mapped independent of the user namespace the
container uses. This means we don't need to mark a mount as idmappable.

In order to create an idmapped mount the caller must currently be
privileged in the user namespace of the superblock the mount belongs to.
Once a mount has been idmapped we don't allow it to change its mapping.
This keeps permission checking and life-cycle management simple. Users
wanting to change the idmapped can always create a new detached mount
with a different idmapping.

Link: https://lore.kernel.org/r/20210121131959.646623-36-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Mauricio Vásquez Bernal <mauricio@kinvolk.io>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/namespace.c             | 122 ++++++++++++++++++++++++++++++++++++++++++---
 fs/proc_namespace.c        |   3 ++
 include/linux/mount.h      |   3 +-
 include/uapi/linux/mount.h |   1 +
 4 files changed, 121 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index 726a51c8c46e..062fe984a697 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -25,6 +25,7 @@
 #include <linux/proc_ns.h>
 #include <linux/magic.h>
 #include <linux/memblock.h>
+#include <linux/proc_fs.h>
 #include <linux/task_work.h>
 #include <linux/sched/task.h>
 #include <uapi/linux/mount.h>
@@ -79,6 +80,7 @@ struct mount_kattr {
 	unsigned int propagation;
 	unsigned int lookup_flags;
 	bool recurse;
+	struct user_namespace *mnt_userns;
 };
 
 /* /sys/fs */
@@ -3477,7 +3479,7 @@ out_type:
 	(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
 	 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME)
 
-#define MOUNT_SETATTR_VALID_FLAGS FSMOUNT_VALID_FLAGS
+#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
 
 #define MOUNT_SETATTR_PROPAGATION_FLAGS \
 	(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
@@ -3845,6 +3847,36 @@ static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
 	return flags;
 }
 
+static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
+{
+	struct vfsmount *m = &mnt->mnt;
+
+	if (!kattr->mnt_userns)
+		return 0;
+
+	/*
+	 * Once a mount has been idmapped we don't allow it to change its
+	 * mapping. It makes things simpler and callers can just create
+	 * another bind-mount they can idmap if they want to.
+	 */
+	if (mnt_user_ns(m) != &init_user_ns)
+		return -EPERM;
+
+	/* The underlying filesystem doesn't support idmapped mounts yet. */
+	if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
+		return -EINVAL;
+
+	/* We're not controlling the superblock. */
+	if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* Mount has already been visible in the filesystem hierarchy. */
+	if (!is_anon_ns(mnt->mnt_ns))
+		return -EINVAL;
+
+	return 0;
+}
+
 static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
 					   struct mount *mnt, int *err)
 {
@@ -3869,6 +3901,10 @@ static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
 			goto out;
 		}
 
+		*err = can_idmap_mount(kattr, m);
+		if (*err)
+			goto out;
+
 		last = m;
 
 		if ((kattr->attr_set & MNT_READONLY) &&
@@ -3883,6 +3919,18 @@ out:
 	return last;
 }
 
+static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
+{
+	struct user_namespace *mnt_userns;
+
+	if (!kattr->mnt_userns)
+		return;
+
+	mnt_userns = get_user_ns(kattr->mnt_userns);
+	/* Pairs with smp_load_acquire() in mnt_user_ns(). */
+	smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
+}
+
 static void mount_setattr_commit(struct mount_kattr *kattr,
 				 struct mount *mnt, struct mount *last,
 				 int err)
@@ -3893,6 +3941,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr,
 		if (!err) {
 			unsigned int flags;
 
+			do_idmap_mount(kattr, m);
 			flags = recalc_flags(kattr, m);
 			WRITE_ONCE(m->mnt.mnt_flags, flags);
 		}
@@ -3965,7 +4014,62 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
 	return err;
 }
 
-static int build_mount_kattr(const struct mount_attr *attr,
+static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
+				struct mount_kattr *kattr, unsigned int flags)
+{
+	int err = 0;
+	struct ns_common *ns;
+	struct user_namespace *mnt_userns;
+	struct file *file;
+
+	if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
+		return 0;
+
+	/*
+	 * We currently do not support clearing an idmapped mount. If this ever
+	 * is a use-case we can revisit this but for now let's keep it simple
+	 * and not allow it.
+	 */
+	if (attr->attr_clr & MOUNT_ATTR_IDMAP)
+		return -EINVAL;
+
+	if (attr->userns_fd > INT_MAX)
+		return -EINVAL;
+
+	file = fget(attr->userns_fd);
+	if (!file)
+		return -EBADF;
+
+	if (!proc_ns_file(file)) {
+		err = -EINVAL;
+		goto out_fput;
+	}
+
+	ns = get_proc_ns(file_inode(file));
+	if (ns->ops->type != CLONE_NEWUSER) {
+		err = -EINVAL;
+		goto out_fput;
+	}
+
+	/*
+	 * The init_user_ns is used to indicate that a vfsmount is not idmapped.
+	 * This is simpler than just having to treat NULL as unmapped. Users
+	 * wanting to idmap a mount to init_user_ns can just use a namespace
+	 * with an identity mapping.
+	 */
+	mnt_userns = container_of(ns, struct user_namespace, ns);
+	if (mnt_userns == &init_user_ns) {
+		err = -EPERM;
+		goto out_fput;
+	}
+	kattr->mnt_userns = get_user_ns(mnt_userns);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
 			     struct mount_kattr *kattr, unsigned int flags)
 {
 	unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
@@ -3991,9 +4095,6 @@ static int build_mount_kattr(const struct mount_attr *attr,
 	if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
 		return -EINVAL;
 
-	if (attr->userns_fd)
-		return -EINVAL;
-
 	kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
 	kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
 
@@ -4032,7 +4133,13 @@ static int build_mount_kattr(const struct mount_attr *attr,
 			return -EINVAL;
 	}
 
-	return 0;
+	return build_mount_idmapped(attr, usize, kattr, flags);
+}
+
+static void finish_mount_kattr(struct mount_kattr *kattr)
+{
+	put_user_ns(kattr->mnt_userns);
+	kattr->mnt_userns = NULL;
 }
 
 SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
@@ -4070,7 +4177,7 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
 	    attr.propagation == 0)
 		return 0;
 
-	err = build_mount_kattr(&attr, &kattr, flags);
+	err = build_mount_kattr(&attr, usize, &kattr, flags);
 	if (err)
 		return err;
 
@@ -4079,6 +4186,7 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
 		return err;
 
 	err = do_mount_setattr(&target, &kattr);
+	finish_mount_kattr(&kattr);
 	path_put(&target);
 	return err;
 }
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index eafb75755fa3..392ef5162655 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -79,6 +79,9 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
 		if (mnt->mnt_flags & fs_infop->flag)
 			seq_puts(m, fs_infop->str);
 	}
+
+	if (mnt_user_ns(mnt) != &init_user_ns)
+		seq_puts(m, ",idmapped");
 }
 
 static inline void mangle(struct seq_file *m, const char *s)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 52de25e08319..161f4419db6c 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -77,7 +77,8 @@ struct vfsmount {
 
 static inline struct user_namespace *mnt_user_ns(const struct vfsmount *mnt)
 {
-	return mnt->mnt_userns;
+	/* Pairs with smp_store_release() in do_idmap_mount(). */
+	return smp_load_acquire(&mnt->mnt_userns);
 }
 
 struct file; /* forward dec */
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index d3bcefdfa73d..e6524ead2b7b 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -119,6 +119,7 @@ enum fsconfig_command {
 #define MOUNT_ATTR_NOATIME	0x00000010 /* - Do not update access times. */
 #define MOUNT_ATTR_STRICTATIME	0x00000020 /* - Always perform atime updates */
 #define MOUNT_ATTR_NODIRATIME	0x00000080 /* Do not update directory access times */
+#define MOUNT_ATTR_IDMAP	0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */
 
 /*
  * mount_setattr()
-- 
cgit v1.2.3


From 882227626459ce7a24191f13c6d9749a00992059 Mon Sep 17 00:00:00 2001
From: Scott Branden <scott.branden@broadcom.com>
Date: Wed, 20 Jan 2021 09:58:15 -0800
Subject: bcm-vk: add bcm_vk UAPI

Add user space api for bcm-vk driver.

Provide ioctl api to load images and issue reset command to card.
FW status registers in PCI BAR space also defined as part
of API so that user space is able to interpret these memory locations
as needed via direct PCIe access.

Acked-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Scott Branden <scott.branden@broadcom.com>
Link: https://lore.kernel.org/r/20210120175827.14820-2-scott.branden@broadcom.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/misc/bcm_vk.h | 84 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 include/uapi/linux/misc/bcm_vk.h

(limited to 'include')

diff --git a/include/uapi/linux/misc/bcm_vk.h b/include/uapi/linux/misc/bcm_vk.h
new file mode 100644
index 000000000000..ec28e0bd46a9
--- /dev/null
+++ b/include/uapi/linux/misc/bcm_vk.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright 2018-2020 Broadcom.
+ */
+
+#ifndef __UAPI_LINUX_MISC_BCM_VK_H
+#define __UAPI_LINUX_MISC_BCM_VK_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define BCM_VK_MAX_FILENAME 64
+
+struct vk_image {
+	__u32 type; /* Type of image */
+#define VK_IMAGE_TYPE_BOOT1 1 /* 1st stage (load to SRAM) */
+#define VK_IMAGE_TYPE_BOOT2 2 /* 2nd stage (load to DDR) */
+	__u8 filename[BCM_VK_MAX_FILENAME]; /* Filename of image */
+};
+
+struct vk_reset {
+	__u32 arg1;
+	__u32 arg2;
+};
+
+#define VK_MAGIC		0x5e
+
+/* Load image to Valkyrie */
+#define VK_IOCTL_LOAD_IMAGE	_IOW(VK_MAGIC, 0x2, struct vk_image)
+
+/* Send Reset to Valkyrie */
+#define VK_IOCTL_RESET		_IOW(VK_MAGIC, 0x4, struct vk_reset)
+
+/*
+ * Firmware Status accessed directly via BAR space
+ */
+#define VK_BAR_FWSTS			0x41c
+#define VK_BAR_COP_FWSTS		0x428
+/* VK_FWSTS definitions */
+#define VK_FWSTS_RELOCATION_ENTRY	(1UL << 0)
+#define VK_FWSTS_RELOCATION_EXIT	(1UL << 1)
+#define VK_FWSTS_INIT_START		(1UL << 2)
+#define VK_FWSTS_ARCH_INIT_DONE		(1UL << 3)
+#define VK_FWSTS_PRE_KNL1_INIT_DONE	(1UL << 4)
+#define VK_FWSTS_PRE_KNL2_INIT_DONE	(1UL << 5)
+#define VK_FWSTS_POST_KNL_INIT_DONE	(1UL << 6)
+#define VK_FWSTS_INIT_DONE		(1UL << 7)
+#define VK_FWSTS_APP_INIT_START		(1UL << 8)
+#define VK_FWSTS_APP_INIT_DONE		(1UL << 9)
+#define VK_FWSTS_MASK			0xffffffff
+#define VK_FWSTS_READY			(VK_FWSTS_INIT_START | \
+					 VK_FWSTS_ARCH_INIT_DONE | \
+					 VK_FWSTS_PRE_KNL1_INIT_DONE | \
+					 VK_FWSTS_PRE_KNL2_INIT_DONE | \
+					 VK_FWSTS_POST_KNL_INIT_DONE | \
+					 VK_FWSTS_INIT_DONE | \
+					 VK_FWSTS_APP_INIT_START | \
+					 VK_FWSTS_APP_INIT_DONE)
+/* Deinit */
+#define VK_FWSTS_APP_DEINIT_START	(1UL << 23)
+#define VK_FWSTS_APP_DEINIT_DONE	(1UL << 24)
+#define VK_FWSTS_DRV_DEINIT_START	(1UL << 25)
+#define VK_FWSTS_DRV_DEINIT_DONE	(1UL << 26)
+#define VK_FWSTS_RESET_DONE		(1UL << 27)
+#define VK_FWSTS_DEINIT_TRIGGERED	(VK_FWSTS_APP_DEINIT_START | \
+					 VK_FWSTS_APP_DEINIT_DONE  | \
+					 VK_FWSTS_DRV_DEINIT_START | \
+					 VK_FWSTS_DRV_DEINIT_DONE)
+/* Last nibble for reboot reason */
+#define VK_FWSTS_RESET_REASON_SHIFT	28
+#define VK_FWSTS_RESET_REASON_MASK	(0xf << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_SYS_PWRUP	(0x0 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_MBOX_DB		(0x1 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_M7_WDOG		(0x2 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_TEMP		(0x3 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_PCI_FLR		(0x4 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_PCI_HOT		(0x5 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_PCI_WARM		(0x6 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_PCI_COLD		(0x7 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_L1		(0x8 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_L0		(0x9 << VK_FWSTS_RESET_REASON_SHIFT)
+#define VK_FWSTS_RESET_UNKNOWN		(0xf << VK_FWSTS_RESET_REASON_SHIFT)
+
+#endif /* __UAPI_LINUX_MISC_BCM_VK_H */
-- 
cgit v1.2.3


From ceecd1bff6f9a741adc173f062f1f9312c3a66c8 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 19 Jan 2021 12:07:22 -0800
Subject: HID: correct kernel-doc notation in <linux/hid*.h>

Correct kernel-doc notation in HID header files (include/linux/hid*.h).
Add notation (comments) where it is missing.
Use the documented "Return:" notation for function return values.
Fix a few typos/spellos.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Cc: linux-input@vger.kernel.org
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/hid-sensor-hub.h |  9 +++++----
 include/linux/hid.h            | 15 +++++++++++----
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h
index 46bcef380446..763802b2b8f9 100644
--- a/include/linux/hid-sensor-hub.h
+++ b/include/linux/hid-sensor-hub.h
@@ -150,7 +150,7 @@ int sensor_hub_remove_callback(struct hid_sensor_hub_device *hsdev,
 * @info:	return information about attribute after parsing report
 *
 * Parses report and returns the attribute information such as report id,
-* field index, units and exponet etc.
+* field index, units and exponent etc.
 */
 int sensor_hub_input_get_attribute_info(struct hid_sensor_hub_device *hsdev,
 			u8 type,
@@ -167,7 +167,7 @@ int sensor_hub_input_get_attribute_info(struct hid_sensor_hub_device *hsdev,
 * @is_signed:   If true then fields < 32 bits will be sign-extended
 *
 * Issues a synchronous or asynchronous read request for an input attribute.
-* Returns data upto 32 bits.
+* Return: data up to 32 bits.
 */
 
 enum sensor_hub_read_flags {
@@ -205,8 +205,9 @@ int sensor_hub_set_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
 * @buffer:	buffer to copy output
 *
 * Used to get a field in feature report. For example this can get polling
-* interval, sensitivity, activate/deactivate state. On success it returns
-* number of bytes copied to buffer. On failure, it returns value < 0.
+* interval, sensitivity, activate/deactivate state.
+* Return: On success, it returns the number of bytes copied to buffer.
+* On failure, it returns value < 0.
 */
 int sensor_hub_get_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
 			   u32 field_index, int buffer_size, void *buffer);
diff --git a/include/linux/hid.h b/include/linux/hid.h
index c39d71eb1fd0..ef702b3f56e3 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -918,7 +918,7 @@ __u32 hid_field_extract(const struct hid_device *hid, __u8 *report,
 /**
  * hid_device_io_start - enable HID input during probe, remove
  *
- * @hid - the device
+ * @hid: the device
  *
  * This should only be called during probe or remove and only be
  * called by the thread calling probe or remove. It will allow
@@ -936,7 +936,7 @@ static inline void hid_device_io_start(struct hid_device *hid) {
 /**
  * hid_device_io_stop - disable HID input during probe, remove
  *
- * @hid - the device
+ * @hid: the device
  *
  * Should only be called after hid_device_io_start. It will prevent
  * incoming packets from going to the driver for the duration of
@@ -1010,6 +1010,13 @@ static inline void hid_map_usage(struct hid_input *hidinput,
 /**
  * hid_map_usage_clear - map usage input bits and clear the input bit
  *
+ * @hidinput: hidinput which we are interested in
+ * @usage: usage to fill in
+ * @bit: pointer to input->{}bit (out parameter)
+ * @max: maximal valid usage->code to consider later (out parameter)
+ * @type: input event type (EV_KEY, EV_REL, ...)
+ * @c: code which corresponds to this usage and type
+ *
  * The same as hid_map_usage, except the @c bit is also cleared in supported
  * bits (@bit).
  */
@@ -1084,7 +1091,7 @@ static inline void hid_hw_request(struct hid_device *hdev,
  * @rtype: HID report type
  * @reqtype: HID_REQ_GET_REPORT or HID_REQ_SET_REPORT
  *
- * @return: count of data transfered, negative if error
+ * Return: count of data transferred, negative if error
  *
  * Same behavior as hid_hw_request, but with raw buffers instead.
  */
@@ -1106,7 +1113,7 @@ static inline int hid_hw_raw_request(struct hid_device *hdev,
  * @buf: raw data to transfer
  * @len: length of buf
  *
- * @return: count of data transfered, negative if error
+ * Return: count of data transferred, negative if error
  */
 static inline int hid_hw_output_report(struct hid_device *hdev, __u8 *buf,
 					size_t len)
-- 
cgit v1.2.3


From 081df94301e317e84c3413686043987da2c3e39d Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 21 Jan 2021 15:29:23 -0600
Subject: objtool: Add asm version of STACK_FRAME_NON_STANDARD

To be used for adding asm functions to the ignore list.  The "aw" is
needed to help the ELF section metadata match GCC-created sections.
Otherwise the linker creates duplicate sections instead of combining
them.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/8faa476f9a5ac89af27944ec184c89f95f3c6c49.1611263462.git.jpoimboe@redhat.com
---
 include/linux/objtool.h       | 8 ++++++++
 tools/include/linux/objtool.h | 8 ++++++++
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 577f51436cf9..add1c6eb157e 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -109,6 +109,12 @@ struct unwind_hint {
 	.popsection
 .endm
 
+.macro STACK_FRAME_NON_STANDARD func:req
+	.pushsection .discard.func_stack_frame_non_standard, "aw"
+		.long \func - .
+	.popsection
+.endm
+
 #endif /* __ASSEMBLY__ */
 
 #else /* !CONFIG_STACK_VALIDATION */
@@ -122,6 +128,8 @@ struct unwind_hint {
 #define ANNOTATE_INTRA_FUNCTION_CALL
 .macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0
 .endm
+.macro STACK_FRAME_NON_STANDARD func:req
+.endm
 #endif
 
 #endif /* CONFIG_STACK_VALIDATION */
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
index 577f51436cf9..add1c6eb157e 100644
--- a/tools/include/linux/objtool.h
+++ b/tools/include/linux/objtool.h
@@ -109,6 +109,12 @@ struct unwind_hint {
 	.popsection
 .endm
 
+.macro STACK_FRAME_NON_STANDARD func:req
+	.pushsection .discard.func_stack_frame_non_standard, "aw"
+		.long \func - .
+	.popsection
+.endm
+
 #endif /* __ASSEMBLY__ */
 
 #else /* !CONFIG_STACK_VALIDATION */
@@ -122,6 +128,8 @@ struct unwind_hint {
 #define ANNOTATE_INTRA_FUNCTION_CALL
 .macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0
 .endm
+.macro STACK_FRAME_NON_STANDARD func:req
+.endm
 #endif
 
 #endif /* CONFIG_STACK_VALIDATION */
-- 
cgit v1.2.3


From b735bd3e68824316655252a931a3353a6ebc036f Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 21 Jan 2021 15:29:24 -0600
Subject: objtool: Combine UNWIND_HINT_RET_OFFSET and UNWIND_HINT_FUNC

The ORC metadata generated for UNWIND_HINT_FUNC isn't actually very
func-like.  With certain usages it can cause stack state mismatches
because it doesn't set the return address (CFI_RA).

Also, users of UNWIND_HINT_RET_OFFSET no longer need to set a custom
return stack offset.  Instead they just need to specify a func-like
situation, so the current ret_offset code is hacky for no good reason.

Solve both problems by simplifying the RET_OFFSET handling and
converting it into a more useful UNWIND_HINT_FUNC.

If we end up needing the old 'ret_offset' functionality again in the
future, we should be able to support it pretty easily with the addition
of a custom 'sp_offset' in UNWIND_HINT_FUNC.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/db9d1f5d79dddfbb3725ef6d8ec3477ad199948d.1611263462.git.jpoimboe@redhat.com
---
 arch/x86/include/asm/unwind_hints.h   | 13 ++----------
 arch/x86/kernel/ftrace_64.S           |  2 +-
 arch/x86/lib/retpoline.S              |  2 +-
 include/linux/objtool.h               |  5 ++++-
 tools/include/linux/objtool.h         |  5 ++++-
 tools/objtool/arch/x86/decode.c       |  4 ++--
 tools/objtool/check.c                 | 37 ++++++++++++++---------------------
 tools/objtool/include/objtool/check.h |  1 -
 8 files changed, 29 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 664d4610d700..8e574c0afef8 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -48,17 +48,8 @@
 	UNWIND_HINT_REGS base=\base offset=\offset partial=1
 .endm
 
-.macro UNWIND_HINT_FUNC sp_offset=8
-	UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL
-.endm
-
-/*
- * RET_OFFSET: Used on instructions that terminate a function; mostly RETURN
- * and sibling calls. On these, sp_offset denotes the expected offset from
- * initial_func_cfi.
- */
-.macro UNWIND_HINT_RET_OFFSET sp_offset=8
-	UNWIND_HINT sp_reg=ORC_REG_SP type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset
+.macro UNWIND_HINT_FUNC
+	UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC
 .endm
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 58d125ed9d1a..1bf568d901b1 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -277,7 +277,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
 	restore_mcount_regs 8
 	/* Restore flags */
 	popfq
-	UNWIND_HINT_RET_OFFSET
+	UNWIND_HINT_FUNC
 	jmp	ftrace_epilogue
 
 SYM_FUNC_END(ftrace_regs_caller)
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index b4c43a9b1483..f6fb1d218dcc 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -28,7 +28,7 @@ SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg)
 	jmp	.Lspec_trap_\@
 .Ldo_rop_\@:
 	mov	%\reg, (%_ASM_SP)
-	UNWIND_HINT_RET_OFFSET
+	UNWIND_HINT_FUNC
 	ret
 SYM_FUNC_END(__x86_retpoline_\reg)
 
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index add1c6eb157e..7e72d975cb76 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -29,11 +29,14 @@ struct unwind_hint {
  *
  * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
  * sp_reg+sp_offset points to the iret return frame.
+ *
+ * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
+ * Useful for code which doesn't have an ELF function annotation.
  */
 #define UNWIND_HINT_TYPE_CALL		0
 #define UNWIND_HINT_TYPE_REGS		1
 #define UNWIND_HINT_TYPE_REGS_PARTIAL	2
-#define UNWIND_HINT_TYPE_RET_OFFSET	3
+#define UNWIND_HINT_TYPE_FUNC		3
 
 #ifdef CONFIG_STACK_VALIDATION
 
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
index add1c6eb157e..7e72d975cb76 100644
--- a/tools/include/linux/objtool.h
+++ b/tools/include/linux/objtool.h
@@ -29,11 +29,14 @@ struct unwind_hint {
  *
  * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
  * sp_reg+sp_offset points to the iret return frame.
+ *
+ * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
+ * Useful for code which doesn't have an ELF function annotation.
  */
 #define UNWIND_HINT_TYPE_CALL		0
 #define UNWIND_HINT_TYPE_REGS		1
 #define UNWIND_HINT_TYPE_REGS_PARTIAL	2
-#define UNWIND_HINT_TYPE_RET_OFFSET	3
+#define UNWIND_HINT_TYPE_FUNC		3
 
 #ifdef CONFIG_STACK_VALIDATION
 
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 6baa22732ca6..9637e3bf5ab8 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -563,8 +563,8 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state)
 	state->cfa.offset = 8;
 
 	/* initial RA (return address) */
-	state->regs[16].base = CFI_CFA;
-	state->regs[16].offset = -8;
+	state->regs[CFI_RA].base = CFI_CFA;
+	state->regs[CFI_RA].offset = -8;
 }
 
 const char *arch_nop_insn(int len)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index b4e1655017de..f88f20327bf2 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1404,13 +1404,20 @@ static int add_jump_table_alts(struct objtool_file *file)
 	return 0;
 }
 
+static void set_func_state(struct cfi_state *state)
+{
+	state->cfa = initial_func_cfi.cfa;
+	memcpy(&state->regs, &initial_func_cfi.regs,
+	       CFI_NUM_REGS * sizeof(struct cfi_reg));
+	state->stack_size = initial_func_cfi.cfa.offset;
+}
+
 static int read_unwind_hints(struct objtool_file *file)
 {
 	struct section *sec, *relocsec;
 	struct reloc *reloc;
 	struct unwind_hint *hint;
 	struct instruction *insn;
-	struct cfi_reg *cfa;
 	int i;
 
 	sec = find_section_by_name(file->elf, ".discard.unwind_hints");
@@ -1445,22 +1452,20 @@ static int read_unwind_hints(struct objtool_file *file)
 			return -1;
 		}
 
-		cfa = &insn->cfi.cfa;
+		insn->hint = true;
 
-		if (hint->type == UNWIND_HINT_TYPE_RET_OFFSET) {
-			insn->ret_offset = bswap_if_needed(hint->sp_offset);
+		if (hint->type == UNWIND_HINT_TYPE_FUNC) {
+			set_func_state(&insn->cfi);
 			continue;
 		}
 
-		insn->hint = true;
-
 		if (arch_decode_hint_reg(insn, hint->sp_reg)) {
 			WARN_FUNC("unsupported unwind_hint sp base reg %d",
 				  insn->sec, insn->offset, hint->sp_reg);
 			return -1;
 		}
 
-		cfa->offset = bswap_if_needed(hint->sp_offset);
+		insn->cfi.cfa.offset = bswap_if_needed(hint->sp_offset);
 		insn->cfi.type = hint->type;
 		insn->cfi.end = hint->end;
 	}
@@ -1716,27 +1721,18 @@ static bool is_fentry_call(struct instruction *insn)
 
 static bool has_modified_stack_frame(struct instruction *insn, struct insn_state *state)
 {
-	u8 ret_offset = insn->ret_offset;
 	struct cfi_state *cfi = &state->cfi;
 	int i;
 
 	if (cfi->cfa.base != initial_func_cfi.cfa.base || cfi->drap)
 		return true;
 
-	if (cfi->cfa.offset != initial_func_cfi.cfa.offset + ret_offset)
+	if (cfi->cfa.offset != initial_func_cfi.cfa.offset)
 		return true;
 
-	if (cfi->stack_size != initial_func_cfi.cfa.offset + ret_offset)
+	if (cfi->stack_size != initial_func_cfi.cfa.offset)
 		return true;
 
-	/*
-	 * If there is a ret offset hint then don't check registers
-	 * because a callee-saved register might have been pushed on
-	 * the stack.
-	 */
-	if (ret_offset)
-		return false;
-
 	for (i = 0; i < CFI_NUM_REGS; i++) {
 		if (cfi->regs[i].base != initial_func_cfi.regs[i].base ||
 		    cfi->regs[i].offset != initial_func_cfi.regs[i].offset)
@@ -2880,10 +2876,7 @@ static int validate_section(struct objtool_file *file, struct section *sec)
 			continue;
 
 		init_insn_state(&state, sec);
-		state.cfi.cfa = initial_func_cfi.cfa;
-		memcpy(&state.cfi.regs, &initial_func_cfi.regs,
-		       CFI_NUM_REGS * sizeof(struct cfi_reg));
-		state.cfi.stack_size = initial_func_cfi.cfa.offset;
+		set_func_state(&state.cfi);
 
 		warnings += validate_symbol(file, sec, func, &state);
 	}
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index b408636c0201..4891ead0e85f 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -50,7 +50,6 @@ struct instruction {
 	bool retpoline_safe;
 	s8 instr;
 	u8 visited;
-	u8 ret_offset;
 	struct alt_group *alt_group;
 	struct symbol *call_dest;
 	struct instruction *jump_dest;
-- 
cgit v1.2.3


From d07b6621d948944c6828fc9b5cbdcb6f389b3b04 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Mon, 18 Jan 2021 09:15:48 -0300
Subject: dmaengine: imx-sdma: Remove platform data support

Since 5.10-rc1, i.MX has been converted to a devicetree-only platform.

The platform data support in this driver was only used for non-DT
platforms.

Remove the platform data support as it has no more users.

Signed-off-by: Fabio Estevam <festevam@gmail.com>
Link: https://lore.kernel.org/r/20210118121549.1625217-1-festevam@gmail.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/imx-sdma.c                     | 35 +++++++++---------------------
 include/linux/platform_data/dma-imx-sdma.h | 11 ----------
 2 files changed, 10 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index 41ba21eea7c8..a68950f80635 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -1961,7 +1961,6 @@ static int sdma_probe(struct platform_device *pdev)
 	int irq;
 	struct resource *iores;
 	struct resource spba_res;
-	struct sdma_platform_data *pdata = dev_get_platdata(&pdev->dev);
 	int i;
 	struct sdma_engine *sdma;
 	s32 *saddr_arr;
@@ -2063,8 +2062,6 @@ static int sdma_probe(struct platform_device *pdev)
 
 	if (sdma->drvdata->script_addrs)
 		sdma_add_scripts(sdma, sdma->drvdata->script_addrs);
-	if (pdata && pdata->script_addrs)
-		sdma_add_scripts(sdma, pdata->script_addrs);
 
 	sdma->dma_device.dev = &pdev->dev;
 
@@ -2110,30 +2107,18 @@ static int sdma_probe(struct platform_device *pdev)
 	}
 
 	/*
-	 * Kick off firmware loading as the very last step:
-	 * attempt to load firmware only if we're not on the error path, because
-	 * the firmware callback requires a fully functional and allocated sdma
-	 * instance.
+	 * Because that device tree does not encode ROM script address,
+	 * the RAM script in firmware is mandatory for device tree
+	 * probe, otherwise it fails.
 	 */
-	if (pdata) {
-		ret = sdma_get_firmware(sdma, pdata->fw_name);
-		if (ret)
-			dev_warn(&pdev->dev, "failed to get firmware from platform data\n");
+	ret = of_property_read_string(np, "fsl,sdma-ram-script-name",
+				      &fw_name);
+	if (ret) {
+		dev_warn(&pdev->dev, "failed to get firmware name\n");
 	} else {
-		/*
-		 * Because that device tree does not encode ROM script address,
-		 * the RAM script in firmware is mandatory for device tree
-		 * probe, otherwise it fails.
-		 */
-		ret = of_property_read_string(np, "fsl,sdma-ram-script-name",
-					      &fw_name);
-		if (ret) {
-			dev_warn(&pdev->dev, "failed to get firmware name\n");
-		} else {
-			ret = sdma_get_firmware(sdma, fw_name);
-			if (ret)
-				dev_warn(&pdev->dev, "failed to get firmware from device tree\n");
-		}
+		ret = sdma_get_firmware(sdma, fw_name);
+		if (ret)
+			dev_warn(&pdev->dev, "failed to get firmware from device tree\n");
 	}
 
 	return 0;
diff --git a/include/linux/platform_data/dma-imx-sdma.h b/include/linux/platform_data/dma-imx-sdma.h
index 30e676b36b24..725602d9df91 100644
--- a/include/linux/platform_data/dma-imx-sdma.h
+++ b/include/linux/platform_data/dma-imx-sdma.h
@@ -57,15 +57,4 @@ struct sdma_script_start_addrs {
 	/* End of v4 array */
 };
 
-/**
- * struct sdma_platform_data - platform specific data for SDMA engine
- *
- * @fw_name		The firmware name
- * @script_addrs	SDMA scripts addresses in SDMA ROM
- */
-struct sdma_platform_data {
-	char *fw_name;
-	struct sdma_script_start_addrs *script_addrs;
-};
-
 #endif /* __MACH_MXC_SDMA_H__ */
-- 
cgit v1.2.3


From ec6ab42f5aadd765b0b8c4e2d21508ac1e20f2ed Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 20 Jan 2021 14:18:57 +0100
Subject: dmaengine: remove sirfsoc driver

The CSR SiRF prima2/atlas platforms are getting removed, so this driver
is no longer needed.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Barry Song <baohua@kernel.org>
Cc: Barry Song <baohua@kernel.org>
Link: https://lore.kernel.org/r/20210120131859.2056308-2-arnd@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 .../devicetree/bindings/dma/sirfsoc-dma.txt        |   44 -
 drivers/dma/Kconfig                                |    7 -
 drivers/dma/Makefile                               |    1 -
 drivers/dma/sirf-dma.c                             | 1170 --------------------
 include/linux/sirfsoc_dma.h                        |    7 -
 5 files changed, 1229 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/dma/sirfsoc-dma.txt
 delete mode 100644 drivers/dma/sirf-dma.c
 delete mode 100644 include/linux/sirfsoc_dma.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt b/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt
deleted file mode 100644
index ccd52d6a231a..000000000000
--- a/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-* CSR SiRFSoC DMA controller
-
-See dma.txt first
-
-Required properties:
-- compatible: Should be "sirf,prima2-dmac", "sirf,atlas7-dmac" or
-  "sirf,atlas7-dmac-v2"
-- reg: Should contain DMA registers location and length.
-- interrupts: Should contain one interrupt shared by all channel
-- #dma-cells: must be <1>. used to represent the number of integer
-    cells in the dmas property of client device.
-- clocks: clock required
-
-Example:
-
-Controller:
-dmac0: dma-controller@b00b0000 {
-	compatible = "sirf,prima2-dmac";
-	reg = <0xb00b0000 0x10000>;
-	interrupts = <12>;
-	clocks = <&clks 24>;
-	#dma-cells = <1>;
-};
-
-
-Client:
-Fill the specific dma request line in dmas. In the below example, spi0 read
-channel request line is 9 of the 2nd dma controller, while write channel uses
-4 of the 2nd dma controller; spi1 read channel request line is 12 of the 1st
-dma controller, while write channel uses 13 of the 1st dma controller:
-
-spi0: spi@b00d0000 {
-	compatible = "sirf,prima2-spi";
-	dmas = <&dmac1 9>,
-		<&dmac1 4>;
-	dma-names = "rx", "tx";
-};
-
-spi1: spi@b0170000 {
-	compatible = "sirf,prima2-spi";
-	dmas = <&dmac0 12>,
-		<&dmac0 13>;
-	dma-names = "rx", "tx";
-};
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 6c92f075f7ce..2201ff280f7a 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -519,13 +519,6 @@ config PLX_DMA
 	  These are exposed via extra functions on the switch's
 	  upstream port. Each function exposes one DMA channel.
 
-config SIRF_DMA
-	tristate "CSR SiRFprimaII/SiRFmarco DMA support"
-	depends on ARCH_SIRF
-	select DMA_ENGINE
-	help
-	  Enable support for the CSR SiRFprimaII DMA engine.
-
 config STE_DMA40
 	bool "ST-Ericsson DMA40 support"
 	depends on ARCH_U8500
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 649a4f95ea4b..b5b34c65b075 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -65,7 +65,6 @@ obj-$(CONFIG_PPC_BESTCOMM) += bestcomm/
 obj-$(CONFIG_PXA_DMA) += pxa_dma.o
 obj-$(CONFIG_RENESAS_DMA) += sh/
 obj-$(CONFIG_SF_PDMA) += sf-pdma/
-obj-$(CONFIG_SIRF_DMA) += sirf-dma.o
 obj-$(CONFIG_STE_DMA40) += ste_dma40.o ste_dma40_ll.o
 obj-$(CONFIG_STM32_DMA) += stm32-dma.o
 obj-$(CONFIG_STM32_DMAMUX) += stm32-dmamux.o
diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c
deleted file mode 100644
index a5c2843384fd..000000000000
--- a/drivers/dma/sirf-dma.c
+++ /dev/null
@@ -1,1170 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * DMA controller driver for CSR SiRFprimaII
- *
- * Copyright (c) 2011 Cambridge Silicon Radio Limited, a CSR plc group company.
- */
-
-#include <linux/module.h>
-#include <linux/dmaengine.h>
-#include <linux/dma-mapping.h>
-#include <linux/pm_runtime.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/of_irq.h>
-#include <linux/of_address.h>
-#include <linux/of_device.h>
-#include <linux/of_platform.h>
-#include <linux/clk.h>
-#include <linux/of_dma.h>
-#include <linux/sirfsoc_dma.h>
-
-#include "dmaengine.h"
-
-#define SIRFSOC_DMA_VER_A7V1                    1
-#define SIRFSOC_DMA_VER_A7V2                    2
-#define SIRFSOC_DMA_VER_A6                      4
-
-#define SIRFSOC_DMA_DESCRIPTORS                 16
-#define SIRFSOC_DMA_CHANNELS                    16
-#define SIRFSOC_DMA_TABLE_NUM                   256
-
-#define SIRFSOC_DMA_CH_ADDR                     0x00
-#define SIRFSOC_DMA_CH_XLEN                     0x04
-#define SIRFSOC_DMA_CH_YLEN                     0x08
-#define SIRFSOC_DMA_CH_CTRL                     0x0C
-
-#define SIRFSOC_DMA_WIDTH_0                     0x100
-#define SIRFSOC_DMA_CH_VALID                    0x140
-#define SIRFSOC_DMA_CH_INT                      0x144
-#define SIRFSOC_DMA_INT_EN                      0x148
-#define SIRFSOC_DMA_INT_EN_CLR                  0x14C
-#define SIRFSOC_DMA_CH_LOOP_CTRL                0x150
-#define SIRFSOC_DMA_CH_LOOP_CTRL_CLR            0x154
-#define SIRFSOC_DMA_WIDTH_ATLAS7                0x10
-#define SIRFSOC_DMA_VALID_ATLAS7                0x14
-#define SIRFSOC_DMA_INT_ATLAS7                  0x18
-#define SIRFSOC_DMA_INT_EN_ATLAS7               0x1c
-#define SIRFSOC_DMA_LOOP_CTRL_ATLAS7            0x20
-#define SIRFSOC_DMA_CUR_DATA_ADDR               0x34
-#define SIRFSOC_DMA_MUL_ATLAS7                  0x38
-#define SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7         0x158
-#define SIRFSOC_DMA_CH_LOOP_CTRL_CLR_ATLAS7     0x15C
-#define SIRFSOC_DMA_IOBG_SCMD_EN		0x800
-#define SIRFSOC_DMA_EARLY_RESP_SET		0x818
-#define SIRFSOC_DMA_EARLY_RESP_CLR		0x81C
-
-#define SIRFSOC_DMA_MODE_CTRL_BIT               4
-#define SIRFSOC_DMA_DIR_CTRL_BIT                5
-#define SIRFSOC_DMA_MODE_CTRL_BIT_ATLAS7        2
-#define SIRFSOC_DMA_CHAIN_CTRL_BIT_ATLAS7       3
-#define SIRFSOC_DMA_DIR_CTRL_BIT_ATLAS7         4
-#define SIRFSOC_DMA_TAB_NUM_ATLAS7              7
-#define SIRFSOC_DMA_CHAIN_INT_BIT_ATLAS7        5
-#define SIRFSOC_DMA_CHAIN_FLAG_SHIFT_ATLAS7     25
-#define SIRFSOC_DMA_CHAIN_ADDR_SHIFT            32
-
-#define SIRFSOC_DMA_INT_FINI_INT_ATLAS7         BIT(0)
-#define SIRFSOC_DMA_INT_CNT_INT_ATLAS7          BIT(1)
-#define SIRFSOC_DMA_INT_PAU_INT_ATLAS7          BIT(2)
-#define SIRFSOC_DMA_INT_LOOP_INT_ATLAS7         BIT(3)
-#define SIRFSOC_DMA_INT_INV_INT_ATLAS7          BIT(4)
-#define SIRFSOC_DMA_INT_END_INT_ATLAS7          BIT(5)
-#define SIRFSOC_DMA_INT_ALL_ATLAS7              0x3F
-
-/* xlen and dma_width register is in 4 bytes boundary */
-#define SIRFSOC_DMA_WORD_LEN			4
-#define SIRFSOC_DMA_XLEN_MAX_V1         0x800
-#define SIRFSOC_DMA_XLEN_MAX_V2         0x1000
-
-struct sirfsoc_dma_desc {
-	struct dma_async_tx_descriptor	desc;
-	struct list_head		node;
-
-	/* SiRFprimaII 2D-DMA parameters */
-
-	int             xlen;           /* DMA xlen */
-	int             ylen;           /* DMA ylen */
-	int             width;          /* DMA width */
-	int             dir;
-	bool            cyclic;         /* is loop DMA? */
-	bool            chain;          /* is chain DMA? */
-	u32             addr;		/* DMA buffer address */
-	u64 chain_table[SIRFSOC_DMA_TABLE_NUM]; /* chain tbl */
-};
-
-struct sirfsoc_dma_chan {
-	struct dma_chan			chan;
-	struct list_head		free;
-	struct list_head		prepared;
-	struct list_head		queued;
-	struct list_head		active;
-	struct list_head		completed;
-	unsigned long			happened_cyclic;
-	unsigned long			completed_cyclic;
-
-	/* Lock for this structure */
-	spinlock_t			lock;
-
-	int				mode;
-};
-
-struct sirfsoc_dma_regs {
-	u32				ctrl[SIRFSOC_DMA_CHANNELS];
-	u32				interrupt_en;
-};
-
-struct sirfsoc_dma {
-	struct dma_device		dma;
-	struct tasklet_struct		tasklet;
-	struct sirfsoc_dma_chan		channels[SIRFSOC_DMA_CHANNELS];
-	void __iomem			*base;
-	int				irq;
-	struct clk			*clk;
-	int				type;
-	void (*exec_desc)(struct sirfsoc_dma_desc *sdesc,
-		int cid, int burst_mode, void __iomem *base);
-	struct sirfsoc_dma_regs		regs_save;
-};
-
-struct sirfsoc_dmadata {
-	void (*exec)(struct sirfsoc_dma_desc *sdesc,
-		int cid, int burst_mode, void __iomem *base);
-	int type;
-};
-
-enum sirfsoc_dma_chain_flag {
-	SIRFSOC_DMA_CHAIN_NORMAL = 0x01,
-	SIRFSOC_DMA_CHAIN_PAUSE = 0x02,
-	SIRFSOC_DMA_CHAIN_LOOP = 0x03,
-	SIRFSOC_DMA_CHAIN_END = 0x04
-};
-
-#define DRV_NAME	"sirfsoc_dma"
-
-static int sirfsoc_dma_runtime_suspend(struct device *dev);
-
-/* Convert struct dma_chan to struct sirfsoc_dma_chan */
-static inline
-struct sirfsoc_dma_chan *dma_chan_to_sirfsoc_dma_chan(struct dma_chan *c)
-{
-	return container_of(c, struct sirfsoc_dma_chan, chan);
-}
-
-/* Convert struct dma_chan to struct sirfsoc_dma */
-static inline struct sirfsoc_dma *dma_chan_to_sirfsoc_dma(struct dma_chan *c)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(c);
-	return container_of(schan, struct sirfsoc_dma, channels[c->chan_id]);
-}
-
-static void sirfsoc_dma_execute_hw_a7v2(struct sirfsoc_dma_desc *sdesc,
-		int cid, int burst_mode, void __iomem *base)
-{
-	if (sdesc->chain) {
-		/* DMA v2 HW chain mode */
-		writel_relaxed((sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT_ATLAS7) |
-			       (sdesc->chain <<
-				SIRFSOC_DMA_CHAIN_CTRL_BIT_ATLAS7) |
-			       (0x8 << SIRFSOC_DMA_TAB_NUM_ATLAS7) | 0x3,
-			       base + SIRFSOC_DMA_CH_CTRL);
-	} else {
-		/* DMA v2 legacy mode */
-		writel_relaxed(sdesc->xlen, base + SIRFSOC_DMA_CH_XLEN);
-		writel_relaxed(sdesc->ylen, base + SIRFSOC_DMA_CH_YLEN);
-		writel_relaxed(sdesc->width, base + SIRFSOC_DMA_WIDTH_ATLAS7);
-		writel_relaxed((sdesc->width*((sdesc->ylen+1)>>1)),
-				base + SIRFSOC_DMA_MUL_ATLAS7);
-		writel_relaxed((sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT_ATLAS7) |
-			       (sdesc->chain <<
-				SIRFSOC_DMA_CHAIN_CTRL_BIT_ATLAS7) |
-			       0x3, base + SIRFSOC_DMA_CH_CTRL);
-	}
-	writel_relaxed(sdesc->chain ? SIRFSOC_DMA_INT_END_INT_ATLAS7 :
-		       (SIRFSOC_DMA_INT_FINI_INT_ATLAS7 |
-			SIRFSOC_DMA_INT_LOOP_INT_ATLAS7),
-		       base + SIRFSOC_DMA_INT_EN_ATLAS7);
-	writel(sdesc->addr, base + SIRFSOC_DMA_CH_ADDR);
-	if (sdesc->cyclic)
-		writel(0x10001, base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7);
-}
-
-static void sirfsoc_dma_execute_hw_a7v1(struct sirfsoc_dma_desc *sdesc,
-		int cid, int burst_mode, void __iomem *base)
-{
-	writel_relaxed(1, base + SIRFSOC_DMA_IOBG_SCMD_EN);
-	writel_relaxed((1 << cid), base + SIRFSOC_DMA_EARLY_RESP_SET);
-	writel_relaxed(sdesc->width, base + SIRFSOC_DMA_WIDTH_0 + cid * 4);
-	writel_relaxed(cid | (burst_mode << SIRFSOC_DMA_MODE_CTRL_BIT) |
-		       (sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT),
-		       base + cid * 0x10 + SIRFSOC_DMA_CH_CTRL);
-	writel_relaxed(sdesc->xlen, base + cid * 0x10 + SIRFSOC_DMA_CH_XLEN);
-	writel_relaxed(sdesc->ylen, base + cid * 0x10 + SIRFSOC_DMA_CH_YLEN);
-	writel_relaxed(readl_relaxed(base + SIRFSOC_DMA_INT_EN) |
-		       (1 << cid), base + SIRFSOC_DMA_INT_EN);
-	writel(sdesc->addr >> 2, base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR);
-	if (sdesc->cyclic) {
-		writel((1 << cid) | 1 << (cid + 16) |
-		       readl_relaxed(base + SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7),
-		       base + SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7);
-	}
-
-}
-
-static void sirfsoc_dma_execute_hw_a6(struct sirfsoc_dma_desc *sdesc,
-		int cid, int burst_mode, void __iomem *base)
-{
-	writel_relaxed(sdesc->width, base + SIRFSOC_DMA_WIDTH_0 + cid * 4);
-	writel_relaxed(cid | (burst_mode << SIRFSOC_DMA_MODE_CTRL_BIT) |
-		       (sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT),
-		       base + cid * 0x10 + SIRFSOC_DMA_CH_CTRL);
-	writel_relaxed(sdesc->xlen, base + cid * 0x10 + SIRFSOC_DMA_CH_XLEN);
-	writel_relaxed(sdesc->ylen, base + cid * 0x10 + SIRFSOC_DMA_CH_YLEN);
-	writel_relaxed(readl_relaxed(base + SIRFSOC_DMA_INT_EN) |
-		       (1 << cid), base + SIRFSOC_DMA_INT_EN);
-	writel(sdesc->addr >> 2, base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR);
-	if (sdesc->cyclic) {
-		writel((1 << cid) | 1 << (cid + 16) |
-		       readl_relaxed(base + SIRFSOC_DMA_CH_LOOP_CTRL),
-		       base + SIRFSOC_DMA_CH_LOOP_CTRL);
-	}
-
-}
-
-/* Execute all queued DMA descriptors */
-static void sirfsoc_dma_execute(struct sirfsoc_dma_chan *schan)
-{
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan);
-	int cid = schan->chan.chan_id;
-	struct sirfsoc_dma_desc *sdesc = NULL;
-	void __iomem *base;
-
-	/*
-	 * lock has been held by functions calling this, so we don't hold
-	 * lock again
-	 */
-	base = sdma->base;
-	sdesc = list_first_entry(&schan->queued, struct sirfsoc_dma_desc,
-				 node);
-	/* Move the first queued descriptor to active list */
-	list_move_tail(&sdesc->node, &schan->active);
-
-	if (sdma->type == SIRFSOC_DMA_VER_A7V2)
-		cid = 0;
-
-	/* Start the DMA transfer */
-	sdma->exec_desc(sdesc, cid, schan->mode, base);
-
-	if (sdesc->cyclic)
-		schan->happened_cyclic = schan->completed_cyclic = 0;
-}
-
-/* Interrupt handler */
-static irqreturn_t sirfsoc_dma_irq(int irq, void *data)
-{
-	struct sirfsoc_dma *sdma = data;
-	struct sirfsoc_dma_chan *schan;
-	struct sirfsoc_dma_desc *sdesc = NULL;
-	u32 is;
-	bool chain;
-	int ch;
-	void __iomem *reg;
-
-	switch (sdma->type) {
-	case SIRFSOC_DMA_VER_A6:
-	case SIRFSOC_DMA_VER_A7V1:
-		is = readl(sdma->base + SIRFSOC_DMA_CH_INT);
-		reg = sdma->base + SIRFSOC_DMA_CH_INT;
-		while ((ch = fls(is) - 1) >= 0) {
-			is &= ~(1 << ch);
-			writel_relaxed(1 << ch, reg);
-			schan = &sdma->channels[ch];
-			spin_lock(&schan->lock);
-			sdesc = list_first_entry(&schan->active,
-						 struct sirfsoc_dma_desc, node);
-			if (!sdesc->cyclic) {
-				/* Execute queued descriptors */
-				list_splice_tail_init(&schan->active,
-						      &schan->completed);
-				dma_cookie_complete(&sdesc->desc);
-				if (!list_empty(&schan->queued))
-					sirfsoc_dma_execute(schan);
-			} else
-				schan->happened_cyclic++;
-			spin_unlock(&schan->lock);
-		}
-		break;
-
-	case SIRFSOC_DMA_VER_A7V2:
-		is = readl(sdma->base + SIRFSOC_DMA_INT_ATLAS7);
-
-		reg = sdma->base + SIRFSOC_DMA_INT_ATLAS7;
-		writel_relaxed(SIRFSOC_DMA_INT_ALL_ATLAS7, reg);
-		schan = &sdma->channels[0];
-		spin_lock(&schan->lock);
-		sdesc = list_first_entry(&schan->active,
-					 struct sirfsoc_dma_desc, node);
-		if (!sdesc->cyclic) {
-			chain = sdesc->chain;
-			if ((chain && (is & SIRFSOC_DMA_INT_END_INT_ATLAS7)) ||
-				(!chain &&
-				(is & SIRFSOC_DMA_INT_FINI_INT_ATLAS7))) {
-				/* Execute queued descriptors */
-				list_splice_tail_init(&schan->active,
-						      &schan->completed);
-				dma_cookie_complete(&sdesc->desc);
-				if (!list_empty(&schan->queued))
-					sirfsoc_dma_execute(schan);
-			}
-		} else if (sdesc->cyclic && (is &
-					SIRFSOC_DMA_INT_LOOP_INT_ATLAS7))
-			schan->happened_cyclic++;
-
-		spin_unlock(&schan->lock);
-		break;
-
-	default:
-		break;
-	}
-
-	/* Schedule tasklet */
-	tasklet_schedule(&sdma->tasklet);
-
-	return IRQ_HANDLED;
-}
-
-/* process completed descriptors */
-static void sirfsoc_dma_process_completed(struct sirfsoc_dma *sdma)
-{
-	dma_cookie_t last_cookie = 0;
-	struct sirfsoc_dma_chan *schan;
-	struct sirfsoc_dma_desc *sdesc;
-	struct dma_async_tx_descriptor *desc;
-	unsigned long flags;
-	unsigned long happened_cyclic;
-	LIST_HEAD(list);
-	int i;
-
-	for (i = 0; i < sdma->dma.chancnt; i++) {
-		schan = &sdma->channels[i];
-
-		/* Get all completed descriptors */
-		spin_lock_irqsave(&schan->lock, flags);
-		if (!list_empty(&schan->completed)) {
-			list_splice_tail_init(&schan->completed, &list);
-			spin_unlock_irqrestore(&schan->lock, flags);
-
-			/* Execute callbacks and run dependencies */
-			list_for_each_entry(sdesc, &list, node) {
-				desc = &sdesc->desc;
-
-				dmaengine_desc_get_callback_invoke(desc, NULL);
-				last_cookie = desc->cookie;
-				dma_run_dependencies(desc);
-			}
-
-			/* Free descriptors */
-			spin_lock_irqsave(&schan->lock, flags);
-			list_splice_tail_init(&list, &schan->free);
-			schan->chan.completed_cookie = last_cookie;
-			spin_unlock_irqrestore(&schan->lock, flags);
-		} else {
-			if (list_empty(&schan->active)) {
-				spin_unlock_irqrestore(&schan->lock, flags);
-				continue;
-			}
-
-			/* for cyclic channel, desc is always in active list */
-			sdesc = list_first_entry(&schan->active,
-				struct sirfsoc_dma_desc, node);
-
-			/* cyclic DMA */
-			happened_cyclic = schan->happened_cyclic;
-			spin_unlock_irqrestore(&schan->lock, flags);
-
-			desc = &sdesc->desc;
-			while (happened_cyclic != schan->completed_cyclic) {
-				dmaengine_desc_get_callback_invoke(desc, NULL);
-				schan->completed_cyclic++;
-			}
-		}
-	}
-}
-
-/* DMA Tasklet */
-static void sirfsoc_dma_tasklet(struct tasklet_struct *t)
-{
-	struct sirfsoc_dma *sdma = from_tasklet(sdma, t, tasklet);
-
-	sirfsoc_dma_process_completed(sdma);
-}
-
-/* Submit descriptor to hardware */
-static dma_cookie_t sirfsoc_dma_tx_submit(struct dma_async_tx_descriptor *txd)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(txd->chan);
-	struct sirfsoc_dma_desc *sdesc;
-	unsigned long flags;
-	dma_cookie_t cookie;
-
-	sdesc = container_of(txd, struct sirfsoc_dma_desc, desc);
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	/* Move descriptor to queue */
-	list_move_tail(&sdesc->node, &schan->queued);
-
-	cookie = dma_cookie_assign(txd);
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return cookie;
-}
-
-static int sirfsoc_dma_slave_config(struct dma_chan *chan,
-				    struct dma_slave_config *config)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	unsigned long flags;
-
-	if ((config->src_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES) ||
-		(config->dst_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES))
-		return -EINVAL;
-
-	spin_lock_irqsave(&schan->lock, flags);
-	schan->mode = (config->src_maxburst == 4 ? 1 : 0);
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return 0;
-}
-
-static int sirfsoc_dma_terminate_all(struct dma_chan *chan)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan);
-	int cid = schan->chan.chan_id;
-	unsigned long flags;
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	switch (sdma->type) {
-	case SIRFSOC_DMA_VER_A7V1:
-		writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_INT_EN_CLR);
-		writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_INT);
-		writel_relaxed((1 << cid) | 1 << (cid + 16),
-			       sdma->base +
-			       SIRFSOC_DMA_CH_LOOP_CTRL_CLR_ATLAS7);
-		writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_VALID);
-		break;
-	case SIRFSOC_DMA_VER_A7V2:
-		writel_relaxed(0, sdma->base + SIRFSOC_DMA_INT_EN_ATLAS7);
-		writel_relaxed(SIRFSOC_DMA_INT_ALL_ATLAS7,
-			       sdma->base + SIRFSOC_DMA_INT_ATLAS7);
-		writel_relaxed(0, sdma->base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7);
-		writel_relaxed(0, sdma->base + SIRFSOC_DMA_VALID_ATLAS7);
-		break;
-	case SIRFSOC_DMA_VER_A6:
-		writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN) &
-			       ~(1 << cid), sdma->base + SIRFSOC_DMA_INT_EN);
-		writel_relaxed(readl_relaxed(sdma->base +
-					     SIRFSOC_DMA_CH_LOOP_CTRL) &
-			       ~((1 << cid) | 1 << (cid + 16)),
-			       sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
-		writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_VALID);
-		break;
-	default:
-		break;
-	}
-
-	list_splice_tail_init(&schan->active, &schan->free);
-	list_splice_tail_init(&schan->queued, &schan->free);
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return 0;
-}
-
-static int sirfsoc_dma_pause_chan(struct dma_chan *chan)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan);
-	int cid = schan->chan.chan_id;
-	unsigned long flags;
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	switch (sdma->type) {
-	case SIRFSOC_DMA_VER_A7V1:
-		writel_relaxed((1 << cid) | 1 << (cid + 16),
-			       sdma->base +
-			       SIRFSOC_DMA_CH_LOOP_CTRL_CLR_ATLAS7);
-		break;
-	case SIRFSOC_DMA_VER_A7V2:
-		writel_relaxed(0, sdma->base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7);
-		break;
-	case SIRFSOC_DMA_VER_A6:
-		writel_relaxed(readl_relaxed(sdma->base +
-					     SIRFSOC_DMA_CH_LOOP_CTRL) &
-			       ~((1 << cid) | 1 << (cid + 16)),
-			       sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
-		break;
-
-	default:
-		break;
-	}
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return 0;
-}
-
-static int sirfsoc_dma_resume_chan(struct dma_chan *chan)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan);
-	int cid = schan->chan.chan_id;
-	unsigned long flags;
-
-	spin_lock_irqsave(&schan->lock, flags);
-	switch (sdma->type) {
-	case SIRFSOC_DMA_VER_A7V1:
-		writel_relaxed((1 << cid) | 1 << (cid + 16),
-			       sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7);
-		break;
-	case SIRFSOC_DMA_VER_A7V2:
-		writel_relaxed(0x10001,
-			       sdma->base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7);
-		break;
-	case SIRFSOC_DMA_VER_A6:
-		writel_relaxed(readl_relaxed(sdma->base +
-					     SIRFSOC_DMA_CH_LOOP_CTRL) |
-			       ((1 << cid) | 1 << (cid + 16)),
-			       sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
-		break;
-
-	default:
-		break;
-	}
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return 0;
-}
-
-/* Alloc channel resources */
-static int sirfsoc_dma_alloc_chan_resources(struct dma_chan *chan)
-{
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma_desc *sdesc;
-	unsigned long flags;
-	LIST_HEAD(descs);
-	int i;
-
-	pm_runtime_get_sync(sdma->dma.dev);
-
-	/* Alloc descriptors for this channel */
-	for (i = 0; i < SIRFSOC_DMA_DESCRIPTORS; i++) {
-		sdesc = kzalloc(sizeof(*sdesc), GFP_KERNEL);
-		if (!sdesc) {
-			dev_notice(sdma->dma.dev, "Memory allocation error. "
-				"Allocated only %u descriptors\n", i);
-			break;
-		}
-
-		dma_async_tx_descriptor_init(&sdesc->desc, chan);
-		sdesc->desc.flags = DMA_CTRL_ACK;
-		sdesc->desc.tx_submit = sirfsoc_dma_tx_submit;
-
-		list_add_tail(&sdesc->node, &descs);
-	}
-
-	/* Return error only if no descriptors were allocated */
-	if (i == 0)
-		return -ENOMEM;
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	list_splice_tail_init(&descs, &schan->free);
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return i;
-}
-
-/* Free channel resources */
-static void sirfsoc_dma_free_chan_resources(struct dma_chan *chan)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
-	struct sirfsoc_dma_desc *sdesc, *tmp;
-	unsigned long flags;
-	LIST_HEAD(descs);
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	/* Channel must be idle */
-	BUG_ON(!list_empty(&schan->prepared));
-	BUG_ON(!list_empty(&schan->queued));
-	BUG_ON(!list_empty(&schan->active));
-	BUG_ON(!list_empty(&schan->completed));
-
-	/* Move data */
-	list_splice_tail_init(&schan->free, &descs);
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	/* Free descriptors */
-	list_for_each_entry_safe(sdesc, tmp, &descs, node)
-		kfree(sdesc);
-
-	pm_runtime_put(sdma->dma.dev);
-}
-
-/* Send pending descriptor to hardware */
-static void sirfsoc_dma_issue_pending(struct dma_chan *chan)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	unsigned long flags;
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	if (list_empty(&schan->active) && !list_empty(&schan->queued))
-		sirfsoc_dma_execute(schan);
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-}
-
-/* Check request completion status */
-static enum dma_status
-sirfsoc_dma_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
-	struct dma_tx_state *txstate)
-{
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	unsigned long flags;
-	enum dma_status ret;
-	struct sirfsoc_dma_desc *sdesc;
-	int cid = schan->chan.chan_id;
-	unsigned long dma_pos;
-	unsigned long dma_request_bytes;
-	unsigned long residue;
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	if (list_empty(&schan->active)) {
-		ret = dma_cookie_status(chan, cookie, txstate);
-		dma_set_residue(txstate, 0);
-		spin_unlock_irqrestore(&schan->lock, flags);
-		return ret;
-	}
-	sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc, node);
-	if (sdesc->cyclic)
-		dma_request_bytes = (sdesc->xlen + 1) * (sdesc->ylen + 1) *
-			(sdesc->width * SIRFSOC_DMA_WORD_LEN);
-	else
-		dma_request_bytes = sdesc->xlen * SIRFSOC_DMA_WORD_LEN;
-
-	ret = dma_cookie_status(chan, cookie, txstate);
-
-	if (sdma->type == SIRFSOC_DMA_VER_A7V2)
-		cid = 0;
-
-	if (sdma->type == SIRFSOC_DMA_VER_A7V2) {
-		dma_pos = readl_relaxed(sdma->base + SIRFSOC_DMA_CUR_DATA_ADDR);
-	} else {
-		dma_pos = readl_relaxed(
-			sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR) << 2;
-	}
-
-	residue = dma_request_bytes - (dma_pos - sdesc->addr);
-	dma_set_residue(txstate, residue);
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return ret;
-}
-
-static struct dma_async_tx_descriptor *sirfsoc_dma_prep_interleaved(
-	struct dma_chan *chan, struct dma_interleaved_template *xt,
-	unsigned long flags)
-{
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma_desc *sdesc = NULL;
-	unsigned long iflags;
-	int ret;
-
-	if ((xt->dir != DMA_MEM_TO_DEV) && (xt->dir != DMA_DEV_TO_MEM)) {
-		ret = -EINVAL;
-		goto err_dir;
-	}
-
-	/* Get free descriptor */
-	spin_lock_irqsave(&schan->lock, iflags);
-	if (!list_empty(&schan->free)) {
-		sdesc = list_first_entry(&schan->free, struct sirfsoc_dma_desc,
-			node);
-		list_del(&sdesc->node);
-	}
-	spin_unlock_irqrestore(&schan->lock, iflags);
-
-	if (!sdesc) {
-		/* try to free completed descriptors */
-		sirfsoc_dma_process_completed(sdma);
-		ret = 0;
-		goto no_desc;
-	}
-
-	/* Place descriptor in prepared list */
-	spin_lock_irqsave(&schan->lock, iflags);
-
-	/*
-	 * Number of chunks in a frame can only be 1 for prima2
-	 * and ylen (number of frame - 1) must be at least 0
-	 */
-	if ((xt->frame_size == 1) && (xt->numf > 0)) {
-		sdesc->cyclic = 0;
-		sdesc->xlen = xt->sgl[0].size / SIRFSOC_DMA_WORD_LEN;
-		sdesc->width = (xt->sgl[0].size + xt->sgl[0].icg) /
-				SIRFSOC_DMA_WORD_LEN;
-		sdesc->ylen = xt->numf - 1;
-		if (xt->dir == DMA_MEM_TO_DEV) {
-			sdesc->addr = xt->src_start;
-			sdesc->dir = 1;
-		} else {
-			sdesc->addr = xt->dst_start;
-			sdesc->dir = 0;
-		}
-
-		list_add_tail(&sdesc->node, &schan->prepared);
-	} else {
-		pr_err("sirfsoc DMA Invalid xfer\n");
-		ret = -EINVAL;
-		goto err_xfer;
-	}
-	spin_unlock_irqrestore(&schan->lock, iflags);
-
-	return &sdesc->desc;
-err_xfer:
-	spin_unlock_irqrestore(&schan->lock, iflags);
-no_desc:
-err_dir:
-	return ERR_PTR(ret);
-}
-
-static struct dma_async_tx_descriptor *
-sirfsoc_dma_prep_cyclic(struct dma_chan *chan, dma_addr_t addr,
-	size_t buf_len, size_t period_len,
-	enum dma_transfer_direction direction, unsigned long flags)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma_desc *sdesc = NULL;
-	unsigned long iflags;
-
-	/*
-	 * we only support cycle transfer with 2 period
-	 * If the X-length is set to 0, it would be the loop mode.
-	 * The DMA address keeps increasing until reaching the end of a loop
-	 * area whose size is defined by (DMA_WIDTH x (Y_LENGTH + 1)). Then
-	 * the DMA address goes back to the beginning of this area.
-	 * In loop mode, the DMA data region is divided into two parts, BUFA
-	 * and BUFB. DMA controller generates interrupts twice in each loop:
-	 * when the DMA address reaches the end of BUFA or the end of the
-	 * BUFB
-	 */
-	if (buf_len !=  2 * period_len)
-		return ERR_PTR(-EINVAL);
-
-	/* Get free descriptor */
-	spin_lock_irqsave(&schan->lock, iflags);
-	if (!list_empty(&schan->free)) {
-		sdesc = list_first_entry(&schan->free, struct sirfsoc_dma_desc,
-			node);
-		list_del(&sdesc->node);
-	}
-	spin_unlock_irqrestore(&schan->lock, iflags);
-
-	if (!sdesc)
-		return NULL;
-
-	/* Place descriptor in prepared list */
-	spin_lock_irqsave(&schan->lock, iflags);
-	sdesc->addr = addr;
-	sdesc->cyclic = 1;
-	sdesc->xlen = 0;
-	sdesc->ylen = buf_len / SIRFSOC_DMA_WORD_LEN - 1;
-	sdesc->width = 1;
-	list_add_tail(&sdesc->node, &schan->prepared);
-	spin_unlock_irqrestore(&schan->lock, iflags);
-
-	return &sdesc->desc;
-}
-
-/*
- * The DMA controller consists of 16 independent DMA channels.
- * Each channel is allocated to a different function
- */
-bool sirfsoc_dma_filter_id(struct dma_chan *chan, void *chan_id)
-{
-	unsigned int ch_nr = (unsigned int) chan_id;
-
-	if (ch_nr == chan->chan_id +
-		chan->device->dev_id * SIRFSOC_DMA_CHANNELS)
-		return true;
-
-	return false;
-}
-EXPORT_SYMBOL(sirfsoc_dma_filter_id);
-
-#define SIRFSOC_DMA_BUSWIDTHS \
-	(BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED) | \
-	BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | \
-	BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) | \
-	BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) | \
-	BIT(DMA_SLAVE_BUSWIDTH_8_BYTES))
-
-static struct dma_chan *of_dma_sirfsoc_xlate(struct of_phandle_args *dma_spec,
-	struct of_dma *ofdma)
-{
-	struct sirfsoc_dma *sdma = ofdma->of_dma_data;
-	unsigned int request = dma_spec->args[0];
-
-	if (request >= SIRFSOC_DMA_CHANNELS)
-		return NULL;
-
-	return dma_get_slave_channel(&sdma->channels[request].chan);
-}
-
-static int sirfsoc_dma_probe(struct platform_device *op)
-{
-	struct device_node *dn = op->dev.of_node;
-	struct device *dev = &op->dev;
-	struct dma_device *dma;
-	struct sirfsoc_dma *sdma;
-	struct sirfsoc_dma_chan *schan;
-	struct sirfsoc_dmadata *data;
-	struct resource res;
-	ulong regs_start, regs_size;
-	u32 id;
-	int ret, i;
-
-	sdma = devm_kzalloc(dev, sizeof(*sdma), GFP_KERNEL);
-	if (!sdma)
-		return -ENOMEM;
-
-	data = (struct sirfsoc_dmadata *)
-		(of_match_device(op->dev.driver->of_match_table,
-				 &op->dev)->data);
-	sdma->exec_desc = data->exec;
-	sdma->type = data->type;
-
-	if (of_property_read_u32(dn, "cell-index", &id)) {
-		dev_err(dev, "Fail to get DMAC index\n");
-		return -ENODEV;
-	}
-
-	sdma->irq = irq_of_parse_and_map(dn, 0);
-	if (!sdma->irq) {
-		dev_err(dev, "Error mapping IRQ!\n");
-		return -EINVAL;
-	}
-
-	sdma->clk = devm_clk_get(dev, NULL);
-	if (IS_ERR(sdma->clk)) {
-		dev_err(dev, "failed to get a clock.\n");
-		return PTR_ERR(sdma->clk);
-	}
-
-	ret = of_address_to_resource(dn, 0, &res);
-	if (ret) {
-		dev_err(dev, "Error parsing memory region!\n");
-		goto irq_dispose;
-	}
-
-	regs_start = res.start;
-	regs_size = resource_size(&res);
-
-	sdma->base = devm_ioremap(dev, regs_start, regs_size);
-	if (!sdma->base) {
-		dev_err(dev, "Error mapping memory region!\n");
-		ret = -ENOMEM;
-		goto irq_dispose;
-	}
-
-	ret = request_irq(sdma->irq, &sirfsoc_dma_irq, 0, DRV_NAME, sdma);
-	if (ret) {
-		dev_err(dev, "Error requesting IRQ!\n");
-		ret = -EINVAL;
-		goto irq_dispose;
-	}
-
-	dma = &sdma->dma;
-	dma->dev = dev;
-
-	dma->device_alloc_chan_resources = sirfsoc_dma_alloc_chan_resources;
-	dma->device_free_chan_resources = sirfsoc_dma_free_chan_resources;
-	dma->device_issue_pending = sirfsoc_dma_issue_pending;
-	dma->device_config = sirfsoc_dma_slave_config;
-	dma->device_pause = sirfsoc_dma_pause_chan;
-	dma->device_resume = sirfsoc_dma_resume_chan;
-	dma->device_terminate_all = sirfsoc_dma_terminate_all;
-	dma->device_tx_status = sirfsoc_dma_tx_status;
-	dma->device_prep_interleaved_dma = sirfsoc_dma_prep_interleaved;
-	dma->device_prep_dma_cyclic = sirfsoc_dma_prep_cyclic;
-	dma->src_addr_widths = SIRFSOC_DMA_BUSWIDTHS;
-	dma->dst_addr_widths = SIRFSOC_DMA_BUSWIDTHS;
-	dma->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
-
-	INIT_LIST_HEAD(&dma->channels);
-	dma_cap_set(DMA_SLAVE, dma->cap_mask);
-	dma_cap_set(DMA_CYCLIC, dma->cap_mask);
-	dma_cap_set(DMA_INTERLEAVE, dma->cap_mask);
-	dma_cap_set(DMA_PRIVATE, dma->cap_mask);
-
-	for (i = 0; i < SIRFSOC_DMA_CHANNELS; i++) {
-		schan = &sdma->channels[i];
-
-		schan->chan.device = dma;
-		dma_cookie_init(&schan->chan);
-
-		INIT_LIST_HEAD(&schan->free);
-		INIT_LIST_HEAD(&schan->prepared);
-		INIT_LIST_HEAD(&schan->queued);
-		INIT_LIST_HEAD(&schan->active);
-		INIT_LIST_HEAD(&schan->completed);
-
-		spin_lock_init(&schan->lock);
-		list_add_tail(&schan->chan.device_node, &dma->channels);
-	}
-
-	tasklet_setup(&sdma->tasklet, sirfsoc_dma_tasklet);
-
-	/* Register DMA engine */
-	dev_set_drvdata(dev, sdma);
-
-	ret = dma_async_device_register(dma);
-	if (ret)
-		goto free_irq;
-
-	/* Device-tree DMA controller registration */
-	ret = of_dma_controller_register(dn, of_dma_sirfsoc_xlate, sdma);
-	if (ret) {
-		dev_err(dev, "failed to register DMA controller\n");
-		goto unreg_dma_dev;
-	}
-
-	pm_runtime_enable(&op->dev);
-	dev_info(dev, "initialized SIRFSOC DMAC driver\n");
-
-	return 0;
-
-unreg_dma_dev:
-	dma_async_device_unregister(dma);
-free_irq:
-	free_irq(sdma->irq, sdma);
-irq_dispose:
-	irq_dispose_mapping(sdma->irq);
-	return ret;
-}
-
-static int sirfsoc_dma_remove(struct platform_device *op)
-{
-	struct device *dev = &op->dev;
-	struct sirfsoc_dma *sdma = dev_get_drvdata(dev);
-
-	of_dma_controller_free(op->dev.of_node);
-	dma_async_device_unregister(&sdma->dma);
-	free_irq(sdma->irq, sdma);
-	tasklet_kill(&sdma->tasklet);
-	irq_dispose_mapping(sdma->irq);
-	pm_runtime_disable(&op->dev);
-	if (!pm_runtime_status_suspended(&op->dev))
-		sirfsoc_dma_runtime_suspend(&op->dev);
-
-	return 0;
-}
-
-static int __maybe_unused sirfsoc_dma_runtime_suspend(struct device *dev)
-{
-	struct sirfsoc_dma *sdma = dev_get_drvdata(dev);
-
-	clk_disable_unprepare(sdma->clk);
-	return 0;
-}
-
-static int __maybe_unused sirfsoc_dma_runtime_resume(struct device *dev)
-{
-	struct sirfsoc_dma *sdma = dev_get_drvdata(dev);
-	int ret;
-
-	ret = clk_prepare_enable(sdma->clk);
-	if (ret < 0) {
-		dev_err(dev, "clk_enable failed: %d\n", ret);
-		return ret;
-	}
-	return 0;
-}
-
-static int __maybe_unused sirfsoc_dma_pm_suspend(struct device *dev)
-{
-	struct sirfsoc_dma *sdma = dev_get_drvdata(dev);
-	struct sirfsoc_dma_regs *save = &sdma->regs_save;
-	struct sirfsoc_dma_chan *schan;
-	int ch;
-	int ret;
-	int count;
-	u32 int_offset;
-
-	/*
-	 * if we were runtime-suspended before, resume to enable clock
-	 * before accessing register
-	 */
-	if (pm_runtime_status_suspended(dev)) {
-		ret = sirfsoc_dma_runtime_resume(dev);
-		if (ret < 0)
-			return ret;
-	}
-
-	if (sdma->type == SIRFSOC_DMA_VER_A7V2) {
-		count = 1;
-		int_offset = SIRFSOC_DMA_INT_EN_ATLAS7;
-	} else {
-		count = SIRFSOC_DMA_CHANNELS;
-		int_offset = SIRFSOC_DMA_INT_EN;
-	}
-
-	/*
-	 * DMA controller will lose all registers while suspending
-	 * so we need to save registers for active channels
-	 */
-	for (ch = 0; ch < count; ch++) {
-		schan = &sdma->channels[ch];
-		if (list_empty(&schan->active))
-			continue;
-		save->ctrl[ch] = readl_relaxed(sdma->base +
-			ch * 0x10 + SIRFSOC_DMA_CH_CTRL);
-	}
-	save->interrupt_en = readl_relaxed(sdma->base + int_offset);
-
-	/* Disable clock */
-	sirfsoc_dma_runtime_suspend(dev);
-
-	return 0;
-}
-
-static int __maybe_unused sirfsoc_dma_pm_resume(struct device *dev)
-{
-	struct sirfsoc_dma *sdma = dev_get_drvdata(dev);
-	struct sirfsoc_dma_regs *save = &sdma->regs_save;
-	struct sirfsoc_dma_desc *sdesc;
-	struct sirfsoc_dma_chan *schan;
-	int ch;
-	int ret;
-	int count;
-	u32 int_offset;
-	u32 width_offset;
-
-	/* Enable clock before accessing register */
-	ret = sirfsoc_dma_runtime_resume(dev);
-	if (ret < 0)
-		return ret;
-
-	if (sdma->type == SIRFSOC_DMA_VER_A7V2) {
-		count = 1;
-		int_offset = SIRFSOC_DMA_INT_EN_ATLAS7;
-		width_offset = SIRFSOC_DMA_WIDTH_ATLAS7;
-	} else {
-		count = SIRFSOC_DMA_CHANNELS;
-		int_offset = SIRFSOC_DMA_INT_EN;
-		width_offset = SIRFSOC_DMA_WIDTH_0;
-	}
-
-	writel_relaxed(save->interrupt_en, sdma->base + int_offset);
-	for (ch = 0; ch < count; ch++) {
-		schan = &sdma->channels[ch];
-		if (list_empty(&schan->active))
-			continue;
-		sdesc = list_first_entry(&schan->active,
-			struct sirfsoc_dma_desc,
-			node);
-		writel_relaxed(sdesc->width,
-			sdma->base + width_offset + ch * 4);
-		writel_relaxed(sdesc->xlen,
-			sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_XLEN);
-		writel_relaxed(sdesc->ylen,
-			sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_YLEN);
-		writel_relaxed(save->ctrl[ch],
-			sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_CTRL);
-		if (sdma->type == SIRFSOC_DMA_VER_A7V2) {
-			writel_relaxed(sdesc->addr,
-				sdma->base + SIRFSOC_DMA_CH_ADDR);
-		} else {
-			writel_relaxed(sdesc->addr >> 2,
-				sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_ADDR);
-
-		}
-	}
-
-	/* if we were runtime-suspended before, suspend again */
-	if (pm_runtime_status_suspended(dev))
-		sirfsoc_dma_runtime_suspend(dev);
-
-	return 0;
-}
-
-static const struct dev_pm_ops sirfsoc_dma_pm_ops = {
-	SET_RUNTIME_PM_OPS(sirfsoc_dma_runtime_suspend, sirfsoc_dma_runtime_resume, NULL)
-	SET_SYSTEM_SLEEP_PM_OPS(sirfsoc_dma_pm_suspend, sirfsoc_dma_pm_resume)
-};
-
-static struct sirfsoc_dmadata sirfsoc_dmadata_a6 = {
-	.exec = sirfsoc_dma_execute_hw_a6,
-	.type = SIRFSOC_DMA_VER_A6,
-};
-
-static struct sirfsoc_dmadata sirfsoc_dmadata_a7v1 = {
-	.exec = sirfsoc_dma_execute_hw_a7v1,
-	.type = SIRFSOC_DMA_VER_A7V1,
-};
-
-static struct sirfsoc_dmadata sirfsoc_dmadata_a7v2 = {
-	.exec = sirfsoc_dma_execute_hw_a7v2,
-	.type = SIRFSOC_DMA_VER_A7V2,
-};
-
-static const struct of_device_id sirfsoc_dma_match[] = {
-	{ .compatible = "sirf,prima2-dmac", .data = &sirfsoc_dmadata_a6,},
-	{ .compatible = "sirf,atlas7-dmac", .data = &sirfsoc_dmadata_a7v1,},
-	{ .compatible = "sirf,atlas7-dmac-v2", .data = &sirfsoc_dmadata_a7v2,},
-	{},
-};
-MODULE_DEVICE_TABLE(of, sirfsoc_dma_match);
-
-static struct platform_driver sirfsoc_dma_driver = {
-	.probe		= sirfsoc_dma_probe,
-	.remove		= sirfsoc_dma_remove,
-	.driver = {
-		.name = DRV_NAME,
-		.pm = &sirfsoc_dma_pm_ops,
-		.of_match_table	= sirfsoc_dma_match,
-	},
-};
-
-static __init int sirfsoc_dma_init(void)
-{
-	return platform_driver_register(&sirfsoc_dma_driver);
-}
-
-static void __exit sirfsoc_dma_exit(void)
-{
-	platform_driver_unregister(&sirfsoc_dma_driver);
-}
-
-subsys_initcall(sirfsoc_dma_init);
-module_exit(sirfsoc_dma_exit);
-
-MODULE_AUTHOR("Rongjun Ying <rongjun.ying@csr.com>");
-MODULE_AUTHOR("Barry Song <baohua.song@csr.com>");
-MODULE_DESCRIPTION("SIRFSOC DMA control driver");
-MODULE_LICENSE("GPL v2");
diff --git a/include/linux/sirfsoc_dma.h b/include/linux/sirfsoc_dma.h
deleted file mode 100644
index 50161b6afb61..000000000000
--- a/include/linux/sirfsoc_dma.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _SIRFSOC_DMA_H_
-#define _SIRFSOC_DMA_H_
-
-bool sirfsoc_dma_filter_id(struct dma_chan *chan, void *chan_id);
-
-#endif
-- 
cgit v1.2.3


From a033a74e8b66336fc2ea379842be6bcf176cbfbc Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 20 Jan 2021 14:18:59 +0100
Subject: dmaengine: remove coh901318 driver

The ST-Ericsson U300 platform is getting removed, so this driver is no
longer needed.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Cc: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20210120131859.2056308-4-arnd@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 .../devicetree/bindings/dma/ste-coh901318.txt      |   32 -
 drivers/dma/Kconfig                                |    7 -
 drivers/dma/Makefile                               |    1 -
 drivers/dma/coh901318.c                            | 2808 --------------------
 drivers/dma/coh901318.h                            |  141 -
 drivers/dma/coh901318_lli.c                        |  313 ---
 include/linux/platform_data/dma-coh901318.h        |   72 -
 7 files changed, 3374 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/dma/ste-coh901318.txt
 delete mode 100644 drivers/dma/coh901318.c
 delete mode 100644 drivers/dma/coh901318.h
 delete mode 100644 drivers/dma/coh901318_lli.c
 delete mode 100644 include/linux/platform_data/dma-coh901318.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/dma/ste-coh901318.txt b/Documentation/devicetree/bindings/dma/ste-coh901318.txt
deleted file mode 100644
index 091ad057e9cf..000000000000
--- a/Documentation/devicetree/bindings/dma/ste-coh901318.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-ST-Ericsson COH 901 318 DMA Controller
-
-This is a DMA controller which has begun as a fork of the
-ARM PL08x PrimeCell VHDL code.
-
-Required properties:
-- compatible: should be "stericsson,coh901318"
-- reg: register locations and length
-- interrupts: the single DMA IRQ
-- #dma-cells: must be set to <1>, as the channels on the
-  COH 901 318 are simple and identified by a single number
-- dma-channels: the number of DMA channels handled
-
-Example:
-
-dmac: dma-controller@c00020000 {
-	compatible = "stericsson,coh901318";
-	reg = <0xc0020000 0x1000>;
-	interrupt-parent = <&vica>;
-	interrupts = <2>;
-	#dma-cells = <1>;
-	dma-channels = <40>;
-};
-
-Consumers example:
-
-uart0: serial@c0013000 {
-	compatible = "...";
-	(...)
-	dmas = <&dmac 17 &dmac 18>;
-	dma-names = "tx", "rx";
-};
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 4fd6a90cfadf..725cda4aa2a3 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -124,13 +124,6 @@ config BCM_SBA_RAID
 	  has the capability to offload memcpy, xor and pq computation
 	  for raid5/6.
 
-config COH901318
-	bool "ST-Ericsson COH901318 DMA support"
-	select DMA_ENGINE
-	depends on ARCH_U300 || COMPILE_TEST
-	help
-	  Enable support for ST-Ericsson COH 901 318 DMA.
-
 config DMA_BCM2835
 	tristate "BCM2835 DMA engine support"
 	depends on ARCH_BCM2835
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 4aed5ea59922..aa69094e3547 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -20,7 +20,6 @@ obj-$(CONFIG_AT_HDMAC) += at_hdmac.o
 obj-$(CONFIG_AT_XDMAC) += at_xdmac.o
 obj-$(CONFIG_AXI_DMAC) += dma-axi-dmac.o
 obj-$(CONFIG_BCM_SBA_RAID) += bcm-sba-raid.o
-obj-$(CONFIG_COH901318) += coh901318.o coh901318_lli.o
 obj-$(CONFIG_DMA_BCM2835) += bcm2835-dma.o
 obj-$(CONFIG_DMA_JZ4780) += dma-jz4780.o
 obj-$(CONFIG_DMA_SA11X0) += sa11x0-dma.o
diff --git a/drivers/dma/coh901318.c b/drivers/dma/coh901318.c
deleted file mode 100644
index 95b9b2f5358e..000000000000
--- a/drivers/dma/coh901318.c
+++ /dev/null
@@ -1,2808 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * driver/dma/coh901318.c
- *
- * Copyright (C) 2007-2009 ST-Ericsson
- * DMA driver for COH 901 318
- * Author: Per Friden <per.friden@stericsson.com>
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/kernel.h> /* printk() */
-#include <linux/fs.h> /* everything... */
-#include <linux/scatterlist.h>
-#include <linux/slab.h> /* kmalloc() */
-#include <linux/dmaengine.h>
-#include <linux/platform_device.h>
-#include <linux/device.h>
-#include <linux/irqreturn.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-#include <linux/debugfs.h>
-#include <linux/platform_data/dma-coh901318.h>
-#include <linux/of_dma.h>
-
-#include "coh901318.h"
-#include "dmaengine.h"
-
-#define COH901318_MOD32_MASK					(0x1F)
-#define COH901318_WORD_MASK					(0xFFFFFFFF)
-/* INT_STATUS - Interrupt Status Registers 32bit (R/-) */
-#define COH901318_INT_STATUS1					(0x0000)
-#define COH901318_INT_STATUS2					(0x0004)
-/* TC_INT_STATUS - Terminal Count Interrupt Status Registers 32bit (R/-) */
-#define COH901318_TC_INT_STATUS1				(0x0008)
-#define COH901318_TC_INT_STATUS2				(0x000C)
-/* TC_INT_CLEAR - Terminal Count Interrupt Clear Registers 32bit (-/W) */
-#define COH901318_TC_INT_CLEAR1					(0x0010)
-#define COH901318_TC_INT_CLEAR2					(0x0014)
-/* RAW_TC_INT_STATUS - Raw Term Count Interrupt Status Registers 32bit (R/-) */
-#define COH901318_RAW_TC_INT_STATUS1				(0x0018)
-#define COH901318_RAW_TC_INT_STATUS2				(0x001C)
-/* BE_INT_STATUS - Bus Error Interrupt Status Registers 32bit (R/-) */
-#define COH901318_BE_INT_STATUS1				(0x0020)
-#define COH901318_BE_INT_STATUS2				(0x0024)
-/* BE_INT_CLEAR - Bus Error Interrupt Clear Registers 32bit (-/W) */
-#define COH901318_BE_INT_CLEAR1					(0x0028)
-#define COH901318_BE_INT_CLEAR2					(0x002C)
-/* RAW_BE_INT_STATUS - Raw Term Count Interrupt Status Registers 32bit (R/-) */
-#define COH901318_RAW_BE_INT_STATUS1				(0x0030)
-#define COH901318_RAW_BE_INT_STATUS2				(0x0034)
-
-/*
- * CX_CFG - Channel Configuration Registers 32bit (R/W)
- */
-#define COH901318_CX_CFG					(0x0100)
-#define COH901318_CX_CFG_SPACING				(0x04)
-/* Channel enable activates tha dma job */
-#define COH901318_CX_CFG_CH_ENABLE				(0x00000001)
-#define COH901318_CX_CFG_CH_DISABLE				(0x00000000)
-/* Request Mode */
-#define COH901318_CX_CFG_RM_MASK				(0x00000006)
-#define COH901318_CX_CFG_RM_MEMORY_TO_MEMORY			(0x0 << 1)
-#define COH901318_CX_CFG_RM_PRIMARY_TO_MEMORY			(0x1 << 1)
-#define COH901318_CX_CFG_RM_MEMORY_TO_PRIMARY			(0x1 << 1)
-#define COH901318_CX_CFG_RM_PRIMARY_TO_SECONDARY		(0x3 << 1)
-#define COH901318_CX_CFG_RM_SECONDARY_TO_PRIMARY		(0x3 << 1)
-/* Linked channel request field. RM must == 11 */
-#define COH901318_CX_CFG_LCRF_SHIFT				3
-#define COH901318_CX_CFG_LCRF_MASK				(0x000001F8)
-#define COH901318_CX_CFG_LCR_DISABLE				(0x00000000)
-/* Terminal Counter Interrupt Request Mask */
-#define COH901318_CX_CFG_TC_IRQ_ENABLE				(0x00000200)
-#define COH901318_CX_CFG_TC_IRQ_DISABLE				(0x00000000)
-/* Bus Error interrupt Mask */
-#define COH901318_CX_CFG_BE_IRQ_ENABLE				(0x00000400)
-#define COH901318_CX_CFG_BE_IRQ_DISABLE				(0x00000000)
-
-/*
- * CX_STAT - Channel Status Registers 32bit (R/-)
- */
-#define COH901318_CX_STAT					(0x0200)
-#define COH901318_CX_STAT_SPACING				(0x04)
-#define COH901318_CX_STAT_RBE_IRQ_IND				(0x00000008)
-#define COH901318_CX_STAT_RTC_IRQ_IND				(0x00000004)
-#define COH901318_CX_STAT_ACTIVE				(0x00000002)
-#define COH901318_CX_STAT_ENABLED				(0x00000001)
-
-/*
- * CX_CTRL - Channel Control Registers 32bit (R/W)
- */
-#define COH901318_CX_CTRL					(0x0400)
-#define COH901318_CX_CTRL_SPACING				(0x10)
-/* Transfer Count Enable */
-#define COH901318_CX_CTRL_TC_ENABLE				(0x00001000)
-#define COH901318_CX_CTRL_TC_DISABLE				(0x00000000)
-/* Transfer Count Value 0 - 4095 */
-#define COH901318_CX_CTRL_TC_VALUE_MASK				(0x00000FFF)
-/* Burst count */
-#define COH901318_CX_CTRL_BURST_COUNT_MASK			(0x0000E000)
-#define COH901318_CX_CTRL_BURST_COUNT_64_BYTES			(0x7 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_48_BYTES			(0x6 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_32_BYTES			(0x5 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_16_BYTES			(0x4 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_8_BYTES			(0x3 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_4_BYTES			(0x2 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_2_BYTES			(0x1 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_1_BYTE			(0x0 << 13)
-/* Source bus size  */
-#define COH901318_CX_CTRL_SRC_BUS_SIZE_MASK			(0x00030000)
-#define COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS			(0x2 << 16)
-#define COH901318_CX_CTRL_SRC_BUS_SIZE_16_BITS			(0x1 << 16)
-#define COH901318_CX_CTRL_SRC_BUS_SIZE_8_BITS			(0x0 << 16)
-/* Source address increment */
-#define COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE			(0x00040000)
-#define COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE			(0x00000000)
-/* Destination Bus Size */
-#define COH901318_CX_CTRL_DST_BUS_SIZE_MASK			(0x00180000)
-#define COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS			(0x2 << 19)
-#define COH901318_CX_CTRL_DST_BUS_SIZE_16_BITS			(0x1 << 19)
-#define COH901318_CX_CTRL_DST_BUS_SIZE_8_BITS			(0x0 << 19)
-/* Destination address increment */
-#define COH901318_CX_CTRL_DST_ADDR_INC_ENABLE			(0x00200000)
-#define COH901318_CX_CTRL_DST_ADDR_INC_DISABLE			(0x00000000)
-/* Master Mode (Master2 is only connected to MSL) */
-#define COH901318_CX_CTRL_MASTER_MODE_MASK			(0x00C00000)
-#define COH901318_CX_CTRL_MASTER_MODE_M2R_M1W			(0x3 << 22)
-#define COH901318_CX_CTRL_MASTER_MODE_M1R_M2W			(0x2 << 22)
-#define COH901318_CX_CTRL_MASTER_MODE_M2RW			(0x1 << 22)
-#define COH901318_CX_CTRL_MASTER_MODE_M1RW			(0x0 << 22)
-/* Terminal Count flag to PER enable */
-#define COH901318_CX_CTRL_TCP_ENABLE				(0x01000000)
-#define COH901318_CX_CTRL_TCP_DISABLE				(0x00000000)
-/* Terminal Count flags to CPU enable */
-#define COH901318_CX_CTRL_TC_IRQ_ENABLE				(0x02000000)
-#define COH901318_CX_CTRL_TC_IRQ_DISABLE			(0x00000000)
-/* Hand shake to peripheral */
-#define COH901318_CX_CTRL_HSP_ENABLE				(0x04000000)
-#define COH901318_CX_CTRL_HSP_DISABLE				(0x00000000)
-#define COH901318_CX_CTRL_HSS_ENABLE				(0x08000000)
-#define COH901318_CX_CTRL_HSS_DISABLE				(0x00000000)
-/* DMA mode */
-#define COH901318_CX_CTRL_DDMA_MASK				(0x30000000)
-#define COH901318_CX_CTRL_DDMA_LEGACY				(0x0 << 28)
-#define COH901318_CX_CTRL_DDMA_DEMAND_DMA1			(0x1 << 28)
-#define COH901318_CX_CTRL_DDMA_DEMAND_DMA2			(0x2 << 28)
-/* Primary Request Data Destination */
-#define COH901318_CX_CTRL_PRDD_MASK				(0x40000000)
-#define COH901318_CX_CTRL_PRDD_DEST				(0x1 << 30)
-#define COH901318_CX_CTRL_PRDD_SOURCE				(0x0 << 30)
-
-/*
- * CX_SRC_ADDR - Channel Source Address Registers 32bit (R/W)
- */
-#define COH901318_CX_SRC_ADDR					(0x0404)
-#define COH901318_CX_SRC_ADDR_SPACING				(0x10)
-
-/*
- * CX_DST_ADDR - Channel Destination Address Registers 32bit R/W
- */
-#define COH901318_CX_DST_ADDR					(0x0408)
-#define COH901318_CX_DST_ADDR_SPACING				(0x10)
-
-/*
- * CX_LNK_ADDR - Channel Link Address Registers 32bit (R/W)
- */
-#define COH901318_CX_LNK_ADDR					(0x040C)
-#define COH901318_CX_LNK_ADDR_SPACING				(0x10)
-#define COH901318_CX_LNK_LINK_IMMEDIATE				(0x00000001)
-
-/**
- * struct coh901318_params - parameters for DMAC configuration
- * @config: DMA config register
- * @ctrl_lli_last: DMA control register for the last lli in the list
- * @ctrl_lli: DMA control register for an lli
- * @ctrl_lli_chained: DMA control register for a chained lli
- */
-struct coh901318_params {
-	u32 config;
-	u32 ctrl_lli_last;
-	u32 ctrl_lli;
-	u32 ctrl_lli_chained;
-};
-
-/**
- * struct coh_dma_channel - dma channel base
- * @name: ascii name of dma channel
- * @number: channel id number
- * @desc_nbr_max: number of preallocated descriptors
- * @priority_high: prio of channel, 0 low otherwise high.
- * @param: configuration parameters
- */
-struct coh_dma_channel {
-	const char name[32];
-	const int number;
-	const int desc_nbr_max;
-	const int priority_high;
-	const struct coh901318_params param;
-};
-
-/**
- * struct powersave - DMA power save structure
- * @lock: lock protecting data in this struct
- * @started_channels: bit mask indicating active dma channels
- */
-struct powersave {
-	spinlock_t lock;
-	u64 started_channels;
-};
-
-/* points out all dma slave channels.
- * Syntax is [A1, B1, A2, B2, .... ,-1,-1]
- * Select all channels from A to B, end of list is marked with -1,-1
- */
-static int dma_slave_channels[] = {
-	U300_DMA_MSL_TX_0, U300_DMA_SPI_RX,
-	U300_DMA_UART1_TX, U300_DMA_UART1_RX, -1, -1};
-
-/* points out all dma memcpy channels. */
-static int dma_memcpy_channels[] = {
-	U300_DMA_GENERAL_PURPOSE_0, U300_DMA_GENERAL_PURPOSE_8, -1, -1};
-
-#define flags_memcpy_config (COH901318_CX_CFG_CH_DISABLE | \
-			COH901318_CX_CFG_RM_MEMORY_TO_MEMORY | \
-			COH901318_CX_CFG_LCR_DISABLE | \
-			COH901318_CX_CFG_TC_IRQ_ENABLE | \
-			COH901318_CX_CFG_BE_IRQ_ENABLE)
-#define flags_memcpy_lli_chained (COH901318_CX_CTRL_TC_ENABLE | \
-			COH901318_CX_CTRL_BURST_COUNT_32_BYTES | \
-			COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS | \
-			COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE | \
-			COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS | \
-			COH901318_CX_CTRL_DST_ADDR_INC_ENABLE | \
-			COH901318_CX_CTRL_MASTER_MODE_M1RW | \
-			COH901318_CX_CTRL_TCP_DISABLE | \
-			COH901318_CX_CTRL_TC_IRQ_DISABLE | \
-			COH901318_CX_CTRL_HSP_DISABLE | \
-			COH901318_CX_CTRL_HSS_DISABLE | \
-			COH901318_CX_CTRL_DDMA_LEGACY | \
-			COH901318_CX_CTRL_PRDD_SOURCE)
-#define flags_memcpy_lli (COH901318_CX_CTRL_TC_ENABLE | \
-			COH901318_CX_CTRL_BURST_COUNT_32_BYTES | \
-			COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS | \
-			COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE | \
-			COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS | \
-			COH901318_CX_CTRL_DST_ADDR_INC_ENABLE | \
-			COH901318_CX_CTRL_MASTER_MODE_M1RW | \
-			COH901318_CX_CTRL_TCP_DISABLE | \
-			COH901318_CX_CTRL_TC_IRQ_DISABLE | \
-			COH901318_CX_CTRL_HSP_DISABLE | \
-			COH901318_CX_CTRL_HSS_DISABLE | \
-			COH901318_CX_CTRL_DDMA_LEGACY | \
-			COH901318_CX_CTRL_PRDD_SOURCE)
-#define flags_memcpy_lli_last (COH901318_CX_CTRL_TC_ENABLE | \
-			COH901318_CX_CTRL_BURST_COUNT_32_BYTES | \
-			COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS | \
-			COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE | \
-			COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS | \
-			COH901318_CX_CTRL_DST_ADDR_INC_ENABLE | \
-			COH901318_CX_CTRL_MASTER_MODE_M1RW | \
-			COH901318_CX_CTRL_TCP_DISABLE | \
-			COH901318_CX_CTRL_TC_IRQ_ENABLE | \
-			COH901318_CX_CTRL_HSP_DISABLE | \
-			COH901318_CX_CTRL_HSS_DISABLE | \
-			COH901318_CX_CTRL_DDMA_LEGACY | \
-			COH901318_CX_CTRL_PRDD_SOURCE)
-
-static const struct coh_dma_channel chan_config[U300_DMA_CHANNELS] = {
-	{
-		.number = U300_DMA_MSL_TX_0,
-		.name = "MSL TX 0",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_MSL_TX_1,
-		.name = "MSL TX 1",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-	},
-	{
-		.number = U300_DMA_MSL_TX_2,
-		.name = "MSL TX 2",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.desc_nbr_max = 10,
-	},
-	{
-		.number = U300_DMA_MSL_TX_3,
-		.name = "MSL TX 3",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-	},
-	{
-		.number = U300_DMA_MSL_TX_4,
-		.name = "MSL TX 4",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-	},
-	{
-		.number = U300_DMA_MSL_TX_5,
-		.name = "MSL TX 5",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_MSL_TX_6,
-		.name = "MSL TX 6",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_MSL_RX_0,
-		.name = "MSL RX 0",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_MSL_RX_1,
-		.name = "MSL RX 1",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_MSL_RX_2,
-		.name = "MSL RX 2",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_MSL_RX_3,
-		.name = "MSL RX 3",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_MSL_RX_4,
-		.name = "MSL RX 4",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_MSL_RX_5,
-		.name = "MSL RX 5",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_MSL_RX_6,
-		.name = "MSL RX 6",
-		.priority_high = 0,
-	},
-	/*
-	 * Don't set up device address, burst count or size of src
-	 * or dst bus for this peripheral - handled by PrimeCell
-	 * DMA extension.
-	 */
-	{
-		.number = U300_DMA_MMCSD_RX_TX,
-		.name = "MMCSD RX TX",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-
-	},
-	{
-		.number = U300_DMA_MSPRO_TX,
-		.name = "MSPRO TX",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_MSPRO_RX,
-		.name = "MSPRO RX",
-		.priority_high = 0,
-	},
-	/*
-	 * Don't set up device address, burst count or size of src
-	 * or dst bus for this peripheral - handled by PrimeCell
-	 * DMA extension.
-	 */
-	{
-		.number = U300_DMA_UART0_TX,
-		.name = "UART0 TX",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-	},
-	{
-		.number = U300_DMA_UART0_RX,
-		.name = "UART0 RX",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-	},
-	{
-		.number = U300_DMA_APEX_TX,
-		.name = "APEX TX",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_APEX_RX,
-		.name = "APEX RX",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_PCM_I2S0_TX,
-		.name = "PCM I2S0 TX",
-		.priority_high = 1,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-	},
-	{
-		.number = U300_DMA_PCM_I2S0_RX,
-		.name = "PCM I2S0 RX",
-		.priority_high = 1,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_PCM_I2S1_TX,
-		.name = "PCM I2S1 TX",
-		.priority_high = 1,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-	},
-	{
-		.number = U300_DMA_PCM_I2S1_RX,
-		.name = "PCM I2S1 RX",
-		.priority_high = 1,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_XGAM_CDI,
-		.name = "XGAM CDI",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_XGAM_PDI,
-		.name = "XGAM PDI",
-		.priority_high = 0,
-	},
-	/*
-	 * Don't set up device address, burst count or size of src
-	 * or dst bus for this peripheral - handled by PrimeCell
-	 * DMA extension.
-	 */
-	{
-		.number = U300_DMA_SPI_TX,
-		.name = "SPI TX",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-	},
-	{
-		.number = U300_DMA_SPI_RX,
-		.name = "SPI RX",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_0,
-		.name = "GENERAL 00",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_1,
-		.name = "GENERAL 01",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_2,
-		.name = "GENERAL 02",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_3,
-		.name = "GENERAL 03",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_4,
-		.name = "GENERAL 04",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_5,
-		.name = "GENERAL 05",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_6,
-		.name = "GENERAL 06",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_7,
-		.name = "GENERAL 07",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_8,
-		.name = "GENERAL 08",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_UART1_TX,
-		.name = "UART1 TX",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_UART1_RX,
-		.name = "UART1 RX",
-		.priority_high = 0,
-	}
-};
-
-#define COHC_2_DEV(cohc) (&cohc->chan.dev->device)
-
-#ifdef VERBOSE_DEBUG
-#define COH_DBG(x) ({ if (1) x; 0; })
-#else
-#define COH_DBG(x) ({ if (0) x; 0; })
-#endif
-
-struct coh901318_desc {
-	struct dma_async_tx_descriptor desc;
-	struct list_head node;
-	struct scatterlist *sg;
-	unsigned int sg_len;
-	struct coh901318_lli *lli;
-	enum dma_transfer_direction dir;
-	unsigned long flags;
-	u32 head_config;
-	u32 head_ctrl;
-};
-
-struct coh901318_base {
-	struct device *dev;
-	void __iomem *virtbase;
-	unsigned int irq;
-	struct coh901318_pool pool;
-	struct powersave pm;
-	struct dma_device dma_slave;
-	struct dma_device dma_memcpy;
-	struct coh901318_chan *chans;
-};
-
-struct coh901318_chan {
-	spinlock_t lock;
-	int allocated;
-	int id;
-	int stopped;
-
-	struct work_struct free_work;
-	struct dma_chan chan;
-
-	struct tasklet_struct tasklet;
-
-	struct list_head active;
-	struct list_head queue;
-	struct list_head free;
-
-	unsigned long nbr_active_done;
-	unsigned long busy;
-
-	struct dma_slave_config config;
-	u32 addr;
-	u32 ctrl;
-
-	struct coh901318_base *base;
-};
-
-static void coh901318_list_print(struct coh901318_chan *cohc,
-				 struct coh901318_lli *lli)
-{
-	struct coh901318_lli *l = lli;
-	int i = 0;
-
-	while (l) {
-		dev_vdbg(COHC_2_DEV(cohc), "i %d, lli %p, ctrl 0x%x, src %pad"
-			 ", dst %pad, link %pad virt_link_addr 0x%p\n",
-			 i, l, l->control, &l->src_addr, &l->dst_addr,
-			 &l->link_addr, l->virt_link_addr);
-		i++;
-		l = l->virt_link_addr;
-	}
-}
-
-#ifdef CONFIG_DEBUG_FS
-
-#define COH901318_DEBUGFS_ASSIGN(x, y) (x = y)
-
-static struct coh901318_base *debugfs_dma_base;
-static struct dentry *dma_dentry;
-
-static ssize_t coh901318_debugfs_read(struct file *file, char __user *buf,
-				  size_t count, loff_t *f_pos)
-{
-	u64 started_channels = debugfs_dma_base->pm.started_channels;
-	int pool_count = debugfs_dma_base->pool.debugfs_pool_counter;
-	char *dev_buf;
-	char *tmp;
-	int ret;
-	int i;
-
-	dev_buf = kmalloc(4*1024, GFP_KERNEL);
-	if (dev_buf == NULL)
-		return -ENOMEM;
-	tmp = dev_buf;
-
-	tmp += sprintf(tmp, "DMA -- enabled dma channels\n");
-
-	for (i = 0; i < U300_DMA_CHANNELS; i++) {
-		if (started_channels & (1ULL << i))
-			tmp += sprintf(tmp, "channel %d\n", i);
-	}
-
-	tmp += sprintf(tmp, "Pool alloc nbr %d\n", pool_count);
-
-	ret = simple_read_from_buffer(buf, count, f_pos, dev_buf, 
-					tmp - dev_buf);
-	kfree(dev_buf);
-	return ret;
-}
-
-static const struct file_operations coh901318_debugfs_status_operations = {
-	.open		= simple_open,
-	.read		= coh901318_debugfs_read,
-	.llseek		= default_llseek,
-};
-
-
-static int __init init_coh901318_debugfs(void)
-{
-
-	dma_dentry = debugfs_create_dir("dma", NULL);
-
-	debugfs_create_file("status", S_IFREG | S_IRUGO, dma_dentry, NULL,
-			    &coh901318_debugfs_status_operations);
-	return 0;
-}
-
-static void __exit exit_coh901318_debugfs(void)
-{
-	debugfs_remove_recursive(dma_dentry);
-}
-
-module_init(init_coh901318_debugfs);
-module_exit(exit_coh901318_debugfs);
-#else
-
-#define COH901318_DEBUGFS_ASSIGN(x, y)
-
-#endif /* CONFIG_DEBUG_FS */
-
-static inline struct coh901318_chan *to_coh901318_chan(struct dma_chan *chan)
-{
-	return container_of(chan, struct coh901318_chan, chan);
-}
-
-static int coh901318_dma_set_runtimeconfig(struct dma_chan *chan,
-					   struct dma_slave_config *config,
-					   enum dma_transfer_direction direction);
-
-static inline const struct coh901318_params *
-cohc_chan_param(struct coh901318_chan *cohc)
-{
-	return &chan_config[cohc->id].param;
-}
-
-static inline const struct coh_dma_channel *
-cohc_chan_conf(struct coh901318_chan *cohc)
-{
-	return &chan_config[cohc->id];
-}
-
-static void enable_powersave(struct coh901318_chan *cohc)
-{
-	unsigned long flags;
-	struct powersave *pm = &cohc->base->pm;
-
-	spin_lock_irqsave(&pm->lock, flags);
-
-	pm->started_channels &= ~(1ULL << cohc->id);
-
-	spin_unlock_irqrestore(&pm->lock, flags);
-}
-static void disable_powersave(struct coh901318_chan *cohc)
-{
-	unsigned long flags;
-	struct powersave *pm = &cohc->base->pm;
-
-	spin_lock_irqsave(&pm->lock, flags);
-
-	pm->started_channels |= (1ULL << cohc->id);
-
-	spin_unlock_irqrestore(&pm->lock, flags);
-}
-
-static inline int coh901318_set_ctrl(struct coh901318_chan *cohc, u32 control)
-{
-	int channel = cohc->id;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	writel(control,
-	       virtbase + COH901318_CX_CTRL +
-	       COH901318_CX_CTRL_SPACING * channel);
-	return 0;
-}
-
-static inline int coh901318_set_conf(struct coh901318_chan *cohc, u32 conf)
-{
-	int channel = cohc->id;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	writel(conf,
-	       virtbase + COH901318_CX_CFG +
-	       COH901318_CX_CFG_SPACING*channel);
-	return 0;
-}
-
-
-static int coh901318_start(struct coh901318_chan *cohc)
-{
-	u32 val;
-	int channel = cohc->id;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	disable_powersave(cohc);
-
-	val = readl(virtbase + COH901318_CX_CFG +
-		    COH901318_CX_CFG_SPACING * channel);
-
-	/* Enable channel */
-	val |= COH901318_CX_CFG_CH_ENABLE;
-	writel(val, virtbase + COH901318_CX_CFG +
-	       COH901318_CX_CFG_SPACING * channel);
-
-	return 0;
-}
-
-static int coh901318_prep_linked_list(struct coh901318_chan *cohc,
-				      struct coh901318_lli *lli)
-{
-	int channel = cohc->id;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	BUG_ON(readl(virtbase + COH901318_CX_STAT +
-		     COH901318_CX_STAT_SPACING*channel) &
-	       COH901318_CX_STAT_ACTIVE);
-
-	writel(lli->src_addr,
-	       virtbase + COH901318_CX_SRC_ADDR +
-	       COH901318_CX_SRC_ADDR_SPACING * channel);
-
-	writel(lli->dst_addr, virtbase +
-	       COH901318_CX_DST_ADDR +
-	       COH901318_CX_DST_ADDR_SPACING * channel);
-
-	writel(lli->link_addr, virtbase + COH901318_CX_LNK_ADDR +
-	       COH901318_CX_LNK_ADDR_SPACING * channel);
-
-	writel(lli->control, virtbase + COH901318_CX_CTRL +
-	       COH901318_CX_CTRL_SPACING * channel);
-
-	return 0;
-}
-
-static struct coh901318_desc *
-coh901318_desc_get(struct coh901318_chan *cohc)
-{
-	struct coh901318_desc *desc;
-
-	if (list_empty(&cohc->free)) {
-		/* alloc new desc because we're out of used ones
-		 * TODO: alloc a pile of descs instead of just one,
-		 * avoid many small allocations.
-		 */
-		desc = kzalloc(sizeof(struct coh901318_desc), GFP_NOWAIT);
-		if (desc == NULL)
-			goto out;
-		INIT_LIST_HEAD(&desc->node);
-		dma_async_tx_descriptor_init(&desc->desc, &cohc->chan);
-	} else {
-		/* Reuse an old desc. */
-		desc = list_first_entry(&cohc->free,
-					struct coh901318_desc,
-					node);
-		list_del(&desc->node);
-		/* Initialize it a bit so it's not insane */
-		desc->sg = NULL;
-		desc->sg_len = 0;
-		desc->desc.callback = NULL;
-		desc->desc.callback_param = NULL;
-	}
-
- out:
-	return desc;
-}
-
-static void
-coh901318_desc_free(struct coh901318_chan *cohc, struct coh901318_desc *cohd)
-{
-	list_add_tail(&cohd->node, &cohc->free);
-}
-
-/* call with irq lock held */
-static void
-coh901318_desc_submit(struct coh901318_chan *cohc, struct coh901318_desc *desc)
-{
-	list_add_tail(&desc->node, &cohc->active);
-}
-
-static struct coh901318_desc *
-coh901318_first_active_get(struct coh901318_chan *cohc)
-{
-	return list_first_entry_or_null(&cohc->active, struct coh901318_desc,
-					node);
-}
-
-static void
-coh901318_desc_remove(struct coh901318_desc *cohd)
-{
-	list_del(&cohd->node);
-}
-
-static void
-coh901318_desc_queue(struct coh901318_chan *cohc, struct coh901318_desc *desc)
-{
-	list_add_tail(&desc->node, &cohc->queue);
-}
-
-static struct coh901318_desc *
-coh901318_first_queued(struct coh901318_chan *cohc)
-{
-	return list_first_entry_or_null(&cohc->queue, struct coh901318_desc,
-					node);
-}
-
-static inline u32 coh901318_get_bytes_in_lli(struct coh901318_lli *in_lli)
-{
-	struct coh901318_lli *lli = in_lli;
-	u32 bytes = 0;
-
-	while (lli) {
-		bytes += lli->control & COH901318_CX_CTRL_TC_VALUE_MASK;
-		lli = lli->virt_link_addr;
-	}
-	return bytes;
-}
-
-/*
- * Get the number of bytes left to transfer on this channel,
- * it is unwise to call this before stopping the channel for
- * absolute measures, but for a rough guess you can still call
- * it.
- */
-static u32 coh901318_get_bytes_left(struct dma_chan *chan)
-{
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	struct coh901318_desc *cohd;
-	struct list_head *pos;
-	unsigned long flags;
-	u32 left = 0;
-	int i = 0;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/*
-	 * If there are many queued jobs, we iterate and add the
-	 * size of them all. We take a special look on the first
-	 * job though, since it is probably active.
-	 */
-	list_for_each(pos, &cohc->active) {
-		/*
-		 * The first job in the list will be working on the
-		 * hardware. The job can be stopped but still active,
-		 * so that the transfer counter is somewhere inside
-		 * the buffer.
-		 */
-		cohd = list_entry(pos, struct coh901318_desc, node);
-
-		if (i == 0) {
-			struct coh901318_lli *lli;
-			dma_addr_t ladd;
-
-			/* Read current transfer count value */
-			left = readl(cohc->base->virtbase +
-				     COH901318_CX_CTRL +
-				     COH901318_CX_CTRL_SPACING * cohc->id) &
-				COH901318_CX_CTRL_TC_VALUE_MASK;
-
-			/* See if the transfer is linked... */
-			ladd = readl(cohc->base->virtbase +
-				     COH901318_CX_LNK_ADDR +
-				     COH901318_CX_LNK_ADDR_SPACING *
-				     cohc->id) &
-				~COH901318_CX_LNK_LINK_IMMEDIATE;
-			/* Single transaction */
-			if (!ladd)
-				continue;
-
-			/*
-			 * Linked transaction, follow the lli, find the
-			 * currently processing lli, and proceed to the next
-			 */
-			lli = cohd->lli;
-			while (lli && lli->link_addr != ladd)
-				lli = lli->virt_link_addr;
-
-			if (lli)
-				lli = lli->virt_link_addr;
-
-			/*
-			 * Follow remaining lli links around to count the total
-			 * number of bytes left
-			 */
-			left += coh901318_get_bytes_in_lli(lli);
-		} else {
-			left += coh901318_get_bytes_in_lli(cohd->lli);
-		}
-		i++;
-	}
-
-	/* Also count bytes in the queued jobs */
-	list_for_each(pos, &cohc->queue) {
-		cohd = list_entry(pos, struct coh901318_desc, node);
-		left += coh901318_get_bytes_in_lli(cohd->lli);
-	}
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	return left;
-}
-
-/*
- * Pauses a transfer without losing data. Enables power save.
- * Use this function in conjunction with coh901318_resume.
- */
-static int coh901318_pause(struct dma_chan *chan)
-{
-	u32 val;
-	unsigned long flags;
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	int channel = cohc->id;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/* Disable channel in HW */
-	val = readl(virtbase + COH901318_CX_CFG +
-		    COH901318_CX_CFG_SPACING * channel);
-
-	/* Stopping infinite transfer */
-	if ((val & COH901318_CX_CTRL_TC_ENABLE) == 0 &&
-	    (val & COH901318_CX_CFG_CH_ENABLE))
-		cohc->stopped = 1;
-
-
-	val &= ~COH901318_CX_CFG_CH_ENABLE;
-	/* Enable twice, HW bug work around */
-	writel(val, virtbase + COH901318_CX_CFG +
-	       COH901318_CX_CFG_SPACING * channel);
-	writel(val, virtbase + COH901318_CX_CFG +
-	       COH901318_CX_CFG_SPACING * channel);
-
-	/* Spin-wait for it to actually go inactive */
-	while (readl(virtbase + COH901318_CX_STAT+COH901318_CX_STAT_SPACING *
-		     channel) & COH901318_CX_STAT_ACTIVE)
-		cpu_relax();
-
-	/* Check if we stopped an active job */
-	if ((readl(virtbase + COH901318_CX_CTRL+COH901318_CX_CTRL_SPACING *
-		   channel) & COH901318_CX_CTRL_TC_VALUE_MASK) > 0)
-		cohc->stopped = 1;
-
-	enable_powersave(cohc);
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-	return 0;
-}
-
-/* Resumes a transfer that has been stopped via 300_dma_stop(..).
-   Power save is handled.
-*/
-static int coh901318_resume(struct dma_chan *chan)
-{
-	u32 val;
-	unsigned long flags;
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	int channel = cohc->id;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	disable_powersave(cohc);
-
-	if (cohc->stopped) {
-		/* Enable channel in HW */
-		val = readl(cohc->base->virtbase + COH901318_CX_CFG +
-			    COH901318_CX_CFG_SPACING * channel);
-
-		val |= COH901318_CX_CFG_CH_ENABLE;
-
-		writel(val, cohc->base->virtbase + COH901318_CX_CFG +
-		       COH901318_CX_CFG_SPACING*channel);
-
-		cohc->stopped = 0;
-	}
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-	return 0;
-}
-
-bool coh901318_filter_id(struct dma_chan *chan, void *chan_id)
-{
-	unsigned long ch_nr = (unsigned long) chan_id;
-
-	if (ch_nr == to_coh901318_chan(chan)->id)
-		return true;
-
-	return false;
-}
-EXPORT_SYMBOL(coh901318_filter_id);
-
-struct coh901318_filter_args {
-	struct coh901318_base *base;
-	unsigned int ch_nr;
-};
-
-static bool coh901318_filter_base_and_id(struct dma_chan *chan, void *data)
-{
-	struct coh901318_filter_args *args = data;
-
-	if (&args->base->dma_slave == chan->device &&
-	    args->ch_nr == to_coh901318_chan(chan)->id)
-		return true;
-
-	return false;
-}
-
-static struct dma_chan *coh901318_xlate(struct of_phandle_args *dma_spec,
-					struct of_dma *ofdma)
-{
-	struct coh901318_filter_args args = {
-		.base = ofdma->of_dma_data,
-		.ch_nr = dma_spec->args[0],
-	};
-	dma_cap_mask_t cap;
-	dma_cap_zero(cap);
-	dma_cap_set(DMA_SLAVE, cap);
-
-	return dma_request_channel(cap, coh901318_filter_base_and_id, &args);
-}
-/*
- * DMA channel allocation
- */
-static int coh901318_config(struct coh901318_chan *cohc,
-			    struct coh901318_params *param)
-{
-	const struct coh901318_params *p;
-	int channel = cohc->id;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	if (param)
-		p = param;
-	else
-		p = cohc_chan_param(cohc);
-
-	/* Clear any pending BE or TC interrupt */
-	if (channel < 32) {
-		writel(1 << channel, virtbase + COH901318_BE_INT_CLEAR1);
-		writel(1 << channel, virtbase + COH901318_TC_INT_CLEAR1);
-	} else {
-		writel(1 << (channel - 32), virtbase +
-		       COH901318_BE_INT_CLEAR2);
-		writel(1 << (channel - 32), virtbase +
-		       COH901318_TC_INT_CLEAR2);
-	}
-
-	coh901318_set_conf(cohc, p->config);
-	coh901318_set_ctrl(cohc, p->ctrl_lli_last);
-
-	return 0;
-}
-
-/* must lock when calling this function
- * start queued jobs, if any
- * TODO: start all queued jobs in one go
- *
- * Returns descriptor if queued job is started otherwise NULL.
- * If the queue is empty NULL is returned.
- */
-static struct coh901318_desc *coh901318_queue_start(struct coh901318_chan *cohc)
-{
-	struct coh901318_desc *cohd;
-
-	/*
-	 * start queued jobs, if any
-	 * TODO: transmit all queued jobs in one go
-	 */
-	cohd = coh901318_first_queued(cohc);
-
-	if (cohd != NULL) {
-		/* Remove from queue */
-		coh901318_desc_remove(cohd);
-		/* initiate DMA job */
-		cohc->busy = 1;
-
-		coh901318_desc_submit(cohc, cohd);
-
-		/* Program the transaction head */
-		coh901318_set_conf(cohc, cohd->head_config);
-		coh901318_set_ctrl(cohc, cohd->head_ctrl);
-		coh901318_prep_linked_list(cohc, cohd->lli);
-
-		/* start dma job on this channel */
-		coh901318_start(cohc);
-
-	}
-
-	return cohd;
-}
-
-/*
- * This tasklet is called from the interrupt handler to
- * handle each descriptor (DMA job) that is sent to a channel.
- */
-static void dma_tasklet(struct tasklet_struct *t)
-{
-	struct coh901318_chan *cohc = from_tasklet(cohc, t, tasklet);
-	struct coh901318_desc *cohd_fin;
-	unsigned long flags;
-	struct dmaengine_desc_callback cb;
-
-	dev_vdbg(COHC_2_DEV(cohc), "[%s] chan_id %d"
-		 " nbr_active_done %ld\n", __func__,
-		 cohc->id, cohc->nbr_active_done);
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/* get first active descriptor entry from list */
-	cohd_fin = coh901318_first_active_get(cohc);
-
-	if (cohd_fin == NULL)
-		goto err;
-
-	/* locate callback to client */
-	dmaengine_desc_get_callback(&cohd_fin->desc, &cb);
-
-	/* sign this job as completed on the channel */
-	dma_cookie_complete(&cohd_fin->desc);
-
-	/* release the lli allocation and remove the descriptor */
-	coh901318_lli_free(&cohc->base->pool, &cohd_fin->lli);
-
-	/* return desc to free-list */
-	coh901318_desc_remove(cohd_fin);
-	coh901318_desc_free(cohc, cohd_fin);
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	/* Call the callback when we're done */
-	dmaengine_desc_callback_invoke(&cb, NULL);
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/*
-	 * If another interrupt fired while the tasklet was scheduling,
-	 * we don't get called twice, so we have this number of active
-	 * counter that keep track of the number of IRQs expected to
-	 * be handled for this channel. If there happen to be more than
-	 * one IRQ to be ack:ed, we simply schedule this tasklet again.
-	 */
-	cohc->nbr_active_done--;
-	if (cohc->nbr_active_done) {
-		dev_dbg(COHC_2_DEV(cohc), "scheduling tasklet again, new IRQs "
-			"came in while we were scheduling this tasklet\n");
-		if (cohc_chan_conf(cohc)->priority_high)
-			tasklet_hi_schedule(&cohc->tasklet);
-		else
-			tasklet_schedule(&cohc->tasklet);
-	}
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	return;
-
- err:
-	spin_unlock_irqrestore(&cohc->lock, flags);
-	dev_err(COHC_2_DEV(cohc), "[%s] No active dma desc\n", __func__);
-}
-
-
-/* called from interrupt context */
-static void dma_tc_handle(struct coh901318_chan *cohc)
-{
-	/*
-	 * If the channel is not allocated, then we shouldn't have
-	 * any TC interrupts on it.
-	 */
-	if (!cohc->allocated) {
-		dev_err(COHC_2_DEV(cohc), "spurious interrupt from "
-			"unallocated channel\n");
-		return;
-	}
-
-	/*
-	 * When we reach this point, at least one queue item
-	 * should have been moved over from cohc->queue to
-	 * cohc->active and run to completion, that is why we're
-	 * getting a terminal count interrupt is it not?
-	 * If you get this BUG() the most probable cause is that
-	 * the individual nodes in the lli chain have IRQ enabled,
-	 * so check your platform config for lli chain ctrl.
-	 */
-	BUG_ON(list_empty(&cohc->active));
-
-	cohc->nbr_active_done++;
-
-	/*
-	 * This attempt to take a job from cohc->queue, put it
-	 * into cohc->active and start it.
-	 */
-	if (coh901318_queue_start(cohc) == NULL)
-		cohc->busy = 0;
-
-	/*
-	 * This tasklet will remove items from cohc->active
-	 * and thus terminates them.
-	 */
-	if (cohc_chan_conf(cohc)->priority_high)
-		tasklet_hi_schedule(&cohc->tasklet);
-	else
-		tasklet_schedule(&cohc->tasklet);
-}
-
-
-static irqreturn_t dma_irq_handler(int irq, void *dev_id)
-{
-	u32 status1;
-	u32 status2;
-	int i;
-	int ch;
-	struct coh901318_base *base  = dev_id;
-	struct coh901318_chan *cohc;
-	void __iomem *virtbase = base->virtbase;
-
-	status1 = readl(virtbase + COH901318_INT_STATUS1);
-	status2 = readl(virtbase + COH901318_INT_STATUS2);
-
-	if (unlikely(status1 == 0 && status2 == 0)) {
-		dev_warn(base->dev, "spurious DMA IRQ from no channel!\n");
-		return IRQ_HANDLED;
-	}
-
-	/* TODO: consider handle IRQ in tasklet here to
-	 *       minimize interrupt latency */
-
-	/* Check the first 32 DMA channels for IRQ */
-	while (status1) {
-		/* Find first bit set, return as a number. */
-		i = ffs(status1) - 1;
-		ch = i;
-
-		cohc = &base->chans[ch];
-		spin_lock(&cohc->lock);
-
-		/* Mask off this bit */
-		status1 &= ~(1 << i);
-		/* Check the individual channel bits */
-		if (test_bit(i, virtbase + COH901318_BE_INT_STATUS1)) {
-			dev_crit(COHC_2_DEV(cohc),
-				 "DMA bus error on channel %d!\n", ch);
-			BUG_ON(1);
-			/* Clear BE interrupt */
-			__set_bit(i, virtbase + COH901318_BE_INT_CLEAR1);
-		} else {
-			/* Caused by TC, really? */
-			if (unlikely(!test_bit(i, virtbase +
-					       COH901318_TC_INT_STATUS1))) {
-				dev_warn(COHC_2_DEV(cohc),
-					 "ignoring interrupt not caused by terminal count on channel %d\n", ch);
-				/* Clear TC interrupt */
-				BUG_ON(1);
-				__set_bit(i, virtbase + COH901318_TC_INT_CLEAR1);
-			} else {
-				/* Enable powersave if transfer has finished */
-				if (!(readl(virtbase + COH901318_CX_STAT +
-					    COH901318_CX_STAT_SPACING*ch) &
-				      COH901318_CX_STAT_ENABLED)) {
-					enable_powersave(cohc);
-				}
-
-				/* Must clear TC interrupt before calling
-				 * dma_tc_handle
-				 * in case tc_handle initiate a new dma job
-				 */
-				__set_bit(i, virtbase + COH901318_TC_INT_CLEAR1);
-
-				dma_tc_handle(cohc);
-			}
-		}
-		spin_unlock(&cohc->lock);
-	}
-
-	/* Check the remaining 32 DMA channels for IRQ */
-	while (status2) {
-		/* Find first bit set, return as a number. */
-		i = ffs(status2) - 1;
-		ch = i + 32;
-		cohc = &base->chans[ch];
-		spin_lock(&cohc->lock);
-
-		/* Mask off this bit */
-		status2 &= ~(1 << i);
-		/* Check the individual channel bits */
-		if (test_bit(i, virtbase + COH901318_BE_INT_STATUS2)) {
-			dev_crit(COHC_2_DEV(cohc),
-				 "DMA bus error on channel %d!\n", ch);
-			/* Clear BE interrupt */
-			BUG_ON(1);
-			__set_bit(i, virtbase + COH901318_BE_INT_CLEAR2);
-		} else {
-			/* Caused by TC, really? */
-			if (unlikely(!test_bit(i, virtbase +
-					       COH901318_TC_INT_STATUS2))) {
-				dev_warn(COHC_2_DEV(cohc),
-					 "ignoring interrupt not caused by terminal count on channel %d\n", ch);
-				/* Clear TC interrupt */
-				__set_bit(i, virtbase + COH901318_TC_INT_CLEAR2);
-				BUG_ON(1);
-			} else {
-				/* Enable powersave if transfer has finished */
-				if (!(readl(virtbase + COH901318_CX_STAT +
-					    COH901318_CX_STAT_SPACING*ch) &
-				      COH901318_CX_STAT_ENABLED)) {
-					enable_powersave(cohc);
-				}
-				/* Must clear TC interrupt before calling
-				 * dma_tc_handle
-				 * in case tc_handle initiate a new dma job
-				 */
-				__set_bit(i, virtbase + COH901318_TC_INT_CLEAR2);
-
-				dma_tc_handle(cohc);
-			}
-		}
-		spin_unlock(&cohc->lock);
-	}
-
-	return IRQ_HANDLED;
-}
-
-static int coh901318_terminate_all(struct dma_chan *chan)
-{
-	unsigned long flags;
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	struct coh901318_desc *cohd;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	/* The remainder of this function terminates the transfer */
-	coh901318_pause(chan);
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/* Clear any pending BE or TC interrupt */
-	if (cohc->id < 32) {
-		writel(1 << cohc->id, virtbase + COH901318_BE_INT_CLEAR1);
-		writel(1 << cohc->id, virtbase + COH901318_TC_INT_CLEAR1);
-	} else {
-		writel(1 << (cohc->id - 32), virtbase +
-		       COH901318_BE_INT_CLEAR2);
-		writel(1 << (cohc->id - 32), virtbase +
-		       COH901318_TC_INT_CLEAR2);
-	}
-
-	enable_powersave(cohc);
-
-	while ((cohd = coh901318_first_active_get(cohc))) {
-		/* release the lli allocation*/
-		coh901318_lli_free(&cohc->base->pool, &cohd->lli);
-
-		/* return desc to free-list */
-		coh901318_desc_remove(cohd);
-		coh901318_desc_free(cohc, cohd);
-	}
-
-	while ((cohd = coh901318_first_queued(cohc))) {
-		/* release the lli allocation*/
-		coh901318_lli_free(&cohc->base->pool, &cohd->lli);
-
-		/* return desc to free-list */
-		coh901318_desc_remove(cohd);
-		coh901318_desc_free(cohc, cohd);
-	}
-
-
-	cohc->nbr_active_done = 0;
-	cohc->busy = 0;
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	return 0;
-}
-
-static int coh901318_alloc_chan_resources(struct dma_chan *chan)
-{
-	struct coh901318_chan	*cohc = to_coh901318_chan(chan);
-	unsigned long flags;
-
-	dev_vdbg(COHC_2_DEV(cohc), "[%s] DMA channel %d\n",
-		 __func__, cohc->id);
-
-	if (chan->client_count > 1)
-		return -EBUSY;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	coh901318_config(cohc, NULL);
-
-	cohc->allocated = 1;
-	dma_cookie_init(chan);
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	return 1;
-}
-
-static void
-coh901318_free_chan_resources(struct dma_chan *chan)
-{
-	struct coh901318_chan	*cohc = to_coh901318_chan(chan);
-	int channel = cohc->id;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/* Disable HW */
-	writel(0x00000000U, cohc->base->virtbase + COH901318_CX_CFG +
-	       COH901318_CX_CFG_SPACING*channel);
-	writel(0x00000000U, cohc->base->virtbase + COH901318_CX_CTRL +
-	       COH901318_CX_CTRL_SPACING*channel);
-
-	cohc->allocated = 0;
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	coh901318_terminate_all(chan);
-}
-
-
-static dma_cookie_t
-coh901318_tx_submit(struct dma_async_tx_descriptor *tx)
-{
-	struct coh901318_desc *cohd = container_of(tx, struct coh901318_desc,
-						   desc);
-	struct coh901318_chan *cohc = to_coh901318_chan(tx->chan);
-	unsigned long flags;
-	dma_cookie_t cookie;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-	cookie = dma_cookie_assign(tx);
-
-	coh901318_desc_queue(cohc, cohd);
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	return cookie;
-}
-
-static struct dma_async_tx_descriptor *
-coh901318_prep_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
-		      size_t size, unsigned long flags)
-{
-	struct coh901318_lli *lli;
-	struct coh901318_desc *cohd;
-	unsigned long flg;
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	int lli_len;
-	u32 ctrl_last = cohc_chan_param(cohc)->ctrl_lli_last;
-	int ret;
-
-	spin_lock_irqsave(&cohc->lock, flg);
-
-	dev_vdbg(COHC_2_DEV(cohc),
-		 "[%s] channel %d src %pad dest %pad size %zu\n",
-		 __func__, cohc->id, &src, &dest, size);
-
-	if (flags & DMA_PREP_INTERRUPT)
-		/* Trigger interrupt after last lli */
-		ctrl_last |= COH901318_CX_CTRL_TC_IRQ_ENABLE;
-
-	lli_len = size >> MAX_DMA_PACKET_SIZE_SHIFT;
-	if ((lli_len << MAX_DMA_PACKET_SIZE_SHIFT) < size)
-		lli_len++;
-
-	lli = coh901318_lli_alloc(&cohc->base->pool, lli_len);
-
-	if (lli == NULL)
-		goto err;
-
-	ret = coh901318_lli_fill_memcpy(
-		&cohc->base->pool, lli, src, size, dest,
-		cohc_chan_param(cohc)->ctrl_lli_chained,
-		ctrl_last);
-	if (ret)
-		goto err;
-
-	COH_DBG(coh901318_list_print(cohc, lli));
-
-	/* Pick a descriptor to handle this transfer */
-	cohd = coh901318_desc_get(cohc);
-	cohd->lli = lli;
-	cohd->flags = flags;
-	cohd->desc.tx_submit = coh901318_tx_submit;
-
-	spin_unlock_irqrestore(&cohc->lock, flg);
-
-	return &cohd->desc;
- err:
-	spin_unlock_irqrestore(&cohc->lock, flg);
-	return NULL;
-}
-
-static struct dma_async_tx_descriptor *
-coh901318_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
-			unsigned int sg_len, enum dma_transfer_direction direction,
-			unsigned long flags, void *context)
-{
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	struct coh901318_lli *lli;
-	struct coh901318_desc *cohd;
-	const struct coh901318_params *params;
-	struct scatterlist *sg;
-	int len = 0;
-	int size;
-	int i;
-	u32 ctrl_chained = cohc_chan_param(cohc)->ctrl_lli_chained;
-	u32 ctrl = cohc_chan_param(cohc)->ctrl_lli;
-	u32 ctrl_last = cohc_chan_param(cohc)->ctrl_lli_last;
-	u32 config;
-	unsigned long flg;
-	int ret;
-
-	if (!sgl)
-		goto out;
-	if (sg_dma_len(sgl) == 0)
-		goto out;
-
-	spin_lock_irqsave(&cohc->lock, flg);
-
-	dev_vdbg(COHC_2_DEV(cohc), "[%s] sg_len %d dir %d\n",
-		 __func__, sg_len, direction);
-
-	if (flags & DMA_PREP_INTERRUPT)
-		/* Trigger interrupt after last lli */
-		ctrl_last |= COH901318_CX_CTRL_TC_IRQ_ENABLE;
-
-	params = cohc_chan_param(cohc);
-	config = params->config;
-	/*
-	 * Add runtime-specific control on top, make
-	 * sure the bits you set per peripheral channel are
-	 * cleared in the default config from the platform.
-	 */
-	ctrl_chained |= cohc->ctrl;
-	ctrl_last |= cohc->ctrl;
-	ctrl |= cohc->ctrl;
-
-	if (direction == DMA_MEM_TO_DEV) {
-		u32 tx_flags = COH901318_CX_CTRL_PRDD_SOURCE |
-			COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE;
-
-		config |= COH901318_CX_CFG_RM_MEMORY_TO_PRIMARY;
-		ctrl_chained |= tx_flags;
-		ctrl_last |= tx_flags;
-		ctrl |= tx_flags;
-	} else if (direction == DMA_DEV_TO_MEM) {
-		u32 rx_flags = COH901318_CX_CTRL_PRDD_DEST |
-			COH901318_CX_CTRL_DST_ADDR_INC_ENABLE;
-
-		config |= COH901318_CX_CFG_RM_PRIMARY_TO_MEMORY;
-		ctrl_chained |= rx_flags;
-		ctrl_last |= rx_flags;
-		ctrl |= rx_flags;
-	} else
-		goto err_direction;
-
-	/* The dma only supports transmitting packages up to
-	 * MAX_DMA_PACKET_SIZE. Calculate to total number of
-	 * dma elemts required to send the entire sg list
-	 */
-	for_each_sg(sgl, sg, sg_len, i) {
-		unsigned int factor;
-		size = sg_dma_len(sg);
-
-		if (size <= MAX_DMA_PACKET_SIZE) {
-			len++;
-			continue;
-		}
-
-		factor = size >> MAX_DMA_PACKET_SIZE_SHIFT;
-		if ((factor << MAX_DMA_PACKET_SIZE_SHIFT) < size)
-			factor++;
-
-		len += factor;
-	}
-
-	pr_debug("Allocate %d lli:s for this transfer\n", len);
-	lli = coh901318_lli_alloc(&cohc->base->pool, len);
-
-	if (lli == NULL)
-		goto err_dma_alloc;
-
-	coh901318_dma_set_runtimeconfig(chan, &cohc->config, direction);
-
-	/* initiate allocated lli list */
-	ret = coh901318_lli_fill_sg(&cohc->base->pool, lli, sgl, sg_len,
-				    cohc->addr,
-				    ctrl_chained,
-				    ctrl,
-				    ctrl_last,
-				    direction, COH901318_CX_CTRL_TC_IRQ_ENABLE);
-	if (ret)
-		goto err_lli_fill;
-
-
-	COH_DBG(coh901318_list_print(cohc, lli));
-
-	/* Pick a descriptor to handle this transfer */
-	cohd = coh901318_desc_get(cohc);
-	cohd->head_config = config;
-	/*
-	 * Set the default head ctrl for the channel to the one from the
-	 * lli, things may have changed due to odd buffer alignment
-	 * etc.
-	 */
-	cohd->head_ctrl = lli->control;
-	cohd->dir = direction;
-	cohd->flags = flags;
-	cohd->desc.tx_submit = coh901318_tx_submit;
-	cohd->lli = lli;
-
-	spin_unlock_irqrestore(&cohc->lock, flg);
-
-	return &cohd->desc;
- err_lli_fill:
- err_dma_alloc:
- err_direction:
-	spin_unlock_irqrestore(&cohc->lock, flg);
- out:
-	return NULL;
-}
-
-static enum dma_status
-coh901318_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
-		 struct dma_tx_state *txstate)
-{
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	enum dma_status ret;
-
-	ret = dma_cookie_status(chan, cookie, txstate);
-	if (ret == DMA_COMPLETE || !txstate)
-		return ret;
-
-	dma_set_residue(txstate, coh901318_get_bytes_left(chan));
-
-	if (ret == DMA_IN_PROGRESS && cohc->stopped)
-		ret = DMA_PAUSED;
-
-	return ret;
-}
-
-static void
-coh901318_issue_pending(struct dma_chan *chan)
-{
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	unsigned long flags;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/*
-	 * Busy means that pending jobs are already being processed,
-	 * and then there is no point in starting the queue: the
-	 * terminal count interrupt on the channel will take the next
-	 * job on the queue and execute it anyway.
-	 */
-	if (!cohc->busy)
-		coh901318_queue_start(cohc);
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-}
-
-/*
- * Here we wrap in the runtime dma control interface
- */
-struct burst_table {
-	int burst_8bit;
-	int burst_16bit;
-	int burst_32bit;
-	u32 reg;
-};
-
-static const struct burst_table burst_sizes[] = {
-	{
-		.burst_8bit = 64,
-		.burst_16bit = 32,
-		.burst_32bit = 16,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_64_BYTES,
-	},
-	{
-		.burst_8bit = 48,
-		.burst_16bit = 24,
-		.burst_32bit = 12,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_48_BYTES,
-	},
-	{
-		.burst_8bit = 32,
-		.burst_16bit = 16,
-		.burst_32bit = 8,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_32_BYTES,
-	},
-	{
-		.burst_8bit = 16,
-		.burst_16bit = 8,
-		.burst_32bit = 4,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_16_BYTES,
-	},
-	{
-		.burst_8bit = 8,
-		.burst_16bit = 4,
-		.burst_32bit = 2,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_8_BYTES,
-	},
-	{
-		.burst_8bit = 4,
-		.burst_16bit = 2,
-		.burst_32bit = 1,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_4_BYTES,
-	},
-	{
-		.burst_8bit = 2,
-		.burst_16bit = 1,
-		.burst_32bit = 0,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_2_BYTES,
-	},
-	{
-		.burst_8bit = 1,
-		.burst_16bit = 0,
-		.burst_32bit = 0,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_1_BYTE,
-	},
-};
-
-static int coh901318_dma_set_runtimeconfig(struct dma_chan *chan,
-					   struct dma_slave_config *config,
-					   enum dma_transfer_direction direction)
-{
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	dma_addr_t addr;
-	enum dma_slave_buswidth addr_width;
-	u32 maxburst;
-	u32 ctrl = 0;
-	int i = 0;
-
-	/* We only support mem to per or per to mem transfers */
-	if (direction == DMA_DEV_TO_MEM) {
-		addr = config->src_addr;
-		addr_width = config->src_addr_width;
-		maxburst = config->src_maxburst;
-	} else if (direction == DMA_MEM_TO_DEV) {
-		addr = config->dst_addr;
-		addr_width = config->dst_addr_width;
-		maxburst = config->dst_maxburst;
-	} else {
-		dev_err(COHC_2_DEV(cohc), "illegal channel mode\n");
-		return -EINVAL;
-	}
-
-	dev_dbg(COHC_2_DEV(cohc), "configure channel for %d byte transfers\n",
-		addr_width);
-	switch (addr_width)  {
-	case DMA_SLAVE_BUSWIDTH_1_BYTE:
-		ctrl |=
-			COH901318_CX_CTRL_SRC_BUS_SIZE_8_BITS |
-			COH901318_CX_CTRL_DST_BUS_SIZE_8_BITS;
-
-		while (i < ARRAY_SIZE(burst_sizes)) {
-			if (burst_sizes[i].burst_8bit <= maxburst)
-				break;
-			i++;
-		}
-
-		break;
-	case DMA_SLAVE_BUSWIDTH_2_BYTES:
-		ctrl |=
-			COH901318_CX_CTRL_SRC_BUS_SIZE_16_BITS |
-			COH901318_CX_CTRL_DST_BUS_SIZE_16_BITS;
-
-		while (i < ARRAY_SIZE(burst_sizes)) {
-			if (burst_sizes[i].burst_16bit <= maxburst)
-				break;
-			i++;
-		}
-
-		break;
-	case DMA_SLAVE_BUSWIDTH_4_BYTES:
-		/* Direction doesn't matter here, it's 32/32 bits */
-		ctrl |=
-			COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-			COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS;
-
-		while (i < ARRAY_SIZE(burst_sizes)) {
-			if (burst_sizes[i].burst_32bit <= maxburst)
-				break;
-			i++;
-		}
-
-		break;
-	default:
-		dev_err(COHC_2_DEV(cohc),
-			"bad runtimeconfig: alien address width\n");
-		return -EINVAL;
-	}
-
-	ctrl |= burst_sizes[i].reg;
-	dev_dbg(COHC_2_DEV(cohc),
-		"selected burst size %d bytes for address width %d bytes, maxburst %d\n",
-		burst_sizes[i].burst_8bit, addr_width, maxburst);
-
-	cohc->addr = addr;
-	cohc->ctrl = ctrl;
-
-	return 0;
-}
-
-static int coh901318_dma_slave_config(struct dma_chan *chan,
-					   struct dma_slave_config *config)
-{
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-
-	memcpy(&cohc->config, config, sizeof(*config));
-
-	return 0;
-}
-
-static void coh901318_base_init(struct dma_device *dma, const int *pick_chans,
-				struct coh901318_base *base)
-{
-	int chans_i;
-	int i = 0;
-	struct coh901318_chan *cohc;
-
-	INIT_LIST_HEAD(&dma->channels);
-
-	for (chans_i = 0; pick_chans[chans_i] != -1; chans_i += 2) {
-		for (i = pick_chans[chans_i]; i <= pick_chans[chans_i+1]; i++) {
-			cohc = &base->chans[i];
-
-			cohc->base = base;
-			cohc->chan.device = dma;
-			cohc->id = i;
-
-			/* TODO: do we really need this lock if only one
-			 * client is connected to each channel?
-			 */
-
-			spin_lock_init(&cohc->lock);
-
-			cohc->nbr_active_done = 0;
-			cohc->busy = 0;
-			INIT_LIST_HEAD(&cohc->free);
-			INIT_LIST_HEAD(&cohc->active);
-			INIT_LIST_HEAD(&cohc->queue);
-
-			tasklet_setup(&cohc->tasklet, dma_tasklet);
-
-			list_add_tail(&cohc->chan.device_node,
-				      &dma->channels);
-		}
-	}
-}
-
-static int __init coh901318_probe(struct platform_device *pdev)
-{
-	int err = 0;
-	struct coh901318_base *base;
-	int irq;
-	struct resource *io;
-
-	io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!io)
-		return -ENODEV;
-
-	/* Map DMA controller registers to virtual memory */
-	if (devm_request_mem_region(&pdev->dev,
-				    io->start,
-				    resource_size(io),
-				    pdev->dev.driver->name) == NULL)
-		return -ENOMEM;
-
-	base = devm_kzalloc(&pdev->dev,
-			    ALIGN(sizeof(struct coh901318_base), 4) +
-			    U300_DMA_CHANNELS *
-			    sizeof(struct coh901318_chan),
-			    GFP_KERNEL);
-	if (!base)
-		return -ENOMEM;
-
-	base->chans = ((void *)base) + ALIGN(sizeof(struct coh901318_base), 4);
-
-	base->virtbase = devm_ioremap(&pdev->dev, io->start, resource_size(io));
-	if (!base->virtbase)
-		return -ENOMEM;
-
-	base->dev = &pdev->dev;
-	spin_lock_init(&base->pm.lock);
-	base->pm.started_channels = 0;
-
-	COH901318_DEBUGFS_ASSIGN(debugfs_dma_base, base);
-
-	irq = platform_get_irq(pdev, 0);
-	if (irq < 0)
-		return irq;
-
-	err = devm_request_irq(&pdev->dev, irq, dma_irq_handler, 0,
-			       "coh901318", base);
-	if (err)
-		return err;
-
-	base->irq = irq;
-
-	err = coh901318_pool_create(&base->pool, &pdev->dev,
-				    sizeof(struct coh901318_lli),
-				    32);
-	if (err)
-		return err;
-
-	/* init channels for device transfers */
-	coh901318_base_init(&base->dma_slave, dma_slave_channels,
-			    base);
-
-	dma_cap_zero(base->dma_slave.cap_mask);
-	dma_cap_set(DMA_SLAVE, base->dma_slave.cap_mask);
-
-	base->dma_slave.device_alloc_chan_resources = coh901318_alloc_chan_resources;
-	base->dma_slave.device_free_chan_resources = coh901318_free_chan_resources;
-	base->dma_slave.device_prep_slave_sg = coh901318_prep_slave_sg;
-	base->dma_slave.device_tx_status = coh901318_tx_status;
-	base->dma_slave.device_issue_pending = coh901318_issue_pending;
-	base->dma_slave.device_config = coh901318_dma_slave_config;
-	base->dma_slave.device_pause = coh901318_pause;
-	base->dma_slave.device_resume = coh901318_resume;
-	base->dma_slave.device_terminate_all = coh901318_terminate_all;
-	base->dma_slave.dev = &pdev->dev;
-
-	err = dma_async_device_register(&base->dma_slave);
-
-	if (err)
-		goto err_register_slave;
-
-	/* init channels for memcpy */
-	coh901318_base_init(&base->dma_memcpy, dma_memcpy_channels,
-			    base);
-
-	dma_cap_zero(base->dma_memcpy.cap_mask);
-	dma_cap_set(DMA_MEMCPY, base->dma_memcpy.cap_mask);
-
-	base->dma_memcpy.device_alloc_chan_resources = coh901318_alloc_chan_resources;
-	base->dma_memcpy.device_free_chan_resources = coh901318_free_chan_resources;
-	base->dma_memcpy.device_prep_dma_memcpy = coh901318_prep_memcpy;
-	base->dma_memcpy.device_tx_status = coh901318_tx_status;
-	base->dma_memcpy.device_issue_pending = coh901318_issue_pending;
-	base->dma_memcpy.device_config = coh901318_dma_slave_config;
-	base->dma_memcpy.device_pause = coh901318_pause;
-	base->dma_memcpy.device_resume = coh901318_resume;
-	base->dma_memcpy.device_terminate_all = coh901318_terminate_all;
-	base->dma_memcpy.dev = &pdev->dev;
-	/*
-	 * This controller can only access address at even 32bit boundaries,
-	 * i.e. 2^2
-	 */
-	base->dma_memcpy.copy_align = DMAENGINE_ALIGN_4_BYTES;
-	err = dma_async_device_register(&base->dma_memcpy);
-
-	if (err)
-		goto err_register_memcpy;
-
-	err = of_dma_controller_register(pdev->dev.of_node, coh901318_xlate,
-					 base);
-	if (err)
-		goto err_register_of_dma;
-
-	platform_set_drvdata(pdev, base);
-	dev_info(&pdev->dev, "Initialized COH901318 DMA on virtual base 0x%p\n",
-		base->virtbase);
-
-	return err;
-
- err_register_of_dma:
-	dma_async_device_unregister(&base->dma_memcpy);
- err_register_memcpy:
-	dma_async_device_unregister(&base->dma_slave);
- err_register_slave:
-	coh901318_pool_destroy(&base->pool);
-	return err;
-}
-static void coh901318_base_remove(struct coh901318_base *base, const int *pick_chans)
-{
-	int chans_i;
-	int i = 0;
-	struct coh901318_chan *cohc;
-
-	for (chans_i = 0; pick_chans[chans_i] != -1; chans_i += 2) {
-		for (i = pick_chans[chans_i]; i <= pick_chans[chans_i+1]; i++) {
-			cohc = &base->chans[i];
-
-			tasklet_kill(&cohc->tasklet);
-		}
-	}
-
-}
-
-static int coh901318_remove(struct platform_device *pdev)
-{
-	struct coh901318_base *base = platform_get_drvdata(pdev);
-
-	devm_free_irq(&pdev->dev, base->irq, base);
-
-	coh901318_base_remove(base, dma_slave_channels);
-	coh901318_base_remove(base, dma_memcpy_channels);
-
-	of_dma_controller_free(pdev->dev.of_node);
-	dma_async_device_unregister(&base->dma_memcpy);
-	dma_async_device_unregister(&base->dma_slave);
-	coh901318_pool_destroy(&base->pool);
-	return 0;
-}
-
-static const struct of_device_id coh901318_dt_match[] = {
-	{ .compatible = "stericsson,coh901318" },
-	{},
-};
-
-static struct platform_driver coh901318_driver = {
-	.remove = coh901318_remove,
-	.driver = {
-		.name	= "coh901318",
-		.of_match_table = coh901318_dt_match,
-	},
-};
-
-static int __init coh901318_init(void)
-{
-	return platform_driver_probe(&coh901318_driver, coh901318_probe);
-}
-subsys_initcall(coh901318_init);
-
-static void __exit coh901318_exit(void)
-{
-	platform_driver_unregister(&coh901318_driver);
-}
-module_exit(coh901318_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Per Friden");
diff --git a/drivers/dma/coh901318.h b/drivers/dma/coh901318.h
deleted file mode 100644
index bbf533600558..000000000000
--- a/drivers/dma/coh901318.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2007-2013 ST-Ericsson
- * DMA driver for COH 901 318
- * Author: Per Friden <per.friden@stericsson.com>
- */
-
-#ifndef COH901318_H
-#define COH901318_H
-
-#define MAX_DMA_PACKET_SIZE_SHIFT 11
-#define MAX_DMA_PACKET_SIZE (1 << MAX_DMA_PACKET_SIZE_SHIFT)
-
-struct device;
-
-struct coh901318_pool {
-	spinlock_t lock;
-	struct dma_pool *dmapool;
-	struct device *dev;
-
-#ifdef CONFIG_DEBUG_FS
-	int debugfs_pool_counter;
-#endif
-};
-
-/**
- * struct coh901318_lli - linked list item for DMAC
- * @control: control settings for DMAC
- * @src_addr: transfer source address
- * @dst_addr: transfer destination address
- * @link_addr:  physical address to next lli
- * @virt_link_addr: virtual address of next lli (only used by pool_free)
- * @phy_this: physical address of current lli (only used by pool_free)
- */
-struct coh901318_lli {
-	u32 control;
-	dma_addr_t src_addr;
-	dma_addr_t dst_addr;
-	dma_addr_t link_addr;
-
-	void *virt_link_addr;
-	dma_addr_t phy_this;
-};
-
-/**
- * coh901318_pool_create() - Creates an dma pool for lli:s
- * @pool: pool handle
- * @dev: dma device
- * @lli_nbr: number of lli:s in the pool
- * @algin: address alignemtn of lli:s
- * returns 0 on success otherwise none zero
- */
-int coh901318_pool_create(struct coh901318_pool *pool,
-			  struct device *dev,
-			  size_t lli_nbr, size_t align);
-
-/**
- * coh901318_pool_destroy() - Destroys the dma pool
- * @pool: pool handle
- * returns 0 on success otherwise none zero
- */
-int coh901318_pool_destroy(struct coh901318_pool *pool);
-
-/**
- * coh901318_lli_alloc() - Allocates a linked list
- *
- * @pool: pool handle
- * @len: length to list
- * return: none NULL if success otherwise NULL
- */
-struct coh901318_lli *
-coh901318_lli_alloc(struct coh901318_pool *pool,
-		    unsigned int len);
-
-/**
- * coh901318_lli_free() - Returns the linked list items to the pool
- * @pool: pool handle
- * @lli: reference to lli pointer to be freed
- */
-void coh901318_lli_free(struct coh901318_pool *pool,
-			struct coh901318_lli **lli);
-
-/**
- * coh901318_lli_fill_memcpy() - Prepares the lli:s for dma memcpy
- * @pool: pool handle
- * @lli: allocated lli
- * @src: src address
- * @size: transfer size
- * @dst: destination address
- * @ctrl_chained: ctrl for chained lli
- * @ctrl_last: ctrl for the last lli
- * returns number of CPU interrupts for the lli, negative on error.
- */
-int
-coh901318_lli_fill_memcpy(struct coh901318_pool *pool,
-			  struct coh901318_lli *lli,
-			  dma_addr_t src, unsigned int size,
-			  dma_addr_t dst, u32 ctrl_chained, u32 ctrl_last);
-
-/**
- * coh901318_lli_fill_single() - Prepares the lli:s for dma single transfer
- * @pool: pool handle
- * @lli: allocated lli
- * @buf: transfer buffer
- * @size: transfer size
- * @dev_addr: address of periphal
- * @ctrl_chained: ctrl for chained lli
- * @ctrl_last: ctrl for the last lli
- * @dir: direction of transfer (to or from device)
- * returns number of CPU interrupts for the lli, negative on error.
- */
-int
-coh901318_lli_fill_single(struct coh901318_pool *pool,
-			  struct coh901318_lli *lli,
-			  dma_addr_t buf, unsigned int size,
-			  dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl_last,
-			  enum dma_transfer_direction dir);
-
-/**
- * coh901318_lli_fill_single() - Prepares the lli:s for dma scatter list transfer
- * @pool: pool handle
- * @lli: allocated lli
- * @sg: scatter gather list
- * @nents: number of entries in sg
- * @dev_addr: address of periphal
- * @ctrl_chained: ctrl for chained lli
- * @ctrl: ctrl of middle lli
- * @ctrl_last: ctrl for the last lli
- * @dir: direction of transfer (to or from device)
- * @ctrl_irq_mask: ctrl mask for CPU interrupt
- * returns number of CPU interrupts for the lli, negative on error.
- */
-int
-coh901318_lli_fill_sg(struct coh901318_pool *pool,
-		      struct coh901318_lli *lli,
-		      struct scatterlist *sg, unsigned int nents,
-		      dma_addr_t dev_addr, u32 ctrl_chained,
-		      u32 ctrl, u32 ctrl_last,
-		      enum dma_transfer_direction dir, u32 ctrl_irq_mask);
-
-#endif /* COH901318_H */
diff --git a/drivers/dma/coh901318_lli.c b/drivers/dma/coh901318_lli.c
deleted file mode 100644
index 6b6c2fd0865a..000000000000
--- a/drivers/dma/coh901318_lli.c
+++ /dev/null
@@ -1,313 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * driver/dma/coh901318_lli.c
- *
- * Copyright (C) 2007-2009 ST-Ericsson
- * Support functions for handling lli for dma
- * Author: Per Friden <per.friden@stericsson.com>
- */
-
-#include <linux/spinlock.h>
-#include <linux/memory.h>
-#include <linux/gfp.h>
-#include <linux/dmapool.h>
-#include <linux/dmaengine.h>
-
-#include "coh901318.h"
-
-#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_U300_DEBUG))
-#define DEBUGFS_POOL_COUNTER_RESET(pool) (pool->debugfs_pool_counter = 0)
-#define DEBUGFS_POOL_COUNTER_ADD(pool, add) (pool->debugfs_pool_counter += add)
-#else
-#define DEBUGFS_POOL_COUNTER_RESET(pool)
-#define DEBUGFS_POOL_COUNTER_ADD(pool, add)
-#endif
-
-static struct coh901318_lli *
-coh901318_lli_next(struct coh901318_lli *data)
-{
-	if (data == NULL || data->link_addr == 0)
-		return NULL;
-
-	return (struct coh901318_lli *) data->virt_link_addr;
-}
-
-int coh901318_pool_create(struct coh901318_pool *pool,
-			  struct device *dev,
-			  size_t size, size_t align)
-{
-	spin_lock_init(&pool->lock);
-	pool->dev = dev;
-	pool->dmapool = dma_pool_create("lli_pool", dev, size, align, 0);
-
-	DEBUGFS_POOL_COUNTER_RESET(pool);
-	return 0;
-}
-
-int coh901318_pool_destroy(struct coh901318_pool *pool)
-{
-
-	dma_pool_destroy(pool->dmapool);
-	return 0;
-}
-
-struct coh901318_lli *
-coh901318_lli_alloc(struct coh901318_pool *pool, unsigned int len)
-{
-	int i;
-	struct coh901318_lli *head;
-	struct coh901318_lli *lli;
-	struct coh901318_lli *lli_prev;
-	dma_addr_t phy;
-
-	if (len == 0)
-		return NULL;
-
-	spin_lock(&pool->lock);
-
-	head = dma_pool_alloc(pool->dmapool, GFP_NOWAIT, &phy);
-
-	if (head == NULL)
-		goto err;
-
-	DEBUGFS_POOL_COUNTER_ADD(pool, 1);
-
-	lli = head;
-	lli->phy_this = phy;
-	lli->link_addr = 0x00000000;
-	lli->virt_link_addr = NULL;
-
-	for (i = 1; i < len; i++) {
-		lli_prev = lli;
-
-		lli = dma_pool_alloc(pool->dmapool, GFP_NOWAIT, &phy);
-
-		if (lli == NULL)
-			goto err_clean_up;
-
-		DEBUGFS_POOL_COUNTER_ADD(pool, 1);
-		lli->phy_this = phy;
-		lli->link_addr = 0x00000000;
-		lli->virt_link_addr = NULL;
-
-		lli_prev->link_addr = phy;
-		lli_prev->virt_link_addr = lli;
-	}
-
-	spin_unlock(&pool->lock);
-
-	return head;
-
- err:
-	spin_unlock(&pool->lock);
-	return NULL;
-
- err_clean_up:
-	lli_prev->link_addr = 0x00000000U;
-	spin_unlock(&pool->lock);
-	coh901318_lli_free(pool, &head);
-	return NULL;
-}
-
-void coh901318_lli_free(struct coh901318_pool *pool,
-			struct coh901318_lli **lli)
-{
-	struct coh901318_lli *l;
-	struct coh901318_lli *next;
-
-	if (lli == NULL)
-		return;
-
-	l = *lli;
-
-	if (l == NULL)
-		return;
-
-	spin_lock(&pool->lock);
-
-	while (l->link_addr) {
-		next = l->virt_link_addr;
-		dma_pool_free(pool->dmapool, l, l->phy_this);
-		DEBUGFS_POOL_COUNTER_ADD(pool, -1);
-		l = next;
-	}
-	dma_pool_free(pool->dmapool, l, l->phy_this);
-	DEBUGFS_POOL_COUNTER_ADD(pool, -1);
-
-	spin_unlock(&pool->lock);
-	*lli = NULL;
-}
-
-int
-coh901318_lli_fill_memcpy(struct coh901318_pool *pool,
-			  struct coh901318_lli *lli,
-			  dma_addr_t source, unsigned int size,
-			  dma_addr_t destination, u32 ctrl_chained,
-			  u32 ctrl_eom)
-{
-	int s = size;
-	dma_addr_t src = source;
-	dma_addr_t dst = destination;
-
-	lli->src_addr = src;
-	lli->dst_addr = dst;
-
-	while (lli->link_addr) {
-		lli->control = ctrl_chained | MAX_DMA_PACKET_SIZE;
-		lli->src_addr = src;
-		lli->dst_addr = dst;
-
-		s -= MAX_DMA_PACKET_SIZE;
-		lli = coh901318_lli_next(lli);
-
-		src += MAX_DMA_PACKET_SIZE;
-		dst += MAX_DMA_PACKET_SIZE;
-	}
-
-	lli->control = ctrl_eom | s;
-	lli->src_addr = src;
-	lli->dst_addr = dst;
-
-	return 0;
-}
-
-int
-coh901318_lli_fill_single(struct coh901318_pool *pool,
-			  struct coh901318_lli *lli,
-			  dma_addr_t buf, unsigned int size,
-			  dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl_eom,
-			  enum dma_transfer_direction dir)
-{
-	int s = size;
-	dma_addr_t src;
-	dma_addr_t dst;
-
-
-	if (dir == DMA_MEM_TO_DEV) {
-		src = buf;
-		dst = dev_addr;
-
-	} else if (dir == DMA_DEV_TO_MEM) {
-
-		src = dev_addr;
-		dst = buf;
-	} else {
-		return -EINVAL;
-	}
-
-	while (lli->link_addr) {
-		size_t block_size = MAX_DMA_PACKET_SIZE;
-		lli->control = ctrl_chained | MAX_DMA_PACKET_SIZE;
-
-		/* If we are on the next-to-final block and there will
-		 * be less than half a DMA packet left for the last
-		 * block, then we want to make this block a little
-		 * smaller to balance the sizes. This is meant to
-		 * avoid too small transfers if the buffer size is
-		 * (MAX_DMA_PACKET_SIZE*N + 1) */
-		if (s < (MAX_DMA_PACKET_SIZE + MAX_DMA_PACKET_SIZE/2))
-			block_size = MAX_DMA_PACKET_SIZE/2;
-
-		s -= block_size;
-		lli->src_addr = src;
-		lli->dst_addr = dst;
-
-		lli = coh901318_lli_next(lli);
-
-		if (dir == DMA_MEM_TO_DEV)
-			src += block_size;
-		else if (dir == DMA_DEV_TO_MEM)
-			dst += block_size;
-	}
-
-	lli->control = ctrl_eom | s;
-	lli->src_addr = src;
-	lli->dst_addr = dst;
-
-	return 0;
-}
-
-int
-coh901318_lli_fill_sg(struct coh901318_pool *pool,
-		      struct coh901318_lli *lli,
-		      struct scatterlist *sgl, unsigned int nents,
-		      dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl,
-		      u32 ctrl_last,
-		      enum dma_transfer_direction dir, u32 ctrl_irq_mask)
-{
-	int i;
-	struct scatterlist *sg;
-	u32 ctrl_sg;
-	dma_addr_t src = 0;
-	dma_addr_t dst = 0;
-	u32 bytes_to_transfer;
-	u32 elem_size;
-
-	if (lli == NULL)
-		goto err;
-
-	spin_lock(&pool->lock);
-
-	if (dir == DMA_MEM_TO_DEV)
-		dst = dev_addr;
-	else if (dir == DMA_DEV_TO_MEM)
-		src = dev_addr;
-	else
-		goto err;
-
-	for_each_sg(sgl, sg, nents, i) {
-		if (sg_is_chain(sg)) {
-			/* sg continues to the next sg-element don't
-			 * send ctrl_finish until the last
-			 * sg-element in the chain
-			 */
-			ctrl_sg = ctrl_chained;
-		} else if (i == nents - 1)
-			ctrl_sg = ctrl_last;
-		else
-			ctrl_sg = ctrl ? ctrl : ctrl_last;
-
-
-		if (dir == DMA_MEM_TO_DEV)
-			/* increment source address */
-			src = sg_dma_address(sg);
-		else
-			/* increment destination address */
-			dst = sg_dma_address(sg);
-
-		bytes_to_transfer = sg_dma_len(sg);
-
-		while (bytes_to_transfer) {
-			u32 val;
-
-			if (bytes_to_transfer > MAX_DMA_PACKET_SIZE) {
-				elem_size = MAX_DMA_PACKET_SIZE;
-				val = ctrl_chained;
-			} else {
-				elem_size = bytes_to_transfer;
-				val = ctrl_sg;
-			}
-
-			lli->control = val | elem_size;
-			lli->src_addr = src;
-			lli->dst_addr = dst;
-
-			if (dir == DMA_DEV_TO_MEM)
-				dst += elem_size;
-			else
-				src += elem_size;
-
-			BUG_ON(lli->link_addr & 3);
-
-			bytes_to_transfer -= elem_size;
-			lli = coh901318_lli_next(lli);
-		}
-
-	}
-	spin_unlock(&pool->lock);
-
-	return 0;
- err:
-	spin_unlock(&pool->lock);
-	return -EINVAL;
-}
diff --git a/include/linux/platform_data/dma-coh901318.h b/include/linux/platform_data/dma-coh901318.h
deleted file mode 100644
index 4cca529f8d56..000000000000
--- a/include/linux/platform_data/dma-coh901318.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Platform data for the COH901318 DMA controller
- * Copyright (C) 2007-2013 ST-Ericsson
- */
-
-#ifndef PLAT_COH901318_H
-#define PLAT_COH901318_H
-
-#ifdef CONFIG_COH901318
-
-/* We only support the U300 DMA channels */
-#define U300_DMA_MSL_TX_0		0
-#define U300_DMA_MSL_TX_1		1
-#define U300_DMA_MSL_TX_2		2
-#define U300_DMA_MSL_TX_3		3
-#define U300_DMA_MSL_TX_4		4
-#define U300_DMA_MSL_TX_5		5
-#define U300_DMA_MSL_TX_6		6
-#define U300_DMA_MSL_RX_0		7
-#define U300_DMA_MSL_RX_1		8
-#define U300_DMA_MSL_RX_2		9
-#define U300_DMA_MSL_RX_3		10
-#define U300_DMA_MSL_RX_4		11
-#define U300_DMA_MSL_RX_5		12
-#define U300_DMA_MSL_RX_6		13
-#define U300_DMA_MMCSD_RX_TX		14
-#define U300_DMA_MSPRO_TX		15
-#define U300_DMA_MSPRO_RX		16
-#define U300_DMA_UART0_TX		17
-#define U300_DMA_UART0_RX		18
-#define U300_DMA_APEX_TX		19
-#define U300_DMA_APEX_RX		20
-#define U300_DMA_PCM_I2S0_TX		21
-#define U300_DMA_PCM_I2S0_RX		22
-#define U300_DMA_PCM_I2S1_TX		23
-#define U300_DMA_PCM_I2S1_RX		24
-#define U300_DMA_XGAM_CDI		25
-#define U300_DMA_XGAM_PDI		26
-#define U300_DMA_SPI_TX			27
-#define U300_DMA_SPI_RX			28
-#define U300_DMA_GENERAL_PURPOSE_0	29
-#define U300_DMA_GENERAL_PURPOSE_1	30
-#define U300_DMA_GENERAL_PURPOSE_2	31
-#define U300_DMA_GENERAL_PURPOSE_3	32
-#define U300_DMA_GENERAL_PURPOSE_4	33
-#define U300_DMA_GENERAL_PURPOSE_5	34
-#define U300_DMA_GENERAL_PURPOSE_6	35
-#define U300_DMA_GENERAL_PURPOSE_7	36
-#define U300_DMA_GENERAL_PURPOSE_8	37
-#define U300_DMA_UART1_TX		38
-#define U300_DMA_UART1_RX		39
-
-#define U300_DMA_DEVICE_CHANNELS	32
-#define U300_DMA_CHANNELS		40
-
-/**
- * coh901318_filter_id() - DMA channel filter function
- * @chan: dma channel handle
- * @chan_id: id of dma channel to be filter out
- *
- * In dma_request_channel() it specifies what channel id to be requested
- */
-bool coh901318_filter_id(struct dma_chan *chan, void *chan_id);
-#else
-static inline bool coh901318_filter_id(struct dma_chan *chan, void *chan_id)
-{
-	return false;
-}
-#endif
-
-#endif /* PLAT_COH901318_H */
-- 
cgit v1.2.3


From e247f85a9bf6af9ce6bc36d86ac242f782ae0947 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Thu, 21 Jan 2021 12:03:54 +0100
Subject: dmaengine: mmp_pdma: Remove mmp_pdma_filter_fn()

It's not used anywhere -- drop it.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Link: https://lore.kernel.org/r/20210121110356.1768635-2-lkundrak@v3.sk
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/mmp_pdma.c       | 14 --------------
 include/linux/dma/mmp-pdma.h | 16 ----------------
 2 files changed, 30 deletions(-)
 delete mode 100644 include/linux/dma/mmp-pdma.h

(limited to 'include')

diff --git a/drivers/dma/mmp_pdma.c b/drivers/dma/mmp_pdma.c
index b84303be8edf..89f1814ff27a 100644
--- a/drivers/dma/mmp_pdma.c
+++ b/drivers/dma/mmp_pdma.c
@@ -18,7 +18,6 @@
 #include <linux/of_device.h>
 #include <linux/of_dma.h>
 #include <linux/of.h>
-#include <linux/dma/mmp-pdma.h>
 
 #include "dmaengine.h"
 
@@ -1148,19 +1147,6 @@ static struct platform_driver mmp_pdma_driver = {
 	.remove		= mmp_pdma_remove,
 };
 
-bool mmp_pdma_filter_fn(struct dma_chan *chan, void *param)
-{
-	struct mmp_pdma_chan *c = to_mmp_pdma_chan(chan);
-
-	if (chan->device->dev->driver != &mmp_pdma_driver.driver)
-		return false;
-
-	c->drcmr = *(unsigned int *)param;
-
-	return true;
-}
-EXPORT_SYMBOL_GPL(mmp_pdma_filter_fn);
-
 module_platform_driver(mmp_pdma_driver);
 
 MODULE_DESCRIPTION("MARVELL MMP Peripheral DMA Driver");
diff --git a/include/linux/dma/mmp-pdma.h b/include/linux/dma/mmp-pdma.h
deleted file mode 100644
index 25cab62a28c4..000000000000
--- a/include/linux/dma/mmp-pdma.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _MMP_PDMA_H_
-#define _MMP_PDMA_H_
-
-struct dma_chan;
-
-#ifdef CONFIG_MMP_PDMA
-bool mmp_pdma_filter_fn(struct dma_chan *chan, void *param);
-#else
-static inline bool mmp_pdma_filter_fn(struct dma_chan *chan, void *param)
-{
-	return false;
-}
-#endif
-
-#endif /* _MMP_PDMA_H_ */
-- 
cgit v1.2.3


From 7eecea89e44f64e60f1410483e79a0cab18ad580 Mon Sep 17 00:00:00 2001
From: Jorgen Hansen <jhansen@vmware.com>
Date: Wed, 20 Jan 2021 08:33:40 -0800
Subject: VMCI: Enforce queuepair max size for IOCTL_VMCI_QUEUEPAIR_ALLOC

When create the VMCI queue pair tracking data structures on the host
side, the IOCTL for creating the VMCI queue pair didn't validate
the queue pair size parameters. This change adds checks for this.

This avoids a memory allocation issue in qp_host_alloc_queue, as
reported by nslusarek@gmx.net. The check in qp_host_alloc_queue
has also been updated to enforce the maximum queue pair size
as defined by VMCI_MAX_GUEST_QP_MEMORY.

The fix has been verified using sample code supplied by
nslusarek@gmx.net.

Reported-by: nslusarek@gmx.net
Reviewed-by: Vishnu Dasa <vdasa@vmware.com>
Signed-off-by: Jorgen Hansen <jhansen@vmware.com>
Link: https://lore.kernel.org/r/1611160420-30573-1-git-send-email-jhansen@vmware.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/vmw_vmci/vmci_queue_pair.c | 12 ++++++++----
 include/linux/vmw_vmci_defs.h           |  4 ++--
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c
index 525ef96d3a07..d787ddecee77 100644
--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
+++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
@@ -237,7 +237,9 @@ static struct qp_list qp_guest_endpoints = {
 #define QPE_NUM_PAGES(_QPE) ((u32) \
 			     (DIV_ROUND_UP(_QPE.produce_size, PAGE_SIZE) + \
 			      DIV_ROUND_UP(_QPE.consume_size, PAGE_SIZE) + 2))
-
+#define QP_SIZES_ARE_VALID(_prod_qsize, _cons_qsize) \
+	((_prod_qsize) + (_cons_qsize) >= max(_prod_qsize, _cons_qsize) && \
+	 (_prod_qsize) + (_cons_qsize) <= VMCI_MAX_GUEST_QP_MEMORY)
 
 /*
  * Frees kernel VA space for a given queue and its queue header, and
@@ -528,7 +530,7 @@ static struct vmci_queue *qp_host_alloc_queue(u64 size)
 	u64 num_pages;
 	const size_t queue_size = sizeof(*queue) + sizeof(*(queue->kernel_if));
 
-	if (size > SIZE_MAX - PAGE_SIZE)
+	if (size > min_t(size_t, VMCI_MAX_GUEST_QP_MEMORY, SIZE_MAX - PAGE_SIZE))
 		return NULL;
 	num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1;
 	if (num_pages > (SIZE_MAX - queue_size) /
@@ -1929,6 +1931,9 @@ int vmci_qp_broker_alloc(struct vmci_handle handle,
 			 struct vmci_qp_page_store *page_store,
 			 struct vmci_ctx *context)
 {
+	if (!QP_SIZES_ARE_VALID(produce_size, consume_size))
+		return VMCI_ERROR_NO_RESOURCES;
+
 	return qp_broker_alloc(handle, peer, flags, priv_flags,
 			       produce_size, consume_size,
 			       page_store, context, NULL, NULL, NULL, NULL);
@@ -2685,8 +2690,7 @@ int vmci_qpair_alloc(struct vmci_qp **qpair,
 	 * used by the device is NO_RESOURCES, so use that here too.
 	 */
 
-	if (produce_qsize + consume_qsize < max(produce_qsize, consume_qsize) ||
-	    produce_qsize + consume_qsize > VMCI_MAX_GUEST_QP_MEMORY)
+	if (!QP_SIZES_ARE_VALID(produce_qsize, consume_qsize))
 		return VMCI_ERROR_NO_RESOURCES;
 
 	retval = vmci_route(&src, &dst, false, &route);
diff --git a/include/linux/vmw_vmci_defs.h b/include/linux/vmw_vmci_defs.h
index be0afe6f379b..e36cb114c188 100644
--- a/include/linux/vmw_vmci_defs.h
+++ b/include/linux/vmw_vmci_defs.h
@@ -66,7 +66,7 @@ enum {
  * consists of at least two pages, the memory limit also dictates the
  * number of queue pairs a guest can create.
  */
-#define VMCI_MAX_GUEST_QP_MEMORY (128 * 1024 * 1024)
+#define VMCI_MAX_GUEST_QP_MEMORY ((size_t)(128 * 1024 * 1024))
 #define VMCI_MAX_GUEST_QP_COUNT  (VMCI_MAX_GUEST_QP_MEMORY / PAGE_SIZE / 2)
 
 /*
@@ -80,7 +80,7 @@ enum {
  * too much kernel memory (especially on vmkernel).  We limit a queuepair to
  * 32 KB, or 16 KB per queue for symmetrical pairs.
  */
-#define VMCI_MAX_PINNED_QP_MEMORY (32 * 1024)
+#define VMCI_MAX_PINNED_QP_MEMORY ((size_t)(32 * 1024))
 
 /*
  * We have a fixed set of resource IDs available in the VMX.
-- 
cgit v1.2.3


From 0fc99422bc034de018607ef6b70f92d4bc4a236d Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@xilinx.com>
Date: Mon, 18 Jan 2021 09:48:56 +0100
Subject: firmware: xilinx: Remove PM_API_MAX value

There is no reason to keep PM_API_MAX around. The commit acfdd18591ea
("firmware: xilinx: Use hash-table for api feature check") removed its
usage that's why it is not used anywhere now.

Signed-off-by: Michal Simek <michal.simek@xilinx.com>
Link: https://lore.kernel.org/r/86e94593a29a1b5b1958c539a1bfabdd08c0948e.1610959734.git.michal.simek@xilinx.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 2a0da841c942..0a483249ff30 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -85,7 +85,6 @@ enum pm_api_id {
 	PM_CLOCK_GETPARENT,
 	PM_SECURE_AES = 47,
 	PM_FEATURE_CHECK = 63,
-	PM_API_MAX,
 };
 
 /* PMU-FW return status codes */
-- 
cgit v1.2.3


From acda36189cb8df27ef073d391ebd67c64ac36cf9 Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Wed, 27 Jan 2021 13:11:33 +0200
Subject: dt-bindings: interconnect: Add Qualcomm SDX55 DT bindings

The Qualcomm SDX55 platform has several bus fabrics that could be
controlled and tuned dynamically over RPMh according to the bandwidth
demand.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/20210121053254.8355-2-manivannan.sadhasivam@linaro.org
Signed-off-by: Georgi Djakov <georgi.djakov@linaro.org>
---
 .../bindings/interconnect/qcom,rpmh.yaml           |  4 ++
 include/dt-bindings/interconnect/qcom,sdx55.h      | 76 ++++++++++++++++++++++
 2 files changed, 80 insertions(+)
 create mode 100644 include/dt-bindings/interconnect/qcom,sdx55.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/interconnect/qcom,rpmh.yaml b/Documentation/devicetree/bindings/interconnect/qcom,rpmh.yaml
index 30c2a092d2d3..f9b150b817d8 100644
--- a/Documentation/devicetree/bindings/interconnect/qcom,rpmh.yaml
+++ b/Documentation/devicetree/bindings/interconnect/qcom,rpmh.yaml
@@ -45,6 +45,10 @@ properties:
       - qcom,sdm845-mem-noc
       - qcom,sdm845-mmss-noc
       - qcom,sdm845-system-noc
+      - qcom,sdx55-ipa-virt
+      - qcom,sdx55-mc-virt
+      - qcom,sdx55-mem-noc
+      - qcom,sdx55-system-noc
       - qcom,sm8150-aggre1-noc
       - qcom,sm8150-aggre2-noc
       - qcom,sm8150-camnoc-noc
diff --git a/include/dt-bindings/interconnect/qcom,sdx55.h b/include/dt-bindings/interconnect/qcom,sdx55.h
new file mode 100644
index 000000000000..bfb6524a2d90
--- /dev/null
+++ b/include/dt-bindings/interconnect/qcom,sdx55.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Qualcomm SDX55 interconnect IDs
+ *
+ * Copyright (c) 2021, Linaro Ltd.
+ * Author: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ */
+
+#ifndef __DT_BINDINGS_INTERCONNECT_QCOM_SDX55_H
+#define __DT_BINDINGS_INTERCONNECT_QCOM_SDX55_H
+
+#define MASTER_LLCC			0
+#define SLAVE_EBI_CH0			1
+
+#define MASTER_TCU_0			0
+#define MASTER_SNOC_GC_MEM_NOC		1
+#define MASTER_AMPSS_M0			2
+#define SLAVE_LLCC			3
+#define SLAVE_MEM_NOC_SNOC		4
+#define SLAVE_MEM_NOC_PCIE_SNOC		5
+
+#define MASTER_AUDIO			0
+#define MASTER_BLSP_1			1
+#define MASTER_QDSS_BAM			2
+#define MASTER_QPIC			3
+#define MASTER_SNOC_CFG			4
+#define MASTER_SPMI_FETCHER		5
+#define MASTER_ANOC_SNOC		6
+#define MASTER_IPA			7
+#define MASTER_MEM_NOC_SNOC		8
+#define MASTER_MEM_NOC_PCIE_SNOC	9
+#define MASTER_CRYPTO_CORE_0		10
+#define MASTER_EMAC			11
+#define MASTER_IPA_PCIE			12
+#define MASTER_PCIE			13
+#define MASTER_QDSS_ETR			14
+#define MASTER_SDCC_1			15
+#define MASTER_USB3			16
+#define SLAVE_AOP			17
+#define SLAVE_AOSS			18
+#define SLAVE_APPSS			19
+#define SLAVE_AUDIO			20
+#define SLAVE_BLSP_1			21
+#define SLAVE_CLK_CTL			22
+#define SLAVE_CRYPTO_0_CFG		23
+#define SLAVE_CNOC_DDRSS		24
+#define SLAVE_ECC_CFG			25
+#define SLAVE_EMAC_CFG			26
+#define SLAVE_IMEM_CFG			27
+#define SLAVE_IPA_CFG			28
+#define SLAVE_CNOC_MSS			29
+#define SLAVE_PCIE_PARF			30
+#define SLAVE_PDM			31
+#define SLAVE_PRNG			32
+#define SLAVE_QDSS_CFG			33
+#define SLAVE_QPIC			34
+#define SLAVE_SDCC_1			35
+#define SLAVE_SNOC_CFG			36
+#define SLAVE_SPMI_FETCHER		37
+#define SLAVE_SPMI_VGI_COEX		38
+#define SLAVE_TCSR			39
+#define SLAVE_TLMM			40
+#define SLAVE_USB3			41
+#define SLAVE_USB3_PHY_CFG		42
+#define SLAVE_ANOC_SNOC			43
+#define SLAVE_SNOC_MEM_NOC_GC		44
+#define SLAVE_OCIMEM			45
+#define SLAVE_SERVICE_SNOC		46
+#define SLAVE_PCIE_0			47
+#define SLAVE_QDSS_STM			48
+#define SLAVE_TCU			49
+
+#define MASTER_IPA_CORE			0
+#define SLAVE_IPA_CORE			1
+
+#endif
-- 
cgit v1.2.3


From 8ba59e9dee31246fc34b4d4bec032093e9c06510 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 22 Jan 2021 13:43:58 +0200
Subject: misc: pti: Remove driver for deprecated platform

Intel Moorestown and Medfield are quite old Intel Atom based
32-bit platforms, which were in limited use in some Android phones,
tablets and consumer electronics more than eight years ago.

There are no bugs or problems ever reported outside from Intel
for breaking any of that platforms for years. It seems no real
users exists who run more or less fresh kernel on it. The commit
05f4434bc130 ("ASoC: Intel: remove mfld_machine") also in align
with this theory.

Due to above and to reduce a burden of supporting outdated drivers
we remove the support of outdated platforms completely.

Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210122114358.39299-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/pti_intel_mid.rst | 108 ----
 drivers/misc/Kconfig                       |  13 -
 drivers/misc/Makefile                      |   1 -
 drivers/misc/pti.c                         | 978 -----------------------------
 drivers/tty/Makefile                       |   2 -
 drivers/tty/n_tracerouter.c                | 233 -------
 drivers/tty/n_tracesink.c                  | 228 -------
 drivers/tty/n_tracesink.h                  |  26 -
 include/linux/intel-pti.h                  |  35 --
 9 files changed, 1624 deletions(-)
 delete mode 100644 Documentation/driver-api/pti_intel_mid.rst
 delete mode 100644 drivers/misc/pti.c
 delete mode 100644 drivers/tty/n_tracerouter.c
 delete mode 100644 drivers/tty/n_tracesink.c
 delete mode 100644 drivers/tty/n_tracesink.h
 delete mode 100644 include/linux/intel-pti.h

(limited to 'include')

diff --git a/Documentation/driver-api/pti_intel_mid.rst b/Documentation/driver-api/pti_intel_mid.rst
deleted file mode 100644
index bacc2a4ee89f..000000000000
--- a/Documentation/driver-api/pti_intel_mid.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=============
-Intel MID PTI
-=============
-
-The Intel MID PTI project is HW implemented in Intel Atom
-system-on-a-chip designs based on the Parallel Trace
-Interface for MIPI P1149.7 cJTAG standard.  The kernel solution
-for this platform involves the following files::
-
-	./include/linux/pti.h
-	./drivers/.../n_tracesink.h
-	./drivers/.../n_tracerouter.c
-	./drivers/.../n_tracesink.c
-	./drivers/.../pti.c
-
-pti.c is the driver that enables various debugging features
-popular on platforms from certain mobile manufacturers.
-n_tracerouter.c and n_tracesink.c allow extra system information to
-be collected and routed to the pti driver, such as trace
-debugging data from a modem.  Although n_tracerouter
-and n_tracesink are a part of the complete PTI solution,
-these two line disciplines can work separately from
-pti.c and route any data stream from one /dev/tty node
-to another /dev/tty node via kernel-space.  This provides
-a stable, reliable connection that will not break unless
-the user-space application shuts down (plus avoids
-kernel->user->kernel context switch overheads of routing
-data).
-
-An example debugging usage for this driver system:
-
-  * Hook /dev/ttyPTI0 to syslogd.  Opening this port will also start
-    a console device to further capture debugging messages to PTI.
-  * Hook /dev/ttyPTI1 to modem debugging data to write to PTI HW.
-    This is where n_tracerouter and n_tracesink are used.
-  * Hook /dev/pti to a user-level debugging application for writing
-    to PTI HW.
-  * `Use mipi_` Kernel Driver API in other device drivers for
-    debugging to PTI by first requesting a PTI write address via
-    mipi_request_masterchannel(1).
-
-Below is example pseudo-code on how a 'privileged' application
-can hook up n_tracerouter and n_tracesink to any tty on
-a system.  'Privileged' means the application has enough
-privileges to successfully manipulate the ldisc drivers
-but is not just blindly executing as 'root'. Keep in mind
-the use of ioctl(,TIOCSETD,) is not specific to the n_tracerouter
-and n_tracesink line discpline drivers but is a generic
-operation for a program to use a line discpline driver
-on a tty port other than the default n_tty:
-
-.. code-block:: c
-
-  /////////// To hook up n_tracerouter and n_tracesink /////////
-
-  // Note that n_tracerouter depends on n_tracesink.
-  #include <errno.h>
-  #define ONE_TTY "/dev/ttyOne"
-  #define TWO_TTY "/dev/ttyTwo"
-
-  // needed global to hand onto ldisc connection
-  static int g_fd_source = -1;
-  static int g_fd_sink  = -1;
-
-  // these two vars used to grab LDISC values from loaded ldisc drivers
-  // in OS.  Look at /proc/tty/ldiscs to get the right numbers from
-  // the ldiscs loaded in the system.
-  int source_ldisc_num, sink_ldisc_num = -1;
-  int retval;
-
-  g_fd_source = open(ONE_TTY, O_RDWR); // must be R/W
-  g_fd_sink   = open(TWO_TTY, O_RDWR); // must be R/W
-
-  if (g_fd_source <= 0) || (g_fd_sink <= 0) {
-     // doubt you'll want to use these exact error lines of code
-     printf("Error on open(). errno: %d\n",errno);
-     return errno;
-  }
-
-  retval = ioctl(g_fd_sink, TIOCSETD, &sink_ldisc_num);
-  if (retval < 0) {
-     printf("Error on ioctl().  errno: %d\n", errno);
-     return errno;
-  }
-
-  retval = ioctl(g_fd_source, TIOCSETD, &source_ldisc_num);
-  if (retval < 0) {
-     printf("Error on ioctl().  errno: %d\n", errno);
-     return errno;
-  }
-
-  /////////// To disconnect n_tracerouter and n_tracesink ////////
-
-  // First make sure data through the ldiscs has stopped.
-
-  // Second, disconnect ldiscs.  This provides a
-  // little cleaner shutdown on tty stack.
-  sink_ldisc_num = 0;
-  source_ldisc_num = 0;
-  ioctl(g_fd_uart, TIOCSETD, &sink_ldisc_num);
-  ioctl(g_fd_gadget, TIOCSETD, &source_ldisc_num);
-
-  // Three, program closes connection, and cleanup:
-  close(g_fd_uart);
-  close(g_fd_gadget);
-  g_fd_uart = g_fd_gadget = NULL;
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 8085213913cc..f532c59bb59b 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -104,19 +104,6 @@ config PHANTOM
 	  If you choose to build module, its name will be phantom. If unsure,
 	  say N here.
 
-config INTEL_MID_PTI
-	tristate "Parallel Trace Interface for MIPI P1149.7 cJTAG standard"
-	depends on PCI && TTY && (X86_INTEL_MID || COMPILE_TEST)
-	help
-	  The PTI (Parallel Trace Interface) driver directs
-	  trace data routed from various parts in the system out
-	  through an Intel Penwell PTI port and out of the mobile
-	  device for analysis with a debugging tool (Lauterbach or Fido).
-
-	  You should select this driver if the target kernel is meant for
-	  an Intel Atom (non-netbook) mobile device containing a MIPI
-	  P1149.7 standard implementation.
-
 config TIFM_CORE
 	tristate "TI Flash Media interface support"
 	depends on PCI
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 456aa8a4c896..99b6f15a3c70 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -8,7 +8,6 @@ obj-$(CONFIG_IBMVMC)		+= ibmvmc.o
 obj-$(CONFIG_AD525X_DPOT)	+= ad525x_dpot.o
 obj-$(CONFIG_AD525X_DPOT_I2C)	+= ad525x_dpot-i2c.o
 obj-$(CONFIG_AD525X_DPOT_SPI)	+= ad525x_dpot-spi.o
-obj-$(CONFIG_INTEL_MID_PTI)	+= pti.o
 obj-$(CONFIG_ATMEL_SSC)		+= atmel-ssc.o
 obj-$(CONFIG_DUMMY_IRQ)		+= dummy-irq.o
 obj-$(CONFIG_ICS932S401)	+= ics932s401.o
diff --git a/drivers/misc/pti.c b/drivers/misc/pti.c
deleted file mode 100644
index 7236ae527b19..000000000000
--- a/drivers/misc/pti.c
+++ /dev/null
@@ -1,978 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  pti.c - PTI driver for cJTAG data extration
- *
- *  Copyright (C) Intel 2010
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * The PTI (Parallel Trace Interface) driver directs trace data routed from
- * various parts in the system out through the Intel Penwell PTI port and
- * out of the mobile device for analysis with a debugging tool
- * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7,
- * compact JTAG, standard.
- */
-
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/console.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/tty.h>
-#include <linux/tty_driver.h>
-#include <linux/pci.h>
-#include <linux/mutex.h>
-#include <linux/miscdevice.h>
-#include <linux/intel-pti.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-#define DRIVERNAME		"pti"
-#define PCINAME			"pciPTI"
-#define TTYNAME			"ttyPTI"
-#define CHARNAME		"pti"
-#define PTITTY_MINOR_START	0
-#define PTITTY_MINOR_NUM	2
-#define MAX_APP_IDS		16   /* 128 channel ids / u8 bit size */
-#define MAX_OS_IDS		16   /* 128 channel ids / u8 bit size */
-#define MAX_MODEM_IDS		16   /* 128 channel ids / u8 bit size */
-#define MODEM_BASE_ID		71   /* modem master ID address    */
-#define CONTROL_ID		72   /* control master ID address  */
-#define CONSOLE_ID		73   /* console master ID address  */
-#define OS_BASE_ID		74   /* base OS master ID address  */
-#define APP_BASE_ID		80   /* base App master ID address */
-#define CONTROL_FRAME_LEN	32   /* PTI control frame maximum size */
-#define USER_COPY_SIZE		8192 /* 8Kb buffer for user space copy */
-#define APERTURE_14		0x3800000 /* offset to first OS write addr */
-#define APERTURE_LEN		0x400000  /* address length */
-
-struct pti_tty {
-	struct pti_masterchannel *mc;
-};
-
-struct pti_dev {
-	struct tty_port port[PTITTY_MINOR_NUM];
-	unsigned long pti_addr;
-	unsigned long aperture_base;
-	void __iomem *pti_ioaddr;
-	u8 ia_app[MAX_APP_IDS];
-	u8 ia_os[MAX_OS_IDS];
-	u8 ia_modem[MAX_MODEM_IDS];
-};
-
-/*
- * This protects access to ia_app, ia_os, and ia_modem,
- * which keeps track of channels allocated in
- * an aperture write id.
- */
-static DEFINE_MUTEX(alloclock);
-
-static const struct pci_device_id pci_ids[] = {
-		{PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x82B)},
-		{0}
-};
-
-static struct tty_driver *pti_tty_driver;
-static struct pti_dev *drv_data;
-
-static unsigned int pti_console_channel;
-static unsigned int pti_control_channel;
-
-/**
- *  pti_write_to_aperture()- The private write function to PTI HW.
- *
- *  @mc: The 'aperture'. It's part of a write address that holds
- *       a master and channel ID.
- *  @buf: Data being written to the HW that will ultimately be seen
- *        in a debugging tool (Fido, Lauterbach).
- *  @len: Size of buffer.
- *
- *  Since each aperture is specified by a unique
- *  master/channel ID, no two processes will be writing
- *  to the same aperture at the same time so no lock is required. The
- *  PTI-Output agent will send these out in the order that they arrived, and
- *  thus, it will intermix these messages. The debug tool can then later
- *  regroup the appropriate message segments together reconstituting each
- *  message.
- */
-static void pti_write_to_aperture(struct pti_masterchannel *mc,
-				  u8 *buf,
-				  int len)
-{
-	int dwordcnt;
-	int final;
-	int i;
-	u32 ptiword;
-	u32 __iomem *aperture;
-	u8 *p = buf;
-
-	/*
-	 * calculate the aperture offset from the base using the master and
-	 * channel id's.
-	 */
-	aperture = drv_data->pti_ioaddr + (mc->master << 15)
-		+ (mc->channel << 8);
-
-	dwordcnt = len >> 2;
-	final = len - (dwordcnt << 2);	    /* final = trailing bytes    */
-	if (final == 0 && dwordcnt != 0) {  /* always need a final dword */
-		final += 4;
-		dwordcnt--;
-	}
-
-	for (i = 0; i < dwordcnt; i++) {
-		ptiword = be32_to_cpu(*(u32 *)p);
-		p += 4;
-		iowrite32(ptiword, aperture);
-	}
-
-	aperture += PTI_LASTDWORD_DTS;	/* adding DTS signals that is EOM */
-
-	ptiword = 0;
-	for (i = 0; i < final; i++)
-		ptiword |= *p++ << (24-(8*i));
-
-	iowrite32(ptiword, aperture);
-	return;
-}
-
-/**
- *  pti_control_frame_built_and_sent()- control frame build and send function.
- *
- *  @mc:          The master / channel structure on which the function
- *                built a control frame.
- *  @thread_name: The thread name associated with the master / channel or
- *                'NULL' if using the 'current' global variable.
- *
- *  To be able to post process the PTI contents on host side, a control frame
- *  is added before sending any PTI content. So the host side knows on
- *  each PTI frame the name of the thread using a dedicated master / channel.
- *  The thread name is retrieved from 'current' global variable if 'thread_name'
- *  is 'NULL', else it is retrieved from 'thread_name' parameter.
- *  This function builds this frame and sends it to a master ID CONTROL_ID.
- *  The overhead is only 32 bytes since the driver only writes to HW
- *  in 32 byte chunks.
- */
-static void pti_control_frame_built_and_sent(struct pti_masterchannel *mc,
-					     const char *thread_name)
-{
-	/*
-	 * Since we access the comm member in current's task_struct, we only
-	 * need to be as large as what 'comm' in that structure is.
-	 */
-	char comm[TASK_COMM_LEN];
-	struct pti_masterchannel mccontrol = {.master = CONTROL_ID,
-					      .channel = 0};
-	const char *thread_name_p;
-	const char *control_format = "%3d %3d %s";
-	u8 control_frame[CONTROL_FRAME_LEN];
-
-	if (!thread_name) {
-		if (!in_interrupt())
-			get_task_comm(comm, current);
-		else
-			strncpy(comm, "Interrupt", TASK_COMM_LEN);
-
-		/* Absolutely ensure our buffer is zero terminated. */
-		comm[TASK_COMM_LEN-1] = 0;
-		thread_name_p = comm;
-	} else {
-		thread_name_p = thread_name;
-	}
-
-	mccontrol.channel = pti_control_channel;
-	pti_control_channel = (pti_control_channel + 1) & 0x7f;
-
-	snprintf(control_frame, CONTROL_FRAME_LEN, control_format, mc->master,
-		mc->channel, thread_name_p);
-	pti_write_to_aperture(&mccontrol, control_frame, strlen(control_frame));
-}
-
-/**
- *  pti_write_full_frame_to_aperture()- high level function to
- *					write to PTI.
- *
- *  @mc:  The 'aperture'. It's part of a write address that holds
- *        a master and channel ID.
- *  @buf: Data being written to the HW that will ultimately be seen
- *        in a debugging tool (Fido, Lauterbach).
- *  @len: Size of buffer.
- *
- *  All threads sending data (either console, user space application, ...)
- *  are calling the high level function to write to PTI meaning that it is
- *  possible to add a control frame before sending the content.
- */
-static void pti_write_full_frame_to_aperture(struct pti_masterchannel *mc,
-						const unsigned char *buf,
-						int len)
-{
-	pti_control_frame_built_and_sent(mc, NULL);
-	pti_write_to_aperture(mc, (u8 *)buf, len);
-}
-
-/**
- * get_id()- Allocate a master and channel ID.
- *
- * @id_array:    an array of bits representing what channel
- *               id's are allocated for writing.
- * @max_ids:     The max amount of available write IDs to use.
- * @base_id:     The starting SW channel ID, based on the Intel
- *               PTI arch.
- * @thread_name: The thread name associated with the master / channel or
- *               'NULL' if using the 'current' global variable.
- *
- * Returns:
- *	pti_masterchannel struct with master, channel ID address
- *	0 for error
- *
- * Each bit in the arrays ia_app and ia_os correspond to a master and
- * channel id. The bit is one if the id is taken and 0 if free. For
- * every master there are 128 channel id's.
- */
-static struct pti_masterchannel *get_id(u8 *id_array,
-					int max_ids,
-					int base_id,
-					const char *thread_name)
-{
-	struct pti_masterchannel *mc;
-	int i, j, mask;
-
-	mc = kmalloc(sizeof(struct pti_masterchannel), GFP_KERNEL);
-	if (mc == NULL)
-		return NULL;
-
-	/* look for a byte with a free bit */
-	for (i = 0; i < max_ids; i++)
-		if (id_array[i] != 0xff)
-			break;
-	if (i == max_ids) {
-		kfree(mc);
-		return NULL;
-	}
-	/* find the bit in the 128 possible channel opportunities */
-	mask = 0x80;
-	for (j = 0; j < 8; j++) {
-		if ((id_array[i] & mask) == 0)
-			break;
-		mask >>= 1;
-	}
-
-	/* grab it */
-	id_array[i] |= mask;
-	mc->master  = base_id;
-	mc->channel = ((i & 0xf)<<3) + j;
-	/* write new master Id / channel Id allocation to channel control */
-	pti_control_frame_built_and_sent(mc, thread_name);
-	return mc;
-}
-
-/*
- * The following three functions:
- * pti_request_mastercahannel(), mipi_release_masterchannel()
- * and pti_writedata() are an API for other kernel drivers to
- * access PTI.
- */
-
-/**
- * pti_request_masterchannel()- Kernel API function used to allocate
- *				a master, channel ID address
- *				to write to PTI HW.
- *
- * @type:        0- request Application  master, channel aperture ID
- *                  write address.
- *               1- request OS master, channel aperture ID write
- *                  address.
- *               2- request Modem master, channel aperture ID
- *                  write address.
- *               Other values, error.
- * @thread_name: The thread name associated with the master / channel or
- *               'NULL' if using the 'current' global variable.
- *
- * Returns:
- *	pti_masterchannel struct
- *	0 for error
- */
-struct pti_masterchannel *pti_request_masterchannel(u8 type,
-						    const char *thread_name)
-{
-	struct pti_masterchannel *mc;
-
-	mutex_lock(&alloclock);
-
-	switch (type) {
-
-	case 0:
-		mc = get_id(drv_data->ia_app, MAX_APP_IDS,
-			    APP_BASE_ID, thread_name);
-		break;
-
-	case 1:
-		mc = get_id(drv_data->ia_os, MAX_OS_IDS,
-			    OS_BASE_ID, thread_name);
-		break;
-
-	case 2:
-		mc = get_id(drv_data->ia_modem, MAX_MODEM_IDS,
-			    MODEM_BASE_ID, thread_name);
-		break;
-	default:
-		mc = NULL;
-	}
-
-	mutex_unlock(&alloclock);
-	return mc;
-}
-EXPORT_SYMBOL_GPL(pti_request_masterchannel);
-
-/**
- * pti_release_masterchannel()- Kernel API function used to release
- *				a master, channel ID address
- *				used to write to PTI HW.
- *
- * @mc: master, channel apeture ID address to be released.  This
- *      will de-allocate the structure via kfree().
- */
-void pti_release_masterchannel(struct pti_masterchannel *mc)
-{
-	u8 master, channel, i;
-
-	mutex_lock(&alloclock);
-
-	if (mc) {
-		master = mc->master;
-		channel = mc->channel;
-
-		if (master == APP_BASE_ID) {
-			i = channel >> 3;
-			drv_data->ia_app[i] &=  ~(0x80>>(channel & 0x7));
-		} else if (master == OS_BASE_ID) {
-			i = channel >> 3;
-			drv_data->ia_os[i] &= ~(0x80>>(channel & 0x7));
-		} else {
-			i = channel >> 3;
-			drv_data->ia_modem[i] &= ~(0x80>>(channel & 0x7));
-		}
-
-		kfree(mc);
-	}
-
-	mutex_unlock(&alloclock);
-}
-EXPORT_SYMBOL_GPL(pti_release_masterchannel);
-
-/**
- * pti_writedata()- Kernel API function used to write trace
- *                  debugging data to PTI HW.
- *
- * @mc:    Master, channel aperture ID address to write to.
- *         Null value will return with no write occurring.
- * @buf:   Trace debuging data to write to the PTI HW.
- *         Null value will return with no write occurring.
- * @count: Size of buf. Value of 0 or a negative number will
- *         return with no write occuring.
- */
-void pti_writedata(struct pti_masterchannel *mc, u8 *buf, int count)
-{
-	/*
-	 * since this function is exported, this is treated like an
-	 * API function, thus, all parameters should
-	 * be checked for validity.
-	 */
-	if ((mc != NULL) && (buf != NULL) && (count > 0))
-		pti_write_to_aperture(mc, buf, count);
-	return;
-}
-EXPORT_SYMBOL_GPL(pti_writedata);
-
-/*
- * for the tty_driver_*() basic function descriptions, see tty_driver.h.
- * Specific header comments made for PTI-related specifics.
- */
-
-/**
- * pti_tty_driver_open()- Open an Application master, channel aperture
- * ID to the PTI device via tty device.
- *
- * @tty: tty interface.
- * @filp: filp interface pased to tty_port_open() call.
- *
- * Returns:
- *	int, 0 for success
- *	otherwise, fail value
- *
- * The main purpose of using the tty device interface is for
- * each tty port to have a unique PTI write aperture.  In an
- * example use case, ttyPTI0 gets syslogd and an APP aperture
- * ID and ttyPTI1 is where the n_tracesink ldisc hooks to route
- * modem messages into PTI.  Modem trace data does not have to
- * go to ttyPTI1, but ttyPTI0 and ttyPTI1 do need to be distinct
- * master IDs.  These messages go through the PTI HW and out of
- * the handheld platform and to the Fido/Lauterbach device.
- */
-static int pti_tty_driver_open(struct tty_struct *tty, struct file *filp)
-{
-	/*
-	 * we actually want to allocate a new channel per open, per
-	 * system arch.  HW gives more than plenty channels for a single
-	 * system task to have its own channel to write trace data. This
-	 * also removes a locking requirement for the actual write
-	 * procedure.
-	 */
-	return tty_port_open(tty->port, tty, filp);
-}
-
-/**
- * pti_tty_driver_close()- close tty device and release Application
- * master, channel aperture ID to the PTI device via tty device.
- *
- * @tty: tty interface.
- * @filp: filp interface pased to tty_port_close() call.
- *
- * The main purpose of using the tty device interface is to route
- * syslog daemon messages to the PTI HW and out of the handheld platform
- * and to the Fido/Lauterbach device.
- */
-static void pti_tty_driver_close(struct tty_struct *tty, struct file *filp)
-{
-	tty_port_close(tty->port, tty, filp);
-}
-
-/**
- * pti_tty_install()- Used to set up specific master-channels
- *		      to tty ports for organizational purposes when
- *		      tracing viewed from debuging tools.
- *
- * @driver: tty driver information.
- * @tty: tty struct containing pti information.
- *
- * Returns:
- *	0 for success
- *	otherwise, error
- */
-static int pti_tty_install(struct tty_driver *driver, struct tty_struct *tty)
-{
-	int idx = tty->index;
-	struct pti_tty *pti_tty_data;
-	int ret = tty_standard_install(driver, tty);
-
-	if (ret == 0) {
-		pti_tty_data = kmalloc(sizeof(struct pti_tty), GFP_KERNEL);
-		if (pti_tty_data == NULL)
-			return -ENOMEM;
-
-		if (idx == PTITTY_MINOR_START)
-			pti_tty_data->mc = pti_request_masterchannel(0, NULL);
-		else
-			pti_tty_data->mc = pti_request_masterchannel(2, NULL);
-
-		if (pti_tty_data->mc == NULL) {
-			kfree(pti_tty_data);
-			return -ENXIO;
-		}
-		tty->driver_data = pti_tty_data;
-	}
-
-	return ret;
-}
-
-/**
- * pti_tty_cleanup()- Used to de-allocate master-channel resources
- *		      tied to tty's of this driver.
- *
- * @tty: tty struct containing pti information.
- */
-static void pti_tty_cleanup(struct tty_struct *tty)
-{
-	struct pti_tty *pti_tty_data = tty->driver_data;
-	if (pti_tty_data == NULL)
-		return;
-	pti_release_masterchannel(pti_tty_data->mc);
-	kfree(pti_tty_data);
-	tty->driver_data = NULL;
-}
-
-/**
- * pti_tty_driver_write()-  Write trace debugging data through the char
- * interface to the PTI HW.  Part of the misc device implementation.
- *
- * @tty: tty struct containing pti information.
- * @buf: trace data to be written.
- * @len:  # of byte to write.
- *
- * Returns:
- *	int, # of bytes written
- *	otherwise, error
- */
-static int pti_tty_driver_write(struct tty_struct *tty,
-	const unsigned char *buf, int len)
-{
-	struct pti_tty *pti_tty_data = tty->driver_data;
-	if ((pti_tty_data != NULL) && (pti_tty_data->mc != NULL)) {
-		pti_write_to_aperture(pti_tty_data->mc, (u8 *)buf, len);
-		return len;
-	}
-	/*
-	 * we can't write to the pti hardware if the private driver_data
-	 * and the mc address is not there.
-	 */
-	else
-		return -EFAULT;
-}
-
-/**
- * pti_tty_write_room()- Always returns 2048.
- *
- * @tty: contains tty info of the pti driver.
- */
-static int pti_tty_write_room(struct tty_struct *tty)
-{
-	return 2048;
-}
-
-/**
- * pti_char_open()- Open an Application master, channel aperture
- * ID to the PTI device. Part of the misc device implementation.
- *
- * @inode: not used.
- * @filp:  Output- will have a masterchannel struct set containing
- *                 the allocated application PTI aperture write address.
- *
- * Returns:
- *	int, 0 for success
- *	otherwise, a fail value
- */
-static int pti_char_open(struct inode *inode, struct file *filp)
-{
-	struct pti_masterchannel *mc;
-
-	/*
-	 * We really do want to fail immediately if
-	 * pti_request_masterchannel() fails,
-	 * before assigning the value to filp->private_data.
-	 * Slightly easier to debug if this driver needs debugging.
-	 */
-	mc = pti_request_masterchannel(0, NULL);
-	if (mc == NULL)
-		return -ENOMEM;
-	filp->private_data = mc;
-	return 0;
-}
-
-/**
- * pti_char_release()-  Close a char channel to the PTI device. Part
- * of the misc device implementation.
- *
- * @inode: Not used in this implementaiton.
- * @filp:  Contains private_data that contains the master, channel
- *         ID to be released by the PTI device.
- *
- * Returns:
- *	always 0
- */
-static int pti_char_release(struct inode *inode, struct file *filp)
-{
-	pti_release_masterchannel(filp->private_data);
-	filp->private_data = NULL;
-	return 0;
-}
-
-/**
- * pti_char_write()-  Write trace debugging data through the char
- * interface to the PTI HW.  Part of the misc device implementation.
- *
- * @filp:  Contains private data which is used to obtain
- *         master, channel write ID.
- * @data:  trace data to be written.
- * @len:   # of byte to write.
- * @ppose: Not used in this function implementation.
- *
- * Returns:
- *	int, # of bytes written
- *	otherwise, error value
- *
- * Notes: From side discussions with Alan Cox and experimenting
- * with PTI debug HW like Nokia's Fido box and Lauterbach
- * devices, 8192 byte write buffer used by USER_COPY_SIZE was
- * deemed an appropriate size for this type of usage with
- * debugging HW.
- */
-static ssize_t pti_char_write(struct file *filp, const char __user *data,
-			      size_t len, loff_t *ppose)
-{
-	struct pti_masterchannel *mc;
-	void *kbuf;
-	const char __user *tmp;
-	size_t size = USER_COPY_SIZE;
-	size_t n = 0;
-
-	tmp = data;
-	mc = filp->private_data;
-
-	kbuf = kmalloc(size, GFP_KERNEL);
-	if (kbuf == NULL)  {
-		pr_err("%s(%d): buf allocation failed\n",
-			__func__, __LINE__);
-		return -ENOMEM;
-	}
-
-	do {
-		if (len - n > USER_COPY_SIZE)
-			size = USER_COPY_SIZE;
-		else
-			size = len - n;
-
-		if (copy_from_user(kbuf, tmp, size)) {
-			kfree(kbuf);
-			return n ? n : -EFAULT;
-		}
-
-		pti_write_to_aperture(mc, kbuf, size);
-		n  += size;
-		tmp += size;
-
-	} while (len > n);
-
-	kfree(kbuf);
-	return len;
-}
-
-static const struct tty_operations pti_tty_driver_ops = {
-	.open		= pti_tty_driver_open,
-	.close		= pti_tty_driver_close,
-	.write		= pti_tty_driver_write,
-	.write_room	= pti_tty_write_room,
-	.install	= pti_tty_install,
-	.cleanup	= pti_tty_cleanup
-};
-
-static const struct file_operations pti_char_driver_ops = {
-	.owner		= THIS_MODULE,
-	.write		= pti_char_write,
-	.open		= pti_char_open,
-	.release	= pti_char_release,
-};
-
-static struct miscdevice pti_char_driver = {
-	.minor		= MISC_DYNAMIC_MINOR,
-	.name		= CHARNAME,
-	.fops		= &pti_char_driver_ops
-};
-
-/**
- * pti_console_write()-  Write to the console that has been acquired.
- *
- * @c:   Not used in this implementaiton.
- * @buf: Data to be written.
- * @len: Length of buf.
- */
-static void pti_console_write(struct console *c, const char *buf, unsigned len)
-{
-	static struct pti_masterchannel mc = {.master  = CONSOLE_ID,
-					      .channel = 0};
-
-	mc.channel = pti_console_channel;
-	pti_console_channel = (pti_console_channel + 1) & 0x7f;
-
-	pti_write_full_frame_to_aperture(&mc, buf, len);
-}
-
-/**
- * pti_console_device()-  Return the driver tty structure and set the
- *			  associated index implementation.
- *
- * @c:     Console device of the driver.
- * @index: index associated with c.
- *
- * Returns:
- *	always value of pti_tty_driver structure when this function
- *	is called.
- */
-static struct tty_driver *pti_console_device(struct console *c, int *index)
-{
-	*index = c->index;
-	return pti_tty_driver;
-}
-
-/**
- * pti_console_setup()-  Initialize console variables used by the driver.
- *
- * @c:     Not used.
- * @opts:  Not used.
- *
- * Returns:
- *	always 0.
- */
-static int pti_console_setup(struct console *c, char *opts)
-{
-	pti_console_channel = 0;
-	pti_control_channel = 0;
-	return 0;
-}
-
-/*
- * pti_console struct, used to capture OS printk()'s and shift
- * out to the PTI device for debugging.  This cannot be
- * enabled upon boot because of the possibility of eating
- * any serial console printk's (race condition discovered).
- * The console should be enabled upon when the tty port is
- * used for the first time.  Since the primary purpose for
- * the tty port is to hook up syslog to it, the tty port
- * will be open for a really long time.
- */
-static struct console pti_console = {
-	.name		= TTYNAME,
-	.write		= pti_console_write,
-	.device		= pti_console_device,
-	.setup		= pti_console_setup,
-	.flags		= CON_PRINTBUFFER,
-	.index		= 0,
-};
-
-/**
- * pti_port_activate()- Used to start/initialize any items upon
- * first opening of tty_port().
- *
- * @port: The tty port number of the PTI device.
- * @tty:  The tty struct associated with this device.
- *
- * Returns:
- *	always returns 0
- *
- * Notes: The primary purpose of the PTI tty port 0 is to hook
- * the syslog daemon to it; thus this port will be open for a
- * very long time.
- */
-static int pti_port_activate(struct tty_port *port, struct tty_struct *tty)
-{
-	if (port->tty->index == PTITTY_MINOR_START)
-		console_start(&pti_console);
-	return 0;
-}
-
-/**
- * pti_port_shutdown()- Used to stop/shutdown any items upon the
- * last tty port close.
- *
- * @port: The tty port number of the PTI device.
- *
- * Notes: The primary purpose of the PTI tty port 0 is to hook
- * the syslog daemon to it; thus this port will be open for a
- * very long time.
- */
-static void pti_port_shutdown(struct tty_port *port)
-{
-	if (port->tty->index == PTITTY_MINOR_START)
-		console_stop(&pti_console);
-}
-
-static const struct tty_port_operations tty_port_ops = {
-	.activate = pti_port_activate,
-	.shutdown = pti_port_shutdown,
-};
-
-/*
- * Note the _probe() call sets everything up and ties the char and tty
- * to successfully detecting the PTI device on the pci bus.
- */
-
-/**
- * pti_pci_probe()- Used to detect pti on the pci bus and set
- *		    things up in the driver.
- *
- * @pdev: pci_dev struct values for pti.
- * @ent:  pci_device_id struct for pti driver.
- *
- * Returns:
- *	0 for success
- *	otherwise, error
- */
-static int pti_pci_probe(struct pci_dev *pdev,
-		const struct pci_device_id *ent)
-{
-	unsigned int a;
-	int retval;
-	int pci_bar = 1;
-
-	dev_dbg(&pdev->dev, "%s %s(%d): PTI PCI ID %04x:%04x\n", __FILE__,
-			__func__, __LINE__, pdev->vendor, pdev->device);
-
-	retval = misc_register(&pti_char_driver);
-	if (retval) {
-		pr_err("%s(%d): CHAR registration failed of pti driver\n",
-			__func__, __LINE__);
-		pr_err("%s(%d): Error value returned: %d\n",
-			__func__, __LINE__, retval);
-		goto err;
-	}
-
-	retval = pci_enable_device(pdev);
-	if (retval != 0) {
-		dev_err(&pdev->dev,
-			"%s: pci_enable_device() returned error %d\n",
-			__func__, retval);
-		goto err_unreg_misc;
-	}
-
-	drv_data = kzalloc(sizeof(*drv_data), GFP_KERNEL);
-	if (drv_data == NULL) {
-		retval = -ENOMEM;
-		dev_err(&pdev->dev,
-			"%s(%d): kmalloc() returned NULL memory.\n",
-			__func__, __LINE__);
-		goto err_disable_pci;
-	}
-	drv_data->pti_addr = pci_resource_start(pdev, pci_bar);
-
-	retval = pci_request_region(pdev, pci_bar, dev_name(&pdev->dev));
-	if (retval != 0) {
-		dev_err(&pdev->dev,
-			"%s(%d): pci_request_region() returned error %d\n",
-			__func__, __LINE__, retval);
-		goto err_free_dd;
-	}
-	drv_data->aperture_base = drv_data->pti_addr+APERTURE_14;
-	drv_data->pti_ioaddr =
-		ioremap((u32)drv_data->aperture_base,
-		APERTURE_LEN);
-	if (!drv_data->pti_ioaddr) {
-		retval = -ENOMEM;
-		goto err_rel_reg;
-	}
-
-	pci_set_drvdata(pdev, drv_data);
-
-	for (a = 0; a < PTITTY_MINOR_NUM; a++) {
-		struct tty_port *port = &drv_data->port[a];
-		tty_port_init(port);
-		port->ops = &tty_port_ops;
-
-		tty_port_register_device(port, pti_tty_driver, a, &pdev->dev);
-	}
-
-	register_console(&pti_console);
-
-	return 0;
-err_rel_reg:
-	pci_release_region(pdev, pci_bar);
-err_free_dd:
-	kfree(drv_data);
-err_disable_pci:
-	pci_disable_device(pdev);
-err_unreg_misc:
-	misc_deregister(&pti_char_driver);
-err:
-	return retval;
-}
-
-/**
- * pti_pci_remove()- Driver exit method to remove PTI from
- *		   PCI bus.
- * @pdev: variable containing pci info of PTI.
- */
-static void pti_pci_remove(struct pci_dev *pdev)
-{
-	struct pti_dev *drv_data = pci_get_drvdata(pdev);
-	unsigned int a;
-
-	unregister_console(&pti_console);
-
-	for (a = 0; a < PTITTY_MINOR_NUM; a++) {
-		tty_unregister_device(pti_tty_driver, a);
-		tty_port_destroy(&drv_data->port[a]);
-	}
-
-	iounmap(drv_data->pti_ioaddr);
-	kfree(drv_data);
-	pci_release_region(pdev, 1);
-	pci_disable_device(pdev);
-
-	misc_deregister(&pti_char_driver);
-}
-
-static struct pci_driver pti_pci_driver = {
-	.name		= PCINAME,
-	.id_table	= pci_ids,
-	.probe		= pti_pci_probe,
-	.remove		= pti_pci_remove,
-};
-
-/**
- * pti_init()- Overall entry/init call to the pti driver.
- *             It starts the registration process with the kernel.
- *
- * Returns:
- *	int __init, 0 for success
- *	otherwise value is an error
- *
- */
-static int __init pti_init(void)
-{
-	int retval;
-
-	/* First register module as tty device */
-
-	pti_tty_driver = alloc_tty_driver(PTITTY_MINOR_NUM);
-	if (pti_tty_driver == NULL) {
-		pr_err("%s(%d): Memory allocation failed for ptiTTY driver\n",
-			__func__, __LINE__);
-		return -ENOMEM;
-	}
-
-	pti_tty_driver->driver_name		= DRIVERNAME;
-	pti_tty_driver->name			= TTYNAME;
-	pti_tty_driver->major			= 0;
-	pti_tty_driver->minor_start		= PTITTY_MINOR_START;
-	pti_tty_driver->type			= TTY_DRIVER_TYPE_SYSTEM;
-	pti_tty_driver->subtype			= SYSTEM_TYPE_SYSCONS;
-	pti_tty_driver->flags			= TTY_DRIVER_REAL_RAW |
-						  TTY_DRIVER_DYNAMIC_DEV;
-	pti_tty_driver->init_termios		= tty_std_termios;
-
-	tty_set_operations(pti_tty_driver, &pti_tty_driver_ops);
-
-	retval = tty_register_driver(pti_tty_driver);
-	if (retval) {
-		pr_err("%s(%d): TTY registration failed of pti driver\n",
-			__func__, __LINE__);
-		pr_err("%s(%d): Error value returned: %d\n",
-			__func__, __LINE__, retval);
-
-		goto put_tty;
-	}
-
-	retval = pci_register_driver(&pti_pci_driver);
-	if (retval) {
-		pr_err("%s(%d): PCI registration failed of pti driver\n",
-			__func__, __LINE__);
-		pr_err("%s(%d): Error value returned: %d\n",
-			__func__, __LINE__, retval);
-		goto unreg_tty;
-	}
-
-	return 0;
-unreg_tty:
-	tty_unregister_driver(pti_tty_driver);
-put_tty:
-	put_tty_driver(pti_tty_driver);
-	pti_tty_driver = NULL;
-	return retval;
-}
-
-/**
- * pti_exit()- Unregisters this module as a tty and pci driver.
- */
-static void __exit pti_exit(void)
-{
-	tty_unregister_driver(pti_tty_driver);
-	pci_unregister_driver(&pti_pci_driver);
-	put_tty_driver(pti_tty_driver);
-}
-
-module_init(pti_init);
-module_exit(pti_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Ken Mills, Jay Freyensee");
-MODULE_DESCRIPTION("PTI Driver");
-
diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile
index b3ccae932660..730de6bf048b 100644
--- a/drivers/tty/Makefile
+++ b/drivers/tty/Makefile
@@ -9,8 +9,6 @@ obj-$(CONFIG_AUDIT)		+= tty_audit.o
 obj-$(CONFIG_MAGIC_SYSRQ)	+= sysrq.o
 obj-$(CONFIG_N_HDLC)		+= n_hdlc.o
 obj-$(CONFIG_N_GSM)		+= n_gsm.o
-obj-$(CONFIG_TRACE_ROUTER)	+= n_tracerouter.o
-obj-$(CONFIG_TRACE_SINK)	+= n_tracesink.o
 obj-$(CONFIG_R3964)		+= n_r3964.o
 
 obj-y				+= vt/
diff --git a/drivers/tty/n_tracerouter.c b/drivers/tty/n_tracerouter.c
deleted file mode 100644
index 4479af4d2fa5..000000000000
--- a/drivers/tty/n_tracerouter.c
+++ /dev/null
@@ -1,233 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *  n_tracerouter.c - Trace data router through tty space
- *
- *  Copyright (C) Intel 2011
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This trace router uses the Linux line discipline framework to route
- * trace data coming from a HW Modem to a PTI (Parallel Trace Module) port.
- * The solution is not specific to a HW modem and this line disciple can
- * be used to route any stream of data in kernel space.
- * This is part of a solution for the P1149.7, compact JTAG, standard.
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/ioctl.h>
-#include <linux/tty.h>
-#include <linux/tty_ldisc.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <linux/bug.h>
-#include "n_tracesink.h"
-
-/*
- * Other ldisc drivers use 65536 which basically means,
- * 'I can always accept 64k' and flow control is off.
- * This number is deemed appropriate for this driver.
- */
-#define RECEIVE_ROOM	65536
-#define DRIVERNAME	"n_tracerouter"
-
-/*
- * struct to hold private configuration data for this ldisc.
- * opencalled is used to hold if this ldisc has been opened.
- * kref_tty holds the tty reference the ldisc sits on top of.
- */
-struct tracerouter_data {
-	u8 opencalled;
-	struct tty_struct *kref_tty;
-};
-static struct tracerouter_data *tr_data;
-
-/* lock for when tty reference is being used */
-static DEFINE_MUTEX(routelock);
-
-/**
- * n_tracerouter_open() - Called when a tty is opened by a SW entity.
- * @tty: terminal device to the ldisc.
- *
- * Return:
- *      0 for success.
- *
- * Caveats: This should only be opened one time per SW entity.
- */
-static int n_tracerouter_open(struct tty_struct *tty)
-{
-	int retval = -EEXIST;
-
-	mutex_lock(&routelock);
-	if (tr_data->opencalled == 0) {
-
-		tr_data->kref_tty = tty_kref_get(tty);
-		if (tr_data->kref_tty == NULL) {
-			retval = -EFAULT;
-		} else {
-			tr_data->opencalled = 1;
-			tty->disc_data      = tr_data;
-			tty->receive_room   = RECEIVE_ROOM;
-			tty_driver_flush_buffer(tty);
-			retval = 0;
-		}
-	}
-	mutex_unlock(&routelock);
-	return retval;
-}
-
-/**
- * n_tracerouter_close() - close connection
- * @tty: terminal device to the ldisc.
- *
- * Called when a software entity wants to close a connection.
- */
-static void n_tracerouter_close(struct tty_struct *tty)
-{
-	struct tracerouter_data *tptr = tty->disc_data;
-
-	mutex_lock(&routelock);
-	WARN_ON(tptr->kref_tty != tr_data->kref_tty);
-	tty_driver_flush_buffer(tty);
-	tty_kref_put(tr_data->kref_tty);
-	tr_data->kref_tty = NULL;
-	tr_data->opencalled = 0;
-	tty->disc_data = NULL;
-	mutex_unlock(&routelock);
-}
-
-/**
- * n_tracerouter_read() - read request from user space
- * @tty:  terminal device passed into the ldisc.
- * @file: pointer to open file object.
- * @buf:  pointer to the data buffer that gets eventually returned.
- * @nr:   number of bytes of the data buffer that is returned.
- *
- * function that allows read() functionality in userspace. By default if this
- * is not implemented it returns -EIO. This module is functioning like a
- * router via n_tracerouter_receivebuf(), and there is no real requirement
- * to implement this function. However, an error return value other than
- * -EIO should be used just to show that there was an intent not to have
- * this function implemented.  Return value based on read() man pages.
- *
- * Return:
- *	 -EINVAL
- */
-static ssize_t n_tracerouter_read(struct tty_struct *tty, struct file *file,
-				  unsigned char __user *buf, size_t nr) {
-	return -EINVAL;
-}
-
-/**
- * n_tracerouter_write() - Function that allows write() in userspace.
- * @tty:  terminal device passed into the ldisc.
- * @file: pointer to open file object.
- * @buf:  pointer to the data buffer that gets eventually returned.
- * @nr:   number of bytes of the data buffer that is returned.
- *
- * By default if this is not implemented, it returns -EIO.
- * This should not be implemented, ever, because
- * 1. this driver is functioning like a router via
- *    n_tracerouter_receivebuf()
- * 2. No writes to HW will ever go through this line discpline driver.
- * However, an error return value other than -EIO should be used
- * just to show that there was an intent not to have this function
- * implemented.  Return value based on write() man pages.
- *
- * Return:
- *	-EINVAL
- */
-static ssize_t n_tracerouter_write(struct tty_struct *tty, struct file *file,
-				   const unsigned char *buf, size_t nr) {
-	return -EINVAL;
-}
-
-/**
- * n_tracerouter_receivebuf() - Routing function for driver.
- * @tty: terminal device passed into the ldisc.  It's assumed
- *       tty will never be NULL.
- * @cp:  buffer, block of characters to be eventually read by
- *       someone, somewhere (user read() call or some kernel function).
- * @fp:  flag buffer.
- * @count: number of characters (aka, bytes) in cp.
- *
- * This function takes the input buffer, cp, and passes it to
- * an external API function for processing.
- */
-static void n_tracerouter_receivebuf(struct tty_struct *tty,
-					const unsigned char *cp,
-					char *fp, int count)
-{
-	mutex_lock(&routelock);
-	n_tracesink_datadrain((u8 *) cp, count);
-	mutex_unlock(&routelock);
-}
-
-/*
- * Flush buffer is not impelemented as the ldisc has no internal buffering
- * so the tty_driver_flush_buffer() is sufficient for this driver's needs.
- */
-
-static struct tty_ldisc_ops tty_ptirouter_ldisc = {
-	.owner		= THIS_MODULE,
-	.magic		= TTY_LDISC_MAGIC,
-	.name		= DRIVERNAME,
-	.open		= n_tracerouter_open,
-	.close		= n_tracerouter_close,
-	.read		= n_tracerouter_read,
-	.write		= n_tracerouter_write,
-	.receive_buf	= n_tracerouter_receivebuf
-};
-
-/**
- * n_tracerouter_init -	module initialisation
- *
- * Registers this module as a line discipline driver.
- *
- * Return:
- *	0 for success, any other value error.
- */
-static int __init n_tracerouter_init(void)
-{
-	int retval;
-
-	tr_data = kzalloc(sizeof(struct tracerouter_data), GFP_KERNEL);
-	if (tr_data == NULL)
-		return -ENOMEM;
-
-
-	/* Note N_TRACEROUTER is defined in linux/tty.h */
-	retval = tty_register_ldisc(N_TRACEROUTER, &tty_ptirouter_ldisc);
-	if (retval < 0) {
-		pr_err("%s: Registration failed: %d\n", __func__, retval);
-		kfree(tr_data);
-	}
-	return retval;
-}
-
-/**
- * n_tracerouter_exit -	module unload
- *
- * Removes this module as a line discipline driver.
- */
-static void __exit n_tracerouter_exit(void)
-{
-	int retval = tty_unregister_ldisc(N_TRACEROUTER);
-
-	if (retval < 0)
-		pr_err("%s: Unregistration failed: %d\n", __func__,  retval);
-	else
-		kfree(tr_data);
-}
-
-module_init(n_tracerouter_init);
-module_exit(n_tracerouter_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jay Freyensee");
-MODULE_ALIAS_LDISC(N_TRACEROUTER);
-MODULE_DESCRIPTION("Trace router ldisc driver");
diff --git a/drivers/tty/n_tracesink.c b/drivers/tty/n_tracesink.c
deleted file mode 100644
index d96ba82cc356..000000000000
--- a/drivers/tty/n_tracesink.c
+++ /dev/null
@@ -1,228 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *  n_tracesink.c - Trace data router and sink path through tty space.
- *
- *  Copyright (C) Intel 2011
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * The trace sink uses the Linux line discipline framework to receive
- * trace data coming from the PTI source line discipline driver
- * to a user-desired tty port, like USB.
- * This is to provide a way to extract modem trace data on
- * devices that do not have a PTI HW module, or just need modem
- * trace data to come out of a different HW output port.
- * This is part of a solution for the P1149.7, compact JTAG, standard.
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/ioctl.h>
-#include <linux/tty.h>
-#include <linux/tty_ldisc.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/bug.h>
-#include "n_tracesink.h"
-
-/*
- * Other ldisc drivers use 65536 which basically means,
- * 'I can always accept 64k' and flow control is off.
- * This number is deemed appropriate for this driver.
- */
-#define RECEIVE_ROOM	65536
-#define DRIVERNAME	"n_tracesink"
-
-/*
- * there is a quirk with this ldisc is he can write data
- * to a tty from anyone calling his kernel API, which
- * meets customer requirements in the drivers/misc/pti.c
- * project.  So he needs to know when he can and cannot write when
- * the API is called. In theory, the API can be called
- * after an init() but before a successful open() which
- * would crash the system if tty is not checked.
- */
-static struct tty_struct *this_tty;
-static DEFINE_MUTEX(writelock);
-
-/**
- * n_tracesink_open() - Called when a tty is opened by a SW entity.
- * @tty: terminal device to the ldisc.
- *
- * Return:
- *      0 for success,
- *      -EFAULT = couldn't get a tty kref n_tracesink will sit
- *       on top of
- *      -EEXIST = open() called successfully once and it cannot
- *      be called again.
- *
- * Caveats: open() should only be successful the first time a
- * SW entity calls it.
- */
-static int n_tracesink_open(struct tty_struct *tty)
-{
-	int retval = -EEXIST;
-
-	mutex_lock(&writelock);
-	if (this_tty == NULL) {
-		this_tty = tty_kref_get(tty);
-		if (this_tty == NULL) {
-			retval = -EFAULT;
-		} else {
-			tty->disc_data = this_tty;
-			tty_driver_flush_buffer(tty);
-			retval = 0;
-		}
-	}
-	mutex_unlock(&writelock);
-
-	return retval;
-}
-
-/**
- * n_tracesink_close() - close connection
- * @tty: terminal device to the ldisc.
- *
- * Called when a software entity wants to close a connection.
- */
-static void n_tracesink_close(struct tty_struct *tty)
-{
-	mutex_lock(&writelock);
-	tty_driver_flush_buffer(tty);
-	tty_kref_put(this_tty);
-	this_tty = NULL;
-	tty->disc_data = NULL;
-	mutex_unlock(&writelock);
-}
-
-/**
- * n_tracesink_read() - read request from user space
- * @tty:  terminal device passed into the ldisc.
- * @file: pointer to open file object.
- * @buf:  pointer to the data buffer that gets eventually returned.
- * @nr:   number of bytes of the data buffer that is returned.
- *
- * function that allows read() functionality in userspace. By default if this
- * is not implemented it returns -EIO. This module is functioning like a
- * router via n_tracesink_receivebuf(), and there is no real requirement
- * to implement this function. However, an error return value other than
- * -EIO should be used just to show that there was an intent not to have
- * this function implemented.  Return value based on read() man pages.
- *
- * Return:
- *	 -EINVAL
- */
-static ssize_t n_tracesink_read(struct tty_struct *tty, struct file *file,
-				unsigned char __user *buf, size_t nr) {
-	return -EINVAL;
-}
-
-/**
- * n_tracesink_write() - Function that allows write() in userspace.
- * @tty:  terminal device passed into the ldisc.
- * @file: pointer to open file object.
- * @buf:  pointer to the data buffer that gets eventually returned.
- * @nr:   number of bytes of the data buffer that is returned.
- *
- * By default if this is not implemented, it returns -EIO.
- * This should not be implemented, ever, because
- * 1. this driver is functioning like a router via
- *    n_tracesink_receivebuf()
- * 2. No writes to HW will ever go through this line discpline driver.
- * However, an error return value other than -EIO should be used
- * just to show that there was an intent not to have this function
- * implemented.  Return value based on write() man pages.
- *
- * Return:
- *	-EINVAL
- */
-static ssize_t n_tracesink_write(struct tty_struct *tty, struct file *file,
-				 const unsigned char *buf, size_t nr) {
-	return -EINVAL;
-}
-
-/**
- * n_tracesink_datadrain() - Kernel API function used to route
- *			     trace debugging data to user-defined
- *			     port like USB.
- *
- * @buf:   Trace debuging data buffer to write to tty target
- *         port. Null value will return with no write occurring.
- * @count: Size of buf. Value of 0 or a negative number will
- *         return with no write occuring.
- *
- * Caveat: If this line discipline does not set the tty it sits
- * on top of via an open() call, this API function will not
- * call the tty's write() call because it will have no pointer
- * to call the write().
- */
-void n_tracesink_datadrain(u8 *buf, int count)
-{
-	mutex_lock(&writelock);
-
-	if ((buf != NULL) && (count > 0) && (this_tty != NULL))
-		this_tty->ops->write(this_tty, buf, count);
-
-	mutex_unlock(&writelock);
-}
-EXPORT_SYMBOL_GPL(n_tracesink_datadrain);
-
-/*
- * Flush buffer is not impelemented as the ldisc has no internal buffering
- * so the tty_driver_flush_buffer() is sufficient for this driver's needs.
- */
-
-/*
- * tty_ldisc function operations for this driver.
- */
-static struct tty_ldisc_ops tty_n_tracesink = {
-	.owner		= THIS_MODULE,
-	.magic		= TTY_LDISC_MAGIC,
-	.name		= DRIVERNAME,
-	.open		= n_tracesink_open,
-	.close		= n_tracesink_close,
-	.read		= n_tracesink_read,
-	.write		= n_tracesink_write
-};
-
-/**
- * n_tracesink_init-	module initialisation
- *
- * Registers this module as a line discipline driver.
- *
- * Return:
- *	0 for success, any other value error.
- */
-static int __init n_tracesink_init(void)
-{
-	/* Note N_TRACESINK is defined in linux/tty.h */
-	int retval = tty_register_ldisc(N_TRACESINK, &tty_n_tracesink);
-
-	if (retval < 0)
-		pr_err("%s: Registration failed: %d\n", __func__, retval);
-
-	return retval;
-}
-
-/**
- * n_tracesink_exit -	module unload
- *
- * Removes this module as a line discipline driver.
- */
-static void __exit n_tracesink_exit(void)
-{
-	int retval = tty_unregister_ldisc(N_TRACESINK);
-
-	if (retval < 0)
-		pr_err("%s: Unregistration failed: %d\n", __func__,  retval);
-}
-
-module_init(n_tracesink_init);
-module_exit(n_tracesink_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jay Freyensee");
-MODULE_ALIAS_LDISC(N_TRACESINK);
-MODULE_DESCRIPTION("Trace sink ldisc driver");
diff --git a/drivers/tty/n_tracesink.h b/drivers/tty/n_tracesink.h
deleted file mode 100644
index 7031d515a700..000000000000
--- a/drivers/tty/n_tracesink.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *  n_tracesink.h - Kernel driver API to route trace data in kernel space.
- *
- *  Copyright (C) Intel 2011
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * The PTI (Parallel Trace Interface) driver directs trace data routed from
- * various parts in the system out through the Intel Penwell PTI port and
- * out of the mobile device for analysis with a debugging tool
- * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7,
- * compact JTAG, standard.
- *
- * This header file is used by n_tracerouter to be able to send the
- * data of it's tty port to the tty port this module sits.  This
- * mechanism can also be used independent of the PTI module.
- *
- */
-
-#ifndef N_TRACESINK_H_
-#define N_TRACESINK_H_
-
-void n_tracesink_datadrain(u8 *buf, int count);
-
-#endif
diff --git a/include/linux/intel-pti.h b/include/linux/intel-pti.h
deleted file mode 100644
index fcd841a90f2f..000000000000
--- a/include/linux/intel-pti.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- *  Copyright (C) Intel 2011
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * The PTI (Parallel Trace Interface) driver directs trace data routed from
- * various parts in the system out through the Intel Penwell PTI port and
- * out of the mobile device for analysis with a debugging tool
- * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7,
- * compact JTAG, standard.
- *
- * This header file will allow other parts of the OS to use the
- * interface to write out it's contents for debugging a mobile system.
- */
-
-#ifndef LINUX_INTEL_PTI_H_
-#define LINUX_INTEL_PTI_H_
-
-/* offset for last dword of any PTI message. Part of MIPI P1149.7 */
-#define PTI_LASTDWORD_DTS	0x30
-
-/* basic structure used as a write address to the PTI HW */
-struct pti_masterchannel {
-	u8 master;
-	u8 channel;
-};
-
-/* the following functions are defined in misc/pti.c */
-void pti_writedata(struct pti_masterchannel *mc, u8 *buf, int count);
-struct pti_masterchannel *pti_request_masterchannel(u8 type,
-						    const char *thread_name);
-void pti_release_masterchannel(struct pti_masterchannel *mc);
-
-#endif /* LINUX_INTEL_PTI_H_ */
-- 
cgit v1.2.3


From 8544717cdacc2f33f0f53a3b34c5125b37e13ce9 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Thu, 14 Jan 2021 19:07:48 +0200
Subject: bus: fsl-mc: move fsl_mc_command struct in a uapi header

Define "struct fsl_mc_command" as a structure that can cross the
user/kernel boundary.

Acked-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Link: https://lore.kernel.org/r/20210114170752.2927915-2-ciorneiioana@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS                 |  1 +
 include/linux/fsl/mc.h      |  8 +-------
 include/uapi/linux/fsl_mc.h | 25 +++++++++++++++++++++++++
 3 files changed, 27 insertions(+), 7 deletions(-)
 create mode 100644 include/uapi/linux/fsl_mc.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 23273a59da6f..128455811980 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14652,6 +14652,7 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt
 F:	Documentation/networking/device_drivers/ethernet/freescale/dpaa2/overview.rst
 F:	drivers/bus/fsl-mc/
+F:	include/uapi/linux/fsl_mc.h
 
 QT1010 MEDIA DRIVER
 M:	Antti Palosaari <crope@iki.fi>
diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h
index db244874e834..63b56aba925a 100644
--- a/include/linux/fsl/mc.h
+++ b/include/linux/fsl/mc.h
@@ -13,6 +13,7 @@
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
 #include <linux/interrupt.h>
+#include <uapi/linux/fsl_mc.h>
 
 #define FSL_MC_VENDOR_FREESCALE	0x1957
 
@@ -209,8 +210,6 @@ struct fsl_mc_device {
 #define to_fsl_mc_device(_dev) \
 	container_of(_dev, struct fsl_mc_device, dev)
 
-#define MC_CMD_NUM_OF_PARAMS	7
-
 struct mc_cmd_header {
 	u8 src_id;
 	u8 flags_hw;
@@ -220,11 +219,6 @@ struct mc_cmd_header {
 	__le16 cmd_id;
 };
 
-struct fsl_mc_command {
-	__le64 header;
-	__le64 params[MC_CMD_NUM_OF_PARAMS];
-};
-
 enum mc_cmd_status {
 	MC_CMD_STATUS_OK = 0x0, /* Completed successfully */
 	MC_CMD_STATUS_READY = 0x1, /* Ready to be processed */
diff --git a/include/uapi/linux/fsl_mc.h b/include/uapi/linux/fsl_mc.h
new file mode 100644
index 000000000000..cf56d46f052e
--- /dev/null
+++ b/include/uapi/linux/fsl_mc.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Management Complex (MC) userspace public interface
+ *
+ * Copyright 2021 NXP
+ *
+ */
+#ifndef _UAPI_FSL_MC_H_
+#define _UAPI_FSL_MC_H_
+
+#include <linux/types.h>
+
+#define MC_CMD_NUM_OF_PARAMS	7
+
+/**
+ * struct fsl_mc_command - Management Complex (MC) command structure
+ * @header: MC command header
+ * @params: MC command parameters
+ */
+struct fsl_mc_command {
+	__le64 header;
+	__le64 params[MC_CMD_NUM_OF_PARAMS];
+};
+
+#endif /* _UAPI_FSL_MC_H_ */
-- 
cgit v1.2.3


From 2cf1e703f066cfa82eb5a358ae84c29fe15a3b3a Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Thu, 14 Jan 2021 19:07:50 +0200
Subject: bus: fsl-mc: add fsl-mc userspace support

Adding userspace support for the MC (Management Complex) means exporting
an ioctl capable device file representing the root resource container.

This new functionality in the fsl-mc bus driver intends to provide
userspace applications an interface to interact with the MC firmware.

Commands that are composed in userspace are sent to the MC firmware
through the FSL_MC_SEND_MC_COMMAND ioctl.  By default the implicit MC
I/O portal is used for this operation, but if the implicit one is busy,
a dynamic portal is allocated and then freed upon execution.

The command received through the ioctl interface is checked against a
known whitelist of accepted MC commands. Commands that attempt a change
in hardware configuration will need CAP_NET_ADMIN, while commands used
in debugging do not need it.

Acked-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Link: https://lore.kernel.org/r/20210114170752.2927915-4-ciorneiioana@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |   1 +
 drivers/bus/fsl-mc/Kconfig                         |   7 +
 drivers/bus/fsl-mc/Makefile                        |   3 +
 drivers/bus/fsl-mc/dprc-driver.c                   |  12 +
 drivers/bus/fsl-mc/fsl-mc-private.h                |  39 ++
 drivers/bus/fsl-mc/fsl-mc-uapi.c                   | 547 +++++++++++++++++++++
 include/uapi/linux/fsl_mc.h                        |   9 +
 7 files changed, 618 insertions(+)
 create mode 100644 drivers/bus/fsl-mc/fsl-mc-uapi.c

(limited to 'include')

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index a4c75a28c839..19d730f9d136 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -180,6 +180,7 @@ Code  Seq#    Include File                                           Comments
 'R'   00-1F  linux/random.h                                          conflict!
 'R'   01     linux/rfkill.h                                          conflict!
 'R'   C0-DF  net/bluetooth/rfcomm.h
+'R'   E0     uapi/linux/fsl_mc.h
 'S'   all    linux/cdrom.h                                           conflict!
 'S'   80-81  scsi/scsi_ioctl.h                                       conflict!
 'S'   82-FF  scsi/scsi.h                                             conflict!
diff --git a/drivers/bus/fsl-mc/Kconfig b/drivers/bus/fsl-mc/Kconfig
index c23c77c9b705..b1fd55901c50 100644
--- a/drivers/bus/fsl-mc/Kconfig
+++ b/drivers/bus/fsl-mc/Kconfig
@@ -14,3 +14,10 @@ config FSL_MC_BUS
 	  architecture.  The fsl-mc bus driver handles discovery of
 	  DPAA2 objects (which are represented as Linux devices) and
 	  binding objects to drivers.
+
+config FSL_MC_UAPI_SUPPORT
+	bool "Management Complex (MC) userspace support"
+	depends on FSL_MC_BUS
+	help
+	  Provides userspace support for interrogating, creating, destroying or
+	  configuring DPAA2 objects exported by the Management Complex.
diff --git a/drivers/bus/fsl-mc/Makefile b/drivers/bus/fsl-mc/Makefile
index 3c518c7e8374..4ae292a30e53 100644
--- a/drivers/bus/fsl-mc/Makefile
+++ b/drivers/bus/fsl-mc/Makefile
@@ -16,3 +16,6 @@ mc-bus-driver-objs := fsl-mc-bus.o \
 		      fsl-mc-allocator.o \
 		      fsl-mc-msi.o \
 		      dpmcp.o
+
+# MC userspace support
+obj-$(CONFIG_FSL_MC_UAPI_SUPPORT) += fsl-mc-uapi.o
diff --git a/drivers/bus/fsl-mc/dprc-driver.c b/drivers/bus/fsl-mc/dprc-driver.c
index 68488a7ad0d6..ccec375095f2 100644
--- a/drivers/bus/fsl-mc/dprc-driver.c
+++ b/drivers/bus/fsl-mc/dprc-driver.c
@@ -603,6 +603,7 @@ int dprc_setup(struct fsl_mc_device *mc_dev)
 	struct irq_domain *mc_msi_domain;
 	bool mc_io_created = false;
 	bool msi_domain_set = false;
+	bool uapi_created = false;
 	u16 major_ver, minor_ver;
 	size_t region_size;
 	int error;
@@ -635,6 +636,11 @@ int dprc_setup(struct fsl_mc_device *mc_dev)
 			return error;
 
 		mc_io_created = true;
+	} else {
+		error = fsl_mc_uapi_create_device_file(mc_bus);
+		if (error < 0)
+			return -EPROBE_DEFER;
+		uapi_created = true;
 	}
 
 	mc_msi_domain = fsl_mc_find_msi_domain(&mc_dev->dev);
@@ -692,6 +698,9 @@ error_cleanup_msi_domain:
 		mc_dev->mc_io = NULL;
 	}
 
+	if (uapi_created)
+		fsl_mc_uapi_remove_device_file(mc_bus);
+
 	return error;
 }
 EXPORT_SYMBOL_GPL(dprc_setup);
@@ -763,6 +772,7 @@ static void dprc_teardown_irq(struct fsl_mc_device *mc_dev)
 
 int dprc_cleanup(struct fsl_mc_device *mc_dev)
 {
+	struct fsl_mc_bus *mc_bus = to_fsl_mc_bus(mc_dev);
 	int error;
 
 	/* this function should be called only for DPRCs, it
@@ -793,6 +803,8 @@ int dprc_cleanup(struct fsl_mc_device *mc_dev)
 	if (!fsl_mc_is_root_dprc(&mc_dev->dev)) {
 		fsl_destroy_mc_io(mc_dev->mc_io);
 		mc_dev->mc_io = NULL;
+	} else {
+		fsl_mc_uapi_remove_device_file(mc_bus);
 	}
 
 	return 0;
diff --git a/drivers/bus/fsl-mc/fsl-mc-private.h b/drivers/bus/fsl-mc/fsl-mc-private.h
index be686c363121..6293a24de456 100644
--- a/drivers/bus/fsl-mc/fsl-mc-private.h
+++ b/drivers/bus/fsl-mc/fsl-mc-private.h
@@ -10,6 +10,8 @@
 
 #include <linux/fsl/mc.h>
 #include <linux/mutex.h>
+#include <linux/ioctl.h>
+#include <linux/miscdevice.h>
 
 /*
  * Data Path Management Complex (DPMNG) General API
@@ -542,6 +544,22 @@ struct fsl_mc_resource_pool {
 	struct fsl_mc_bus *mc_bus;
 };
 
+/**
+ * struct fsl_mc_uapi - information associated with a device file
+ * @misc: struct miscdevice linked to the root dprc
+ * @device: newly created device in /dev
+ * @mutex: mutex lock to serialize the open/release operations
+ * @local_instance_in_use: local MC I/O instance in use or not
+ * @static_mc_io: pointer to the static MC I/O object
+ */
+struct fsl_mc_uapi {
+	struct miscdevice misc;
+	struct device *device;
+	struct mutex mutex; /* serialize open/release operations */
+	u32 local_instance_in_use;
+	struct fsl_mc_io *static_mc_io;
+};
+
 /**
  * struct fsl_mc_bus - logical bus that corresponds to a physical DPRC
  * @mc_dev: fsl-mc device for the bus device itself.
@@ -551,6 +569,7 @@ struct fsl_mc_resource_pool {
  * @irq_resources: Pointer to array of IRQ objects for the IRQ pool
  * @scan_mutex: Serializes bus scanning
  * @dprc_attr: DPRC attributes
+ * @uapi_misc: struct that abstracts the interaction with userspace
  */
 struct fsl_mc_bus {
 	struct fsl_mc_device mc_dev;
@@ -558,6 +577,7 @@ struct fsl_mc_bus {
 	struct fsl_mc_device_irq *irq_resources;
 	struct mutex scan_mutex;    /* serializes bus scanning */
 	struct dprc_attributes dprc_attr;
+	struct fsl_mc_uapi uapi_misc;
 };
 
 #define to_fsl_mc_bus(_mc_dev) \
@@ -614,4 +634,23 @@ struct fsl_mc_device *fsl_mc_device_lookup(struct fsl_mc_obj_desc *obj_desc,
 
 u16 mc_cmd_hdr_read_cmdid(struct fsl_mc_command *cmd);
 
+#ifdef CONFIG_FSL_MC_UAPI_SUPPORT
+
+int fsl_mc_uapi_create_device_file(struct fsl_mc_bus *mc_bus);
+
+void fsl_mc_uapi_remove_device_file(struct fsl_mc_bus *mc_bus);
+
+#else
+
+static inline int fsl_mc_uapi_create_device_file(struct fsl_mc_bus *mc_bus)
+{
+	return 0;
+}
+
+static inline void fsl_mc_uapi_remove_device_file(struct fsl_mc_bus *mc_bus)
+{
+}
+
+#endif
+
 #endif /* _FSL_MC_PRIVATE_H_ */
diff --git a/drivers/bus/fsl-mc/fsl-mc-uapi.c b/drivers/bus/fsl-mc/fsl-mc-uapi.c
new file mode 100644
index 000000000000..eeb988c9f4bb
--- /dev/null
+++ b/drivers/bus/fsl-mc/fsl-mc-uapi.c
@@ -0,0 +1,547 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Management Complex (MC) userspace support
+ *
+ * Copyright 2021 NXP
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+
+#include "fsl-mc-private.h"
+
+struct uapi_priv_data {
+	struct fsl_mc_uapi *uapi;
+	struct fsl_mc_io *mc_io;
+};
+
+struct fsl_mc_cmd_desc {
+	u16 cmdid_value;
+	u16 cmdid_mask;
+	int size;
+	bool token;
+	int flags;
+};
+
+#define FSL_MC_CHECK_MODULE_ID		BIT(0)
+#define FSL_MC_CAP_NET_ADMIN_NEEDED	BIT(1)
+
+enum fsl_mc_cmd_index {
+	DPDBG_DUMP = 0,
+	DPDBG_SET,
+	DPRC_GET_CONTAINER_ID,
+	DPRC_CREATE_CONT,
+	DPRC_DESTROY_CONT,
+	DPRC_ASSIGN,
+	DPRC_UNASSIGN,
+	DPRC_GET_OBJ_COUNT,
+	DPRC_GET_OBJ,
+	DPRC_GET_RES_COUNT,
+	DPRC_GET_RES_IDS,
+	DPRC_SET_OBJ_LABEL,
+	DPRC_SET_LOCKED,
+	DPRC_CONNECT,
+	DPRC_DISCONNECT,
+	DPRC_GET_POOL,
+	DPRC_GET_POOL_COUNT,
+	DPRC_GET_CONNECTION,
+	DPCI_GET_LINK_STATE,
+	DPCI_GET_PEER_ATTR,
+	DPAIOP_GET_SL_VERSION,
+	DPAIOP_GET_STATE,
+	DPMNG_GET_VERSION,
+	DPSECI_GET_TX_QUEUE,
+	DPMAC_GET_COUNTER,
+	DPMAC_GET_MAC_ADDR,
+	DPNI_SET_PRIM_MAC,
+	DPNI_GET_PRIM_MAC,
+	DPNI_GET_STATISTICS,
+	DPNI_GET_LINK_STATE,
+	GET_ATTR,
+	GET_IRQ_MASK,
+	GET_IRQ_STATUS,
+	CLOSE,
+	OPEN,
+	GET_API_VERSION,
+	DESTROY,
+	CREATE,
+};
+
+static struct fsl_mc_cmd_desc fsl_mc_accepted_cmds[] = {
+	[DPDBG_DUMP] = {
+		.cmdid_value = 0x1300,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 28,
+	},
+	[DPDBG_SET] = {
+		.cmdid_value = 0x1400,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 28,
+	},
+	[DPRC_GET_CONTAINER_ID] = {
+		.cmdid_value = 0x8300,
+		.cmdid_mask = 0xFFF0,
+		.token = false,
+		.size = 8,
+	},
+	[DPRC_CREATE_CONT] = {
+		.cmdid_value = 0x1510,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 40,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_DESTROY_CONT] = {
+		.cmdid_value = 0x1520,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 12,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_ASSIGN] = {
+		.cmdid_value = 0x1570,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 40,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_UNASSIGN] = {
+		.cmdid_value = 0x1580,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 40,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_GET_OBJ_COUNT] = {
+		.cmdid_value = 0x1590,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 16,
+	},
+	[DPRC_GET_OBJ] = {
+		.cmdid_value = 0x15A0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 12,
+	},
+	[DPRC_GET_RES_COUNT] = {
+		.cmdid_value = 0x15B0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 32,
+	},
+	[DPRC_GET_RES_IDS] = {
+		.cmdid_value = 0x15C0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 40,
+	},
+	[DPRC_SET_OBJ_LABEL] = {
+		.cmdid_value = 0x1610,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 48,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_SET_LOCKED] = {
+		.cmdid_value = 0x16B0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 16,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_CONNECT] = {
+		.cmdid_value = 0x1670,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 56,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_DISCONNECT] = {
+		.cmdid_value = 0x1680,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 32,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPRC_GET_POOL] = {
+		.cmdid_value = 0x1690,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 12,
+	},
+	[DPRC_GET_POOL_COUNT] = {
+		.cmdid_value = 0x16A0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPRC_GET_CONNECTION] = {
+		.cmdid_value = 0x16C0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 32,
+	},
+
+	[DPCI_GET_LINK_STATE] = {
+		.cmdid_value = 0x0E10,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPCI_GET_PEER_ATTR] = {
+		.cmdid_value = 0x0E20,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPAIOP_GET_SL_VERSION] = {
+		.cmdid_value = 0x2820,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPAIOP_GET_STATE] = {
+		.cmdid_value = 0x2830,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPMNG_GET_VERSION] = {
+		.cmdid_value = 0x8310,
+		.cmdid_mask = 0xFFF0,
+		.token = false,
+		.size = 8,
+	},
+	[DPSECI_GET_TX_QUEUE] = {
+		.cmdid_value = 0x1970,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 14,
+	},
+	[DPMAC_GET_COUNTER] = {
+		.cmdid_value = 0x0c40,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 9,
+	},
+	[DPMAC_GET_MAC_ADDR] = {
+		.cmdid_value = 0x0c50,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPNI_SET_PRIM_MAC] = {
+		.cmdid_value = 0x2240,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 16,
+		.flags = FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[DPNI_GET_PRIM_MAC] = {
+		.cmdid_value = 0x2250,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[DPNI_GET_STATISTICS] = {
+		.cmdid_value = 0x25D0,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 10,
+	},
+	[DPNI_GET_LINK_STATE] = {
+		.cmdid_value = 0x2150,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[GET_ATTR] = {
+		.cmdid_value = 0x0040,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+	[GET_IRQ_MASK] = {
+		.cmdid_value = 0x0150,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 13,
+	},
+	[GET_IRQ_STATUS] = {
+		.cmdid_value = 0x0160,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 13,
+	},
+	[CLOSE] = {
+		.cmdid_value = 0x8000,
+		.cmdid_mask = 0xFFF0,
+		.token = true,
+		.size = 8,
+	},
+
+	/* Common commands amongst all types of objects. Must be checked last. */
+	[OPEN] = {
+		.cmdid_value = 0x8000,
+		.cmdid_mask = 0xFC00,
+		.token = false,
+		.size = 12,
+		.flags = FSL_MC_CHECK_MODULE_ID,
+	},
+	[GET_API_VERSION] = {
+		.cmdid_value = 0xA000,
+		.cmdid_mask = 0xFC00,
+		.token = false,
+		.size = 8,
+		.flags = FSL_MC_CHECK_MODULE_ID,
+	},
+	[DESTROY] = {
+		.cmdid_value = 0x9800,
+		.cmdid_mask = 0xFC00,
+		.token = true,
+		.size = 12,
+		.flags = FSL_MC_CHECK_MODULE_ID | FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+	[CREATE] = {
+		.cmdid_value = 0x9000,
+		.cmdid_mask = 0xFC00,
+		.token = true,
+		.size = 64,
+		.flags = FSL_MC_CHECK_MODULE_ID | FSL_MC_CAP_NET_ADMIN_NEEDED,
+	},
+};
+
+#define FSL_MC_NUM_ACCEPTED_CMDS ARRAY_SIZE(fsl_mc_accepted_cmds)
+
+#define FSL_MC_MAX_MODULE_ID 0x10
+
+static int fsl_mc_command_check(struct fsl_mc_device *mc_dev,
+				struct fsl_mc_command *mc_cmd)
+{
+	struct fsl_mc_cmd_desc *desc = NULL;
+	int mc_cmd_max_size, i;
+	bool token_provided;
+	u16 cmdid, module_id;
+	char *mc_cmd_end;
+	char sum = 0;
+
+	/* Check if this is an accepted MC command */
+	cmdid = mc_cmd_hdr_read_cmdid(mc_cmd);
+	for (i = 0; i < FSL_MC_NUM_ACCEPTED_CMDS; i++) {
+		desc = &fsl_mc_accepted_cmds[i];
+		if ((cmdid & desc->cmdid_mask) == desc->cmdid_value)
+			break;
+	}
+	if (!desc) {
+		dev_err(&mc_dev->dev, "MC command 0x%04x: cmdid not accepted\n", cmdid);
+		return -EACCES;
+	}
+
+	/* Check if the size of the command is honored. Anything beyond the
+	 * last valid byte of the command should be zeroed.
+	 */
+	mc_cmd_max_size = sizeof(*mc_cmd);
+	mc_cmd_end = ((char *)mc_cmd) + desc->size;
+	for (i = desc->size; i < mc_cmd_max_size; i++)
+		sum |= *mc_cmd_end++;
+	if (sum) {
+		dev_err(&mc_dev->dev, "MC command 0x%04x: garbage beyond max size of %d bytes!\n",
+			cmdid, desc->size);
+		return -EACCES;
+	}
+
+	/* Some MC commands request a token to be passed so that object
+	 * identification is possible. Check if the token passed in the command
+	 * is as expected.
+	 */
+	token_provided = mc_cmd_hdr_read_token(mc_cmd) ? true : false;
+	if (token_provided != desc->token) {
+		dev_err(&mc_dev->dev, "MC command 0x%04x: token 0x%04x is invalid!\n",
+			cmdid, mc_cmd_hdr_read_token(mc_cmd));
+		return -EACCES;
+	}
+
+	/* If needed, check if the module ID passed is valid */
+	if (desc->flags & FSL_MC_CHECK_MODULE_ID) {
+		/* The module ID is represented by bits [4:9] from the cmdid */
+		module_id = (cmdid & GENMASK(9, 4)) >> 4;
+		if (module_id == 0 || module_id > FSL_MC_MAX_MODULE_ID) {
+			dev_err(&mc_dev->dev, "MC command 0x%04x: unknown module ID 0x%x\n",
+				cmdid, module_id);
+			return -EACCES;
+		}
+	}
+
+	/* Some commands alter how hardware resources are managed. For these
+	 * commands, check for CAP_NET_ADMIN.
+	 */
+	if (desc->flags & FSL_MC_CAP_NET_ADMIN_NEEDED) {
+		if (!capable(CAP_NET_ADMIN)) {
+			dev_err(&mc_dev->dev, "MC command 0x%04x: needs CAP_NET_ADMIN!\n",
+				cmdid);
+			return -EPERM;
+		}
+	}
+
+	return 0;
+}
+
+static int fsl_mc_uapi_send_command(struct fsl_mc_device *mc_dev, unsigned long arg,
+				    struct fsl_mc_io *mc_io)
+{
+	struct fsl_mc_command mc_cmd;
+	int error;
+
+	error = copy_from_user(&mc_cmd, (void __user *)arg, sizeof(mc_cmd));
+	if (error)
+		return -EFAULT;
+
+	error = fsl_mc_command_check(mc_dev, &mc_cmd);
+	if (error)
+		return error;
+
+	error = mc_send_command(mc_io, &mc_cmd);
+	if (error)
+		return error;
+
+	error = copy_to_user((void __user *)arg, &mc_cmd, sizeof(mc_cmd));
+	if (error)
+		return -EFAULT;
+
+	return 0;
+}
+
+static int fsl_mc_uapi_dev_open(struct inode *inode, struct file *filep)
+{
+	struct fsl_mc_device *root_mc_device;
+	struct uapi_priv_data *priv_data;
+	struct fsl_mc_io *dynamic_mc_io;
+	struct fsl_mc_uapi *mc_uapi;
+	struct fsl_mc_bus *mc_bus;
+	int error;
+
+	priv_data = kzalloc(sizeof(*priv_data), GFP_KERNEL);
+	if (!priv_data)
+		return -ENOMEM;
+
+	mc_uapi = container_of(filep->private_data, struct fsl_mc_uapi, misc);
+	mc_bus = container_of(mc_uapi, struct fsl_mc_bus, uapi_misc);
+	root_mc_device = &mc_bus->mc_dev;
+
+	mutex_lock(&mc_uapi->mutex);
+
+	if (!mc_uapi->local_instance_in_use) {
+		priv_data->mc_io = mc_uapi->static_mc_io;
+		mc_uapi->local_instance_in_use = 1;
+	} else {
+		error = fsl_mc_portal_allocate(root_mc_device, 0,
+					       &dynamic_mc_io);
+		if (error) {
+			dev_dbg(&root_mc_device->dev,
+				"Could not allocate MC portal\n");
+			goto error_portal_allocate;
+		}
+
+		priv_data->mc_io = dynamic_mc_io;
+	}
+	priv_data->uapi = mc_uapi;
+	filep->private_data = priv_data;
+
+	mutex_unlock(&mc_uapi->mutex);
+
+	return 0;
+
+error_portal_allocate:
+	mutex_unlock(&mc_uapi->mutex);
+	kfree(priv_data);
+
+	return error;
+}
+
+static int fsl_mc_uapi_dev_release(struct inode *inode, struct file *filep)
+{
+	struct uapi_priv_data *priv_data;
+	struct fsl_mc_uapi *mc_uapi;
+	struct fsl_mc_io *mc_io;
+
+	priv_data = filep->private_data;
+	mc_uapi = priv_data->uapi;
+	mc_io = priv_data->mc_io;
+
+	mutex_lock(&mc_uapi->mutex);
+
+	if (mc_io == mc_uapi->static_mc_io)
+		mc_uapi->local_instance_in_use = 0;
+	else
+		fsl_mc_portal_free(mc_io);
+
+	kfree(filep->private_data);
+	filep->private_data =  NULL;
+
+	mutex_unlock(&mc_uapi->mutex);
+
+	return 0;
+}
+
+static long fsl_mc_uapi_dev_ioctl(struct file *file,
+				  unsigned int cmd,
+				  unsigned long arg)
+{
+	struct uapi_priv_data *priv_data = file->private_data;
+	struct fsl_mc_device *root_mc_device;
+	struct fsl_mc_bus *mc_bus;
+	int error;
+
+	mc_bus = container_of(priv_data->uapi, struct fsl_mc_bus, uapi_misc);
+	root_mc_device = &mc_bus->mc_dev;
+
+	switch (cmd) {
+	case FSL_MC_SEND_MC_COMMAND:
+		error = fsl_mc_uapi_send_command(root_mc_device, arg, priv_data->mc_io);
+		break;
+	default:
+		dev_dbg(&root_mc_device->dev, "unexpected ioctl call number\n");
+		error = -EINVAL;
+	}
+
+	return error;
+}
+
+static const struct file_operations fsl_mc_uapi_dev_fops = {
+	.owner = THIS_MODULE,
+	.open = fsl_mc_uapi_dev_open,
+	.release = fsl_mc_uapi_dev_release,
+	.unlocked_ioctl = fsl_mc_uapi_dev_ioctl,
+};
+
+int fsl_mc_uapi_create_device_file(struct fsl_mc_bus *mc_bus)
+{
+	struct fsl_mc_device *mc_dev = &mc_bus->mc_dev;
+	struct fsl_mc_uapi *mc_uapi = &mc_bus->uapi_misc;
+	int error;
+
+	mc_uapi->misc.minor = MISC_DYNAMIC_MINOR;
+	mc_uapi->misc.name = dev_name(&mc_dev->dev);
+	mc_uapi->misc.fops = &fsl_mc_uapi_dev_fops;
+
+	error = misc_register(&mc_uapi->misc);
+	if (error)
+		return error;
+
+	mc_uapi->static_mc_io = mc_bus->mc_dev.mc_io;
+
+	mutex_init(&mc_uapi->mutex);
+
+	return 0;
+}
+
+void fsl_mc_uapi_remove_device_file(struct fsl_mc_bus *mc_bus)
+{
+	misc_deregister(&mc_bus->uapi_misc.misc);
+}
diff --git a/include/uapi/linux/fsl_mc.h b/include/uapi/linux/fsl_mc.h
index cf56d46f052e..e57451570033 100644
--- a/include/uapi/linux/fsl_mc.h
+++ b/include/uapi/linux/fsl_mc.h
@@ -16,10 +16,19 @@
  * struct fsl_mc_command - Management Complex (MC) command structure
  * @header: MC command header
  * @params: MC command parameters
+ *
+ * Used by FSL_MC_SEND_MC_COMMAND
  */
 struct fsl_mc_command {
 	__le64 header;
 	__le64 params[MC_CMD_NUM_OF_PARAMS];
 };
 
+#define FSL_MC_SEND_CMD_IOCTL_TYPE	'R'
+#define FSL_MC_SEND_CMD_IOCTL_SEQ	0xE0
+
+#define FSL_MC_SEND_MC_COMMAND \
+	_IOWR(FSL_MC_SEND_CMD_IOCTL_TYPE, FSL_MC_SEND_CMD_IOCTL_SEQ, \
+	struct fsl_mc_command)
+
 #endif /* _UAPI_FSL_MC_H_ */
-- 
cgit v1.2.3


From 1423de718e6a0ce00372ab119fa40060ff50883e Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 2 Jun 2020 15:56:55 -0500
Subject: PCI/ACPI: Make acpi_pci_osc_control_set() static

acpi_pci_osc_control_set() is only called from pci_root.c, so stop
exporting it and make it static.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/pci_root.c | 3 +--
 include/linux/acpi.h    | 3 ---
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index 0bf072cef6cf..7d5a4bd4207c 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -353,7 +353,7 @@ EXPORT_SYMBOL_GPL(acpi_get_pci_dev);
  * _OSC bits the BIOS has granted control of, but its contents are meaningless
  * on failure.
  **/
-acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 req)
+static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 req)
 {
 	struct acpi_pci_root *root;
 	acpi_status status = AE_OK;
@@ -406,7 +406,6 @@ out:
 	mutex_unlock(&osc_lock);
 	return status;
 }
-EXPORT_SYMBOL(acpi_pci_osc_control_set);
 
 static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm,
 				 bool is_pcie)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 053bf05fb1f7..4703daafcce9 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -581,9 +581,6 @@ extern bool osc_pc_lpi_support_confirmed;
 #define ACPI_GSB_ACCESS_ATTRIB_RAW_BYTES	0x0000000E
 #define ACPI_GSB_ACCESS_ATTRIB_RAW_PROCESS	0x0000000F
 
-extern acpi_status acpi_pci_osc_control_set(acpi_handle handle,
-					     u32 *mask, u32 req);
-
 /* Enable _OST when all relevant hotplug operations are enabled */
 #if defined(CONFIG_ACPI_HOTPLUG_CPU) &&			\
 	defined(CONFIG_ACPI_HOTPLUG_MEMORY) &&		\
-- 
cgit v1.2.3


From c209e742141bc0064ecdb73ef0dd825310de28cb Mon Sep 17 00:00:00 2001
From: Ofir Bitton <obitton@habana.ai>
Date: Thu, 3 Dec 2020 17:12:09 +0200
Subject: habanalabs: allow user to pass a staged submission seq

In order to support the staged submission feature, user must be
allowed to use the same CS sequence for all submissions in the
same staged submission.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../misc/habanalabs/common/command_submission.c    | 16 +++++++++----
 include/uapi/misc/habanalabs.h                     | 28 +++++++++++++++-------
 2 files changed, 30 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index ccb4972a555f..348b0ecb4489 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -807,7 +807,7 @@ static int hl_cs_copy_chunk_array(struct hl_device *hdev,
 }
 
 static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
-				u32 num_chunks, u64 *cs_seq, bool timestamp)
+				u32 num_chunks, u64 *cs_seq, u32 flags)
 {
 	bool int_queues_only = true;
 	struct hl_device *hdev = hpriv->hdev;
@@ -836,7 +836,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 		goto free_cs_chunk_array;
 	}
 
-	cs->timestamp = !!timestamp;
+	cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
 	*cs_seq = cs->sequence;
 
 	hl_debugfs_add_cs(cs);
@@ -1004,7 +1004,7 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
 			rc = 0;
 		} else {
 			rc = cs_ioctl_default(hpriv, chunks, num_chunks,
-						cs_seq, false);
+								cs_seq, 0);
 		}
 
 		mutex_unlock(&hpriv->restore_phase_mutex);
@@ -1347,7 +1347,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 	enum hl_cs_type cs_type;
 	u64 cs_seq = ULONG_MAX;
 	void __user *chunks;
-	u32 num_chunks;
+	u32 num_chunks, flags;
 	int rc;
 
 	rc = hl_cs_sanity_checks(hpriv, args);
@@ -1362,6 +1362,12 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 					~HL_CS_FLAGS_FORCE_RESTORE);
 	chunks = (void __user *) (uintptr_t) args->in.chunks_execute;
 	num_chunks = args->in.num_chunks_execute;
+	flags = args->in.cs_flags;
+
+	/* In case this is a staged CS, user should supply the CS sequence */
+	if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
+			!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
+		cs_seq = args->in.seq;
 
 	switch (cs_type) {
 	case CS_TYPE_SIGNAL:
@@ -1372,7 +1378,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 		break;
 	default:
 		rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
-				args->in.cs_flags & HL_CS_FLAGS_TIMESTAMP);
+							args->in.cs_flags);
 		break;
 	}
 
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index dba3827c43ca..b9afe1cdac24 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -604,11 +604,14 @@ struct hl_cs_chunk {
 };
 
 /* SIGNAL and WAIT/COLLECTIVE_WAIT flags are mutually exclusive */
-#define HL_CS_FLAGS_FORCE_RESTORE	0x1
-#define HL_CS_FLAGS_SIGNAL		0x2
-#define HL_CS_FLAGS_WAIT		0x4
-#define HL_CS_FLAGS_COLLECTIVE_WAIT	0x8
-#define HL_CS_FLAGS_TIMESTAMP		0x20
+#define HL_CS_FLAGS_FORCE_RESTORE		0x1
+#define HL_CS_FLAGS_SIGNAL			0x2
+#define HL_CS_FLAGS_WAIT			0x4
+#define HL_CS_FLAGS_COLLECTIVE_WAIT		0x8
+#define HL_CS_FLAGS_TIMESTAMP			0x20
+#define HL_CS_FLAGS_STAGED_SUBMISSION		0x40
+#define HL_CS_FLAGS_STAGED_SUBMISSION_FIRST	0x80
+#define HL_CS_FLAGS_STAGED_SUBMISSION_LAST	0x100
 
 #define HL_CS_STATUS_SUCCESS		0
 
@@ -622,10 +625,17 @@ struct hl_cs_in {
 	/* holds address of array of hl_cs_chunk for execution phase */
 	__u64 chunks_execute;
 
-	/* this holds address of array of hl_cs_chunk for store phase -
-	 * Currently not in use
-	 */
-	__u64 chunks_store;
+	union {
+		/* this holds address of array of hl_cs_chunk for store phase -
+		 * Currently not in use
+		 */
+		__u64 chunks_store;
+
+		/* Sequence number of a staged submission CS
+		 * valid only if HL_CS_FLAGS_STAGED_SUBMISSION is set
+		 */
+		__u64 seq;
+	};
 
 	/* Number of chunks in restore phase array. Maximum number is
 	 * HL_MAX_JOBS_PER_CS
-- 
cgit v1.2.3


From 0eda23d77e1beede9c61b0cba6d9267d3a92bd4e Mon Sep 17 00:00:00 2001
From: Moti Haimovski <mhaimovski@habana.ai>
Date: Mon, 7 Dec 2020 09:10:34 +0200
Subject: habanalabs: report dram_page_size in hw_ip_info ioctl

Instead of having it hard-coded as a define, pass it to the user
in runtime.

Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs_ioctl.c | 1 +
 include/uapi/misc/habanalabs.h                    | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index d25892d61ec9..394a2e1767ce 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -63,6 +63,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 	hw_ip.dram_size = prop->dram_size - dram_kmd_size;
 	if (hw_ip.dram_size > PAGE_SIZE)
 		hw_ip.dram_enabled = 1;
+	hw_ip.dram_page_size = prop->dram_page_size;
 	hw_ip.num_of_events = prop->num_of_events;
 
 	memcpy(hw_ip.cpucp_version, prop->cpucp_info.cpucp_version,
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index b9afe1cdac24..b431a70e1b8b 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -320,6 +320,8 @@ struct hl_info_hw_ip_info {
 	__u8 pad[2];
 	__u8 cpucp_version[HL_INFO_VERSION_MAX_LEN];
 	__u8 card_name[HL_INFO_CARD_NAME_MAX_LEN];
+	__u64 reserved2;
+	__u64 dram_page_size;
 };
 
 struct hl_info_dram_usage {
-- 
cgit v1.2.3


From e1fa724dd17a6a9b9934636226e683912d12c876 Mon Sep 17 00:00:00 2001
From: Ofir Bitton <obitton@habana.ai>
Date: Wed, 6 Jan 2021 15:40:37 +0200
Subject: habanalabs: add user available interrupt to hw_ip

In order to support completions that arrive directly to the user,
the driver needs to supply the user with the first available msix
interrupt available.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h       | 3 +++
 drivers/misc/habanalabs/common/habanalabs_ioctl.c | 2 ++
 drivers/misc/habanalabs/gaudi/gaudi.c             | 2 ++
 drivers/misc/habanalabs/goya/goya.c               | 2 ++
 include/uapi/misc/habanalabs.h                    | 6 ++++--
 5 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 3923b03e99aa..ce1a1e70a6d5 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -408,6 +408,8 @@ struct hl_mmu_properties {
  * @sync_stream_first_mon: first monitor available for sync stream use
  * @first_available_user_sob: first sob available for the user
  * @first_available_user_mon: first monitor available for the user
+ * @first_available_user_msix_interrupt: first available msix interrupt
+ *                                       reserved for the user
  * @tpc_enabled_mask: which TPCs are enabled.
  * @completion_queues_count: number of completion queues.
  * @fw_security_disabled: true if security measures are disabled in firmware,
@@ -469,6 +471,7 @@ struct asic_fixed_properties {
 	u16				sync_stream_first_mon;
 	u16				first_available_user_sob[HL_MAX_DCORES];
 	u16				first_available_user_mon[HL_MAX_DCORES];
+	u16				first_available_user_msix_interrupt;
 	u8				tpc_enabled_mask;
 	u8				completion_queues_count;
 	u8				fw_security_disabled;
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index dfac5c8cadb3..628bdc56dca3 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -90,6 +90,8 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 	hw_ip.psoc_pci_pll_od = prop->psoc_pci_pll_od;
 	hw_ip.psoc_pci_pll_div_factor = prop->psoc_pci_pll_div_factor;
 
+	hw_ip.first_available_interrupt_id =
+			prop->first_available_user_msix_interrupt;
 	return copy_to_user(out, &hw_ip,
 		min((size_t)size, sizeof(hw_ip))) ? -EFAULT : 0;
 }
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 2b01c081404a..69b3867bc151 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -527,6 +527,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
 			prop->sync_stream_first_mon +
 			(num_sync_stream_queues * HL_RSVD_MONS);
 
+	prop->first_available_user_msix_interrupt = USHRT_MAX;
+
 	/* disable fw security for now, set it in a later stage */
 	prop->fw_security_disabled = true;
 	prop->fw_security_status_valid = false;
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 50dcefc02cdd..82f69274def7 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -455,6 +455,8 @@ int goya_get_fixed_properties(struct hl_device *hdev)
 
 	prop->max_pending_cs = GOYA_MAX_PENDING_CS;
 
+	prop->first_available_user_msix_interrupt = USHRT_MAX;
+
 	/* disable fw security for now, set it in a later stage */
 	prop->fw_security_disabled = true;
 	prop->fw_security_status_valid = false;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index b431a70e1b8b..866355a53188 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -309,7 +309,9 @@ struct hl_info_hw_ip_info {
 	__u32 num_of_events;
 	__u32 device_id; /* PCI Device ID */
 	__u32 module_id; /* For mezzanine cards in servers (From OCP spec.) */
-	__u32 reserved[2];
+	__u32 reserved;
+	__u16 first_available_interrupt_id;
+	__u16 reserved2;
 	__u32 cpld_version;
 	__u32 psoc_pci_pll_nr;
 	__u32 psoc_pci_pll_nf;
@@ -320,7 +322,7 @@ struct hl_info_hw_ip_info {
 	__u8 pad[2];
 	__u8 cpucp_version[HL_INFO_VERSION_MAX_LEN];
 	__u8 card_name[HL_INFO_CARD_NAME_MAX_LEN];
-	__u64 reserved2;
+	__u64 reserved3;
 	__u64 dram_page_size;
 };
 
-- 
cgit v1.2.3


From d00697fbe13ccc1dfb7adb82204e1469a7783b07 Mon Sep 17 00:00:00 2001
From: Ofir Bitton <obitton@habana.ai>
Date: Tue, 5 Jan 2021 12:55:06 +0200
Subject: habanalabs: add new mem ioctl op for mapping hw blocks

For future ASIC support the driver allows user to map certain regions
in the device's configuration space for direct access from userspace.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c     |  3 +
 drivers/misc/habanalabs/common/habanalabs.h | 16 +++--
 drivers/misc/habanalabs/common/memory.c     | 93 ++++++++++++++++++++++++++++-
 drivers/misc/habanalabs/gaudi/gaudi.c       | 17 +++++-
 drivers/misc/habanalabs/goya/goya.c         | 16 ++++-
 include/uapi/misc/habanalabs.h              | 18 +++++-
 6 files changed, 153 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 13405d47d843..59219c862ca0 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -142,6 +142,9 @@ static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
 	switch (vm_pgoff & HL_MMAP_TYPE_MASK) {
 	case HL_MMAP_TYPE_CB:
 		return hl_cb_mmap(hpriv, vma);
+
+	case HL_MMAP_TYPE_BLOCK:
+		return hl_hw_block_mmap(hpriv, vma);
 	}
 
 	return -EINVAL;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 4129e4e6a7b5..e105612ed577 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -28,17 +28,18 @@
 #define HL_NAME				"habanalabs"
 
 /* Use upper bits of mmap offset to store habana driver specific information.
- * bits[63:62] - Encode mmap type
+ * bits[63:61] - Encode mmap type
  * bits[45:0]  - mmap offset value
  *
  * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these
  *  defines are w.r.t to PAGE_SIZE
  */
-#define HL_MMAP_TYPE_SHIFT		(62 - PAGE_SHIFT)
-#define HL_MMAP_TYPE_MASK		(0x3ull << HL_MMAP_TYPE_SHIFT)
+#define HL_MMAP_TYPE_SHIFT		(61 - PAGE_SHIFT)
+#define HL_MMAP_TYPE_MASK		(0x7ull << HL_MMAP_TYPE_SHIFT)
+#define HL_MMAP_TYPE_BLOCK		(0x4ull << HL_MMAP_TYPE_SHIFT)
 #define HL_MMAP_TYPE_CB			(0x2ull << HL_MMAP_TYPE_SHIFT)
 
-#define HL_MMAP_OFFSET_VALUE_MASK	(0x3FFFFFFFFFFFull >> PAGE_SHIFT)
+#define HL_MMAP_OFFSET_VALUE_MASK	(0x1FFFFFFFFFFFull >> PAGE_SHIFT)
 #define HL_MMAP_OFFSET_VALUE_GET(off)	(off & HL_MMAP_OFFSET_VALUE_MASK)
 
 #define HL_PENDING_RESET_PER_SEC	10
@@ -856,6 +857,8 @@ enum div_select_defs {
  * @descramble_addr: Routine to de-scramble the address prior of
  *                  showing it to users.
  * @ack_protection_bits_errors: ack and dump all security violations
+ * @get_hw_block_id: retrieve a HW block id to be used by the user to mmap it.
+ * @hw_block_mmap: mmap a HW block with a given id.
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -968,6 +971,10 @@ struct hl_asic_funcs {
 	u64 (*scramble_addr)(struct hl_device *hdev, u64 addr);
 	u64 (*descramble_addr)(struct hl_device *hdev, u64 addr);
 	void (*ack_protection_bits_errors)(struct hl_device *hdev);
+	int (*get_hw_block_id)(struct hl_device *hdev, u64 block_addr,
+			u32 *block_id);
+	int (*hw_block_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
+			u32 block_id, u32 block_size);
 };
 
 
@@ -2167,6 +2174,7 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 			bool map_cb, u64 *handle);
 int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle);
 int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
+int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
 struct hl_cb *hl_cb_get(struct hl_device *hdev,	struct hl_cb_mgr *mgr,
 			u32 handle);
 void hl_cb_put(struct hl_cb *cb);
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index 36de0d05d3a2..7171e8820a2d 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -1289,11 +1289,88 @@ vm_type_err:
 	return rc;
 }
 
+static int map_block(struct hl_device *hdev, u64 address, u64 *handle)
+{
+	u32 block_id = 0;
+	int rc;
+
+	rc = hdev->asic_funcs->get_hw_block_id(hdev, address, &block_id);
+
+	*handle = block_id | HL_MMAP_TYPE_BLOCK;
+	*handle <<= PAGE_SHIFT;
+
+	return rc;
+}
+
+static void hw_block_vm_close(struct vm_area_struct *vma)
+{
+	struct hl_ctx *ctx = (struct hl_ctx *) vma->vm_private_data;
+
+	hl_ctx_put(ctx);
+	vma->vm_private_data = NULL;
+}
+
+static const struct vm_operations_struct hw_block_vm_ops = {
+	.close = hw_block_vm_close
+};
+
+/**
+ * hl_hw_block_mmap() - mmap a hw block to user.
+ * @hpriv: pointer to the private data of the fd
+ * @vma: pointer to vm_area_struct of the process
+ *
+ * Driver increments context reference for every HW block mapped in order
+ * to prevent user from closing FD without unmapping first
+ */
+int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	u32 block_id, block_size;
+	int rc;
+
+	/* We use the page offset to hold the block id and thus we need to clear
+	 * it before doing the mmap itself
+	 */
+	block_id = vma->vm_pgoff;
+	vma->vm_pgoff = 0;
+
+	/* Driver only allows mapping of a complete HW block */
+	block_size = vma->vm_end - vma->vm_start;
+
+#ifdef _HAS_TYPE_ARG_IN_ACCESS_OK
+	if (!access_ok(VERIFY_WRITE,
+		(void __user *) (uintptr_t) vma->vm_start, block_size)) {
+#else
+	if (!access_ok((void __user *) (uintptr_t) vma->vm_start, block_size)) {
+#endif
+		dev_err(hdev->dev,
+			"user pointer is invalid - 0x%lx\n",
+			vma->vm_start);
+
+		return -EINVAL;
+	}
+
+	vma->vm_ops = &hw_block_vm_ops;
+	vma->vm_private_data = hpriv->ctx;
+
+	hl_ctx_get(hdev, hpriv->ctx);
+
+	rc = hdev->asic_funcs->hw_block_mmap(hdev, vma, block_id, block_size);
+	if (rc) {
+		hl_ctx_put(hpriv->ctx);
+		return rc;
+	}
+
+	vma->vm_pgoff = block_id;
+
+	return 0;
+}
+
 static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
 {
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_ctx *ctx = hpriv->ctx;
-	u64 device_addr = 0;
+	u64 block_handle, device_addr = 0;
 	u32 handle = 0;
 	int rc;
 
@@ -1337,6 +1414,12 @@ static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
 		rc = 0;
 		break;
 
+	case HL_MEM_OP_MAP_BLOCK:
+		rc = map_block(hdev, args->in.map_block.block_addr,
+							&block_handle);
+		args->out.handle = block_handle;
+		break;
+
 	default:
 		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
 		rc = -ENOTTY;
@@ -1353,7 +1436,7 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 	union hl_mem_args *args = data;
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_ctx *ctx = hpriv->ctx;
-	u64 device_addr = 0;
+	u64 block_handle, device_addr = 0;
 	u32 handle = 0;
 	int rc;
 
@@ -1439,6 +1522,12 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 		rc = unmap_device_va(ctx, &args->in, false);
 		break;
 
+	case HL_MEM_OP_MAP_BLOCK:
+		rc = map_block(hdev, args->in.map_block.block_addr,
+							&block_handle);
+		args->out.handle = block_handle;
+		break;
+
 	default:
 		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
 		rc = -ENOTTY;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index b49e10394ed4..e1c6072e5fb3 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -8471,6 +8471,19 @@ static u64 gaudi_get_device_time(struct hl_device *hdev)
 	return device_time | RREG32(mmPSOC_TIMESTAMP_CNTCVL);
 }
 
+static int gaudi_get_hw_block_id(struct hl_device *hdev, u64 block_addr,
+					u32 *block_id)
+{
+	return -EPERM;
+}
+
+static int gaudi_block_mmap(struct hl_device *hdev,
+				struct vm_area_struct *vma,
+				u32 block_id, u32 block_size)
+{
+	return -EPERM;
+}
+
 static const struct hl_asic_funcs gaudi_funcs = {
 	.early_init = gaudi_early_init,
 	.early_fini = gaudi_early_fini,
@@ -8550,7 +8563,9 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.collective_wait_create_jobs = gaudi_collective_wait_create_jobs,
 	.scramble_addr = hl_mmu_scramble_addr,
 	.descramble_addr = hl_mmu_descramble_addr,
-	.ack_protection_bits_errors = gaudi_ack_protection_bits_errors
+	.ack_protection_bits_errors = gaudi_ack_protection_bits_errors,
+	.get_hw_block_id = gaudi_get_hw_block_id,
+	.hw_block_mmap = gaudi_block_mmap
 };
 
 /**
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index db951d622ad5..6b4c41188495 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5382,6 +5382,18 @@ static void goya_ctx_fini(struct hl_ctx *ctx)
 
 }
 
+static int goya_get_hw_block_id(struct hl_device *hdev, u64 block_addr,
+				u32 *block_id)
+{
+	return -EPERM;
+}
+
+static int goya_block_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
+				u32 block_id, u32 block_size)
+{
+	return -EPERM;
+}
+
 static const struct hl_asic_funcs goya_funcs = {
 	.early_init = goya_early_init,
 	.early_fini = goya_early_fini,
@@ -5461,7 +5473,9 @@ static const struct hl_asic_funcs goya_funcs = {
 	.collective_wait_create_jobs = goya_collective_wait_create_jobs,
 	.scramble_addr = hl_mmu_scramble_addr,
 	.descramble_addr = hl_mmu_descramble_addr,
-	.ack_protection_bits_errors = goya_ack_protection_bits_errors
+	.ack_protection_bits_errors = goya_ack_protection_bits_errors,
+	.get_hw_block_id = goya_get_hw_block_id,
+	.hw_block_mmap = goya_block_mmap
 };
 
 /*
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 866355a53188..b1c09eba8ac2 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -718,6 +718,8 @@ union hl_wait_cs_args {
 #define HL_MEM_OP_MAP			2
 /* Opcode to unmap previously mapped host and device memory */
 #define HL_MEM_OP_UNMAP			3
+/* Opcode to map a hw block */
+#define HL_MEM_OP_MAP_BLOCK		4
 
 /* Memory flags */
 #define HL_MEM_CONTIGUOUS	0x1
@@ -772,6 +774,17 @@ struct hl_mem_in {
 			__u64 mem_size;
 		} map_host;
 
+		/* HL_MEM_OP_MAP_BLOCK - map a hw block */
+		struct {
+			/*
+			 * HW block address to map, a handle will be returned
+			 * to the user and will be used to mmap the relevant
+			 * block. Only addresses from configuration space are
+			 * allowed.
+			 */
+			__u64 block_addr;
+		} map_block;
+
 		/* HL_MEM_OP_UNMAP - unmap host memory */
 		struct {
 			/* Virtual address returned from HL_MEM_OP_MAP */
@@ -798,8 +811,9 @@ struct hl_mem_out {
 		__u64 device_virt_addr;
 
 		/*
-		 * Used for HL_MEM_OP_ALLOC. This is the assigned
-		 * handle for the allocated memory
+		 * Used for HL_MEM_OP_ALLOC and HL_MEM_OP_MAP_BLOCK.
+		 * This is the assigned handle for the allocated memory
+		 * or mapped block
 		 */
 		__u64 handle;
 	};
-- 
cgit v1.2.3


From cf30339d3f44a64115e88d46a932fdc3d3644785 Mon Sep 17 00:00:00 2001
From: Ohad Sharabi <osharabi@habana.ai>
Date: Sun, 17 Jan 2021 16:01:56 +0200
Subject: habanalabs: modify device_idle interface

Currently this API uses single 64 bits mask for engines idle indication.
Recently, it was observed that more bits are needed for some ASICs.
This patch modifies the use of the idle mask and the idle_extensions
mask.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/context.c          |  9 +++---
 drivers/misc/habanalabs/common/debugfs.c          |  2 +-
 drivers/misc/habanalabs/common/habanalabs.h       |  4 +--
 drivers/misc/habanalabs/common/habanalabs_ioctl.c |  5 +--
 drivers/misc/habanalabs/gaudi/gaudi.c             | 37 ++++++++++-------------
 drivers/misc/habanalabs/goya/goya.c               | 21 ++++++-------
 include/uapi/misc/habanalabs.h                    |  4 ++-
 7 files changed, 40 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 829fe98eed61..cda871afb8f4 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -12,7 +12,7 @@
 static void hl_ctx_fini(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
-	u64 idle_mask = 0;
+	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
 	int i;
 
 	/* Release all allocated pending cb's, those cb's were never
@@ -55,10 +55,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 
 		if ((!hdev->pldm) && (hdev->pdev) &&
 				(!hdev->asic_funcs->is_device_idle(hdev,
-							&idle_mask, NULL)))
+					idle_mask,
+					HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)))
 			dev_notice(hdev->dev,
-				"device not idle after user context is closed (0x%llx)\n",
-				idle_mask);
+					"device not idle after user context is closed (0x%llx, 0x%llx)\n",
+						idle_mask[0], idle_mask[1]);
 	} else {
 		dev_dbg(hdev->dev, "closing kernel context\n");
 		hdev->asic_funcs->ctx_fini(ctx);
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 9e3c1efe56ba..df847a6d19f4 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -414,7 +414,7 @@ static int engines_show(struct seq_file *s, void *data)
 		return 0;
 	}
 
-	hdev->asic_funcs->is_device_idle(hdev, NULL, s);
+	hdev->asic_funcs->is_device_idle(hdev, NULL, 0, s);
 
 	return 0;
 }
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index fd2fffd20ba1..be7947d69dfa 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -933,8 +933,8 @@ struct hl_asic_funcs {
 	void (*set_clock_gating)(struct hl_device *hdev);
 	void (*disable_clock_gating)(struct hl_device *hdev);
 	int (*debug_coresight)(struct hl_device *hdev, void *data);
-	bool (*is_device_idle)(struct hl_device *hdev, u64 *mask,
-				struct seq_file *s);
+	bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr,
+					u8 mask_len, struct seq_file *s);
 	int (*soft_reset_late_init)(struct hl_device *hdev);
 	void (*hw_queues_lock)(struct hl_device *hdev);
 	void (*hw_queues_unlock)(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 628bdc56dca3..e86f46d4b613 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -145,9 +145,10 @@ static int hw_idle(struct hl_device *hdev, struct hl_info_args *args)
 		return -EINVAL;
 
 	hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev,
-					&hw_idle.busy_engines_mask_ext, NULL);
+					hw_idle.busy_engines_mask_ext,
+					HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL);
 	hw_idle.busy_engines_mask =
-			lower_32_bits(hw_idle.busy_engines_mask_ext);
+			lower_32_bits(hw_idle.busy_engines_mask_ext[0]);
 
 	return copy_to_user(out, &hw_idle,
 		min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index e1c6072e5fb3..9a3d2fb477a8 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -4538,7 +4538,6 @@ static int gaudi_scrub_device_mem(struct hl_device *hdev, u64 addr, u64 size)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct gaudi_device *gaudi = hdev->asic_specific;
-	u64 idle_mask = 0;
 	int rc = 0;
 	u64 val = 0;
 
@@ -4551,8 +4550,8 @@ static int gaudi_scrub_device_mem(struct hl_device *hdev, u64 addr, u64 size)
 				hdev,
 				mmDMA0_CORE_STS0/* dummy */,
 				val/* dummy */,
-				(hdev->asic_funcs->is_device_idle(hdev,
-						&idle_mask, NULL)),
+				(hdev->asic_funcs->is_device_idle(hdev, NULL,
+						0, NULL)),
 						1000,
 						HBM_SCRUBBING_TIMEOUT_US);
 		if (rc) {
@@ -6423,7 +6422,7 @@ static int gaudi_send_job_on_qman0(struct hl_device *hdev,
 	else
 		timeout = HL_DEVICE_TIMEOUT_USEC;
 
-	if (!hdev->asic_funcs->is_device_idle(hdev, NULL, NULL)) {
+	if (!hdev->asic_funcs->is_device_idle(hdev, NULL, 0, NULL)) {
 		dev_err_ratelimited(hdev->dev,
 			"Can't send driver job on QMAN0 because the device is not idle\n");
 		return -EBUSY;
@@ -7706,13 +7705,14 @@ static int gaudi_cpucp_info_get(struct hl_device *hdev)
 	return 0;
 }
 
-static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask,
-					struct seq_file *s)
+static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask_arr,
+					u8 mask_len, struct seq_file *s)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
 	const char *fmt = "%-5d%-9s%#-14x%#-12x%#x\n";
 	const char *mme_slave_fmt = "%-5d%-9s%-14s%-12s%#x\n";
 	const char *nic_fmt = "%-5d%-9s%#-14x%#x\n";
+	unsigned long *mask = (unsigned long *)mask_arr;
 	u32 qm_glbl_sts0, qm_cgm_sts, dma_core_sts0, tpc_cfg_sts, mme_arch_sts;
 	bool is_idle = true, is_eng_idle, is_slave;
 	u64 offset;
@@ -7738,9 +7738,8 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask,
 				IS_DMA_IDLE(dma_core_sts0);
 		is_idle &= is_eng_idle;
 
-		if (mask)
-			*mask |= ((u64) !is_eng_idle) <<
-					(GAUDI_ENGINE_ID_DMA_0 + dma_id);
+		if (mask && !is_eng_idle)
+			set_bit(GAUDI_ENGINE_ID_DMA_0 + dma_id, mask);
 		if (s)
 			seq_printf(s, fmt, dma_id,
 				is_eng_idle ? "Y" : "N", qm_glbl_sts0,
@@ -7761,9 +7760,8 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask,
 				IS_TPC_IDLE(tpc_cfg_sts);
 		is_idle &= is_eng_idle;
 
-		if (mask)
-			*mask |= ((u64) !is_eng_idle) <<
-						(GAUDI_ENGINE_ID_TPC_0 + i);
+		if (mask && !is_eng_idle)
+			set_bit(GAUDI_ENGINE_ID_TPC_0 + i, mask);
 		if (s)
 			seq_printf(s, fmt, i,
 				is_eng_idle ? "Y" : "N",
@@ -7790,9 +7788,8 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask,
 
 		is_idle &= is_eng_idle;
 
-		if (mask)
-			*mask |= ((u64) !is_eng_idle) <<
-						(GAUDI_ENGINE_ID_MME_0 + i);
+		if (mask && !is_eng_idle)
+			set_bit(GAUDI_ENGINE_ID_MME_0 + i, mask);
 		if (s) {
 			if (!is_slave)
 				seq_printf(s, fmt, i,
@@ -7818,9 +7815,8 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask,
 			is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts);
 			is_idle &= is_eng_idle;
 
-			if (mask)
-				*mask |= ((u64) !is_eng_idle) <<
-						(GAUDI_ENGINE_ID_NIC_0 + port);
+			if (mask && !is_eng_idle)
+				set_bit(GAUDI_ENGINE_ID_NIC_0 + port, mask);
 			if (s)
 				seq_printf(s, nic_fmt, port,
 						is_eng_idle ? "Y" : "N",
@@ -7834,9 +7830,8 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask,
 			is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts);
 			is_idle &= is_eng_idle;
 
-			if (mask)
-				*mask |= ((u64) !is_eng_idle) <<
-						(GAUDI_ENGINE_ID_NIC_0 + port);
+			if (mask && !is_eng_idle)
+				set_bit(GAUDI_ENGINE_ID_NIC_0 + port, mask);
 			if (s)
 				seq_printf(s, nic_fmt, port,
 						is_eng_idle ? "Y" : "N",
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 6b4c41188495..a954e7c02375 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -2916,7 +2916,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
 	else
 		timeout = HL_DEVICE_TIMEOUT_USEC;
 
-	if (!hdev->asic_funcs->is_device_idle(hdev, NULL, NULL)) {
+	if (!hdev->asic_funcs->is_device_idle(hdev, NULL, 0, NULL)) {
 		dev_err_ratelimited(hdev->dev,
 			"Can't send driver job on QMAN0 because the device is not idle\n");
 		return -EBUSY;
@@ -5187,11 +5187,12 @@ static void goya_disable_clock_gating(struct hl_device *hdev)
 	/* clock gating not supported in Goya */
 }
 
-static bool goya_is_device_idle(struct hl_device *hdev, u64 *mask,
-				struct seq_file *s)
+static bool goya_is_device_idle(struct hl_device *hdev, u64 *mask_arr,
+					u8 mask_len, struct seq_file *s)
 {
 	const char *fmt = "%-5d%-9s%#-14x%#-16x%#x\n";
 	const char *dma_fmt = "%-5d%-9s%#-14x%#x\n";
+	unsigned long *mask = (unsigned long *)mask_arr;
 	u32 qm_glbl_sts0, cmdq_glbl_sts0, dma_core_sts0, tpc_cfg_sts,
 		mme_arch_sts;
 	bool is_idle = true, is_eng_idle;
@@ -5211,9 +5212,8 @@ static bool goya_is_device_idle(struct hl_device *hdev, u64 *mask,
 				IS_DMA_IDLE(dma_core_sts0);
 		is_idle &= is_eng_idle;
 
-		if (mask)
-			*mask |= ((u64) !is_eng_idle) <<
-						(GOYA_ENGINE_ID_DMA_0 + i);
+		if (mask && !is_eng_idle)
+			set_bit(GOYA_ENGINE_ID_DMA_0 + i, mask);
 		if (s)
 			seq_printf(s, dma_fmt, i, is_eng_idle ? "Y" : "N",
 					qm_glbl_sts0, dma_core_sts0);
@@ -5235,9 +5235,8 @@ static bool goya_is_device_idle(struct hl_device *hdev, u64 *mask,
 				IS_TPC_IDLE(tpc_cfg_sts);
 		is_idle &= is_eng_idle;
 
-		if (mask)
-			*mask |= ((u64) !is_eng_idle) <<
-						(GOYA_ENGINE_ID_TPC_0 + i);
+		if (mask && !is_eng_idle)
+			set_bit(GOYA_ENGINE_ID_TPC_0 + i, mask);
 		if (s)
 			seq_printf(s, fmt, i, is_eng_idle ? "Y" : "N",
 				qm_glbl_sts0, cmdq_glbl_sts0, tpc_cfg_sts);
@@ -5256,8 +5255,8 @@ static bool goya_is_device_idle(struct hl_device *hdev, u64 *mask,
 			IS_MME_IDLE(mme_arch_sts);
 	is_idle &= is_eng_idle;
 
-	if (mask)
-		*mask |= ((u64) !is_eng_idle) << GOYA_ENGINE_ID_MME_0;
+	if (mask && !is_eng_idle)
+		set_bit(GOYA_ENGINE_ID_MME_0, mask);
 	if (s) {
 		seq_printf(s, fmt, 0, is_eng_idle ? "Y" : "N", qm_glbl_sts0,
 				cmdq_glbl_sts0, mme_arch_sts);
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index b1c09eba8ac2..ebde42b37b43 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -331,6 +331,8 @@ struct hl_info_dram_usage {
 	__u64 ctx_dram_mem;
 };
 
+#define HL_BUSY_ENGINES_MASK_EXT_SIZE	2
+
 struct hl_info_hw_idle {
 	__u32 is_idle;
 	/*
@@ -343,7 +345,7 @@ struct hl_info_hw_idle {
 	 * Extended Bitmask of busy engines.
 	 * Bits definition is according to `enum <chip>_enging_id'.
 	 */
-	__u64 busy_engines_mask_ext;
+	__u64 busy_engines_mask_ext[HL_BUSY_ENGINES_MASK_EXT_SIZE];
 };
 
 struct hl_info_device_status {
-- 
cgit v1.2.3


From a0f2a1cb65c9d8a66853e1b67184022663950f6d Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Fri, 29 Jan 2021 21:31:17 +0200
Subject: dmaengine: ti: k3-psil: optimize struct psil_endpoint_config for size

Optimize struct psil_endpoint_config for size by
- reordering fields
- grouping bitfields
- change mapped_channel_id type to s16 (32K channel is enough)
- default_flow_id type to s16 as it's assigned to -1

before:
text            data     bss    dec	        hex	filename
12654100	5211472	 666904	18532476	11ac87c	vmlinux

after:
12654100	5208528	 666904	18529532	11abcfc	vmlinux

diff: 2944 bytes

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Link: https://lore.kernel.org/r/20210129193117.28833-1-grygorii.strashko@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dma/k3-psil.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/dma/k3-psil.h b/include/linux/dma/k3-psil.h
index 36e22c5a0f29..5f106d852f1c 100644
--- a/include/linux/dma/k3-psil.h
+++ b/include/linux/dma/k3-psil.h
@@ -42,14 +42,14 @@ enum psil_endpoint_type {
 /**
  * struct psil_endpoint_config - PSI-L Endpoint configuration
  * @ep_type:		PSI-L endpoint type
+ * @channel_tpl:	Desired throughput level for the channel
  * @pkt_mode:		If set, the channel must be in Packet mode, otherwise in
  *			TR mode
  * @notdpkt:		TDCM must be suppressed on the TX channel
  * @needs_epib:		Endpoint needs EPIB
- * @psd_size:		If set, PSdata is used by the endpoint
- * @channel_tpl:	Desired throughput level for the channel
  * @pdma_acc32:		ACC32 must be enabled on the PDMA side
  * @pdma_burst:		BURST must be enabled on the PDMA side
+ * @psd_size:		If set, PSdata is used by the endpoint
  * @mapped_channel_id:	PKTDMA thread to channel mapping for mapped channels.
  *			The thread must be serviced by the specified channel if
  *			mapped_channel_id is >= 0 in case of PKTDMA
@@ -62,23 +62,22 @@ enum psil_endpoint_type {
  */
 struct psil_endpoint_config {
 	enum psil_endpoint_type ep_type;
+	enum udma_tp_level channel_tpl;
 
 	unsigned pkt_mode:1;
 	unsigned notdpkt:1;
 	unsigned needs_epib:1;
-	u32 psd_size;
-	enum udma_tp_level channel_tpl;
-
 	/* PDMA properties, valid for PSIL_EP_PDMA_* */
 	unsigned pdma_acc32:1;
 	unsigned pdma_burst:1;
 
+	u32 psd_size;
 	/* PKDMA mapped channel */
-	int mapped_channel_id;
+	s16 mapped_channel_id;
 	/* PKTDMA tflow and rflow ranges for mapped channel */
 	u16 flow_start;
 	u16 flow_num;
-	u16 default_flow_id;
+	s16 default_flow_id;
 };
 
 int psil_set_new_ep_config(struct device *dev, const char *name,
-- 
cgit v1.2.3


From 1af7e7f8c12f521c111bd7cf0d138be7e15b51a5 Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Thu, 28 Jan 2021 09:55:01 -0500
Subject: NFS: Refactor nfs_readpage() and nfs_readpage_async() to use
 nfs_readdesc

Both nfs_readpage() and nfs_readpages() use similar code.
This patch should be no functional change, and refactors
nfs_readpage_async() to use nfs_readdesc to enable future
merging of nfs_readpage_async() and nfs_readpage_async_filler().

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/read.c          | 62 ++++++++++++++++++++++++--------------------------
 include/linux/nfs_fs.h |  3 +--
 2 files changed, 31 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 464077daf62f..8c05e56dab65 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -114,18 +114,23 @@ static void nfs_readpage_release(struct nfs_page *req, int error)
 	nfs_release_request(req);
 }
 
-int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
+struct nfs_readdesc {
+	struct nfs_pageio_descriptor pgio;
+	struct nfs_open_context *ctx;
+};
+
+int nfs_readpage_async(void *data, struct inode *inode,
 		       struct page *page)
 {
+	struct nfs_readdesc *desc = data;
 	struct nfs_page	*new;
 	unsigned int len;
-	struct nfs_pageio_descriptor pgio;
 	struct nfs_pgio_mirror *pgm;
 
 	len = nfs_page_length(page);
 	if (len == 0)
 		return nfs_return_empty_page(page);
-	new = nfs_create_request(ctx, page, 0, len);
+	new = nfs_create_request(desc->ctx, page, 0, len);
 	if (IS_ERR(new)) {
 		unlock_page(page);
 		return PTR_ERR(new);
@@ -133,21 +138,21 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	if (len < PAGE_SIZE)
 		zero_user_segment(page, len, PAGE_SIZE);
 
-	nfs_pageio_init_read(&pgio, inode, false,
+	nfs_pageio_init_read(&desc->pgio, inode, false,
 			     &nfs_async_read_completion_ops);
-	if (!nfs_pageio_add_request(&pgio, new)) {
+	if (!nfs_pageio_add_request(&desc->pgio, new)) {
 		nfs_list_remove_request(new);
-		nfs_readpage_release(new, pgio.pg_error);
+		nfs_readpage_release(new, desc->pgio.pg_error);
 	}
-	nfs_pageio_complete(&pgio);
+	nfs_pageio_complete(&desc->pgio);
 
 	/* It doesn't make sense to do mirrored reads! */
-	WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+	WARN_ON_ONCE(desc->pgio.pg_mirror_count != 1);
 
-	pgm = &pgio.pg_mirrors[0];
+	pgm = &desc->pgio.pg_mirrors[0];
 	NFS_I(inode)->read_io += pgm->pg_bytes_written;
 
-	return pgio.pg_error < 0 ? pgio.pg_error : 0;
+	return desc->pgio.pg_error < 0 ? desc->pgio.pg_error : 0;
 }
 
 static void nfs_page_group_set_uptodate(struct nfs_page *req)
@@ -312,7 +317,7 @@ static void nfs_readpage_result(struct rpc_task *task,
  */
 int nfs_readpage(struct file *file, struct page *page)
 {
-	struct nfs_open_context *ctx;
+	struct nfs_readdesc desc;
 	struct inode *inode = page_file_mapping(page)->host;
 	int ret;
 
@@ -339,39 +344,34 @@ int nfs_readpage(struct file *file, struct page *page)
 
 	if (file == NULL) {
 		ret = -EBADF;
-		ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
-		if (ctx == NULL)
+		desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
+		if (desc.ctx == NULL)
 			goto out_unlock;
 	} else
-		ctx = get_nfs_open_context(nfs_file_open_context(file));
+		desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
 
 	if (!IS_SYNC(inode)) {
-		ret = nfs_readpage_from_fscache(ctx, inode, page);
+		ret = nfs_readpage_from_fscache(desc.ctx, inode, page);
 		if (ret == 0)
 			goto out;
 	}
 
-	xchg(&ctx->error, 0);
-	ret = nfs_readpage_async(ctx, inode, page);
+	xchg(&desc.ctx->error, 0);
+	ret = nfs_readpage_async(&desc, inode, page);
 	if (!ret) {
 		ret = wait_on_page_locked_killable(page);
 		if (!PageUptodate(page) && !ret)
-			ret = xchg(&ctx->error, 0);
+			ret = xchg(&desc.ctx->error, 0);
 	}
 	nfs_add_stats(inode, NFSIOS_READPAGES, 1);
 out:
-	put_nfs_open_context(ctx);
+	put_nfs_open_context(desc.ctx);
 	return ret;
 out_unlock:
 	unlock_page(page);
 	return ret;
 }
 
-struct nfs_readdesc {
-	struct nfs_pageio_descriptor *pgio;
-	struct nfs_open_context *ctx;
-};
-
 static int
 readpage_async_filler(void *data, struct page *page)
 {
@@ -390,9 +390,9 @@ readpage_async_filler(void *data, struct page *page)
 
 	if (len < PAGE_SIZE)
 		zero_user_segment(page, len, PAGE_SIZE);
-	if (!nfs_pageio_add_request(desc->pgio, new)) {
+	if (!nfs_pageio_add_request(&desc->pgio, new)) {
 		nfs_list_remove_request(new);
-		error = desc->pgio->pg_error;
+		error = desc->pgio.pg_error;
 		nfs_readpage_release(new, error);
 		goto out;
 	}
@@ -407,7 +407,6 @@ out:
 int nfs_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	struct nfs_pageio_descriptor pgio;
 	struct nfs_pgio_mirror *pgm;
 	struct nfs_readdesc desc;
 	struct inode *inode = mapping->host;
@@ -440,17 +439,16 @@ int nfs_readpages(struct file *file, struct address_space *mapping,
 	if (ret == 0)
 		goto read_complete; /* all pages were read */
 
-	desc.pgio = &pgio;
-	nfs_pageio_init_read(&pgio, inode, false,
+	nfs_pageio_init_read(&desc.pgio, inode, false,
 			     &nfs_async_read_completion_ops);
 
 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
-	nfs_pageio_complete(&pgio);
+	nfs_pageio_complete(&desc.pgio);
 
 	/* It doesn't make sense to do mirrored reads! */
-	WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+	WARN_ON_ONCE(desc.pgio.pg_mirror_count != 1);
 
-	pgm = &pgio.pg_mirrors[0];
+	pgm = &desc.pgio.pg_mirrors[0];
 	NFS_I(inode)->read_io += pgm->pg_bytes_written;
 	npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >>
 		 PAGE_SHIFT;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 681ed98e4ba8..cb0248a34518 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -570,8 +570,7 @@ nfs_have_writebacks(struct inode *inode)
 extern int  nfs_readpage(struct file *, struct page *);
 extern int  nfs_readpages(struct file *, struct address_space *,
 		struct list_head *, unsigned);
-extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
-			       struct page *);
+extern int  nfs_readpage_async(void *, struct inode *, struct page *);
 
 /*
  * inline functions
-- 
cgit v1.2.3


From 1e83b173b2663b7d357309584678cf24787f0713 Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Thu, 28 Jan 2021 09:55:03 -0500
Subject: NFS: Add nfs_pageio_complete_read() and remove nfs_readpage_async()

Add nfs_pageio_complete_read() and call this from both nfs_readpage()
and nfs_readpages(), since the submission and accounting is the same
for both functions.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/fscache.c       |   4 --
 fs/nfs/read.c          | 137 ++++++++++++++++++++++---------------------------
 include/linux/nfs_fs.h |   1 -
 3 files changed, 61 insertions(+), 81 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index a60df88efc40..c4c021c6ebbd 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -390,10 +390,6 @@ static void nfs_readpage_from_fscache_complete(struct page *page,
 	if (!error) {
 		SetPageUptodate(page);
 		unlock_page(page);
-	} else {
-		error = nfs_readpage_async(context, page->mapping->host, page);
-		if (error)
-			unlock_page(page);
 	}
 }
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 0ed79e6bc486..d2b6dce1f99f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -74,6 +74,24 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
 
+static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio,
+				     struct inode *inode)
+{
+	struct nfs_pgio_mirror *pgm;
+	unsigned long npages;
+
+	nfs_pageio_complete(pgio);
+
+	/* It doesn't make sense to do mirrored reads! */
+	WARN_ON_ONCE(pgio->pg_mirror_count != 1);
+
+	pgm = &pgio->pg_mirrors[0];
+	NFS_I(inode)->read_io += pgm->pg_bytes_written;
+	npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+}
+
+
 void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
 {
 	struct nfs_pgio_mirror *mirror;
@@ -119,36 +137,6 @@ struct nfs_readdesc {
 	struct nfs_open_context *ctx;
 };
 
-static int readpage_async_filler(void *data, struct page *page);
-
-int nfs_readpage_async(void *data, struct inode *inode,
-		       struct page *page)
-{
-	struct nfs_readdesc *desc = data;
-	struct nfs_pgio_mirror *pgm;
-	int error;
-
-	nfs_pageio_init_read(&desc->pgio, inode, false,
-			     &nfs_async_read_completion_ops);
-
-	error = readpage_async_filler(desc, page);
-	if (error)
-		goto out;
-
-	nfs_pageio_complete(&desc->pgio);
-
-	/* It doesn't make sense to do mirrored reads! */
-	WARN_ON_ONCE(desc->pgio.pg_mirror_count != 1);
-
-	pgm = &desc->pgio.pg_mirrors[0];
-	NFS_I(inode)->read_io += pgm->pg_bytes_written;
-
-	return desc->pgio.pg_error < 0 ? desc->pgio.pg_error : 0;
-
-out:
-	return error;
-}
-
 static void nfs_page_group_set_uptodate(struct nfs_page *req)
 {
 	if (nfs_page_group_sync_on_bit(req, PG_UPTODATE))
@@ -170,8 +158,7 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
 
 		if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
 			/* note: regions of the page not covered by a
-			 * request are zeroed in nfs_readpage_async /
-			 * readpage_async_filler */
+			 * request are zeroed in readpage_async_filler */
 			if (bytes > hdr->good_bytes) {
 				/* nothing in this request was good, so zero
 				 * the full extent of the request */
@@ -303,6 +290,38 @@ static void nfs_readpage_result(struct rpc_task *task,
 		nfs_readpage_retry(task, hdr);
 }
 
+static int
+readpage_async_filler(void *data, struct page *page)
+{
+	struct nfs_readdesc *desc = data;
+	struct nfs_page *new;
+	unsigned int len;
+	int error;
+
+	len = nfs_page_length(page);
+	if (len == 0)
+		return nfs_return_empty_page(page);
+
+	new = nfs_create_request(desc->ctx, page, 0, len);
+	if (IS_ERR(new))
+		goto out_error;
+
+	if (len < PAGE_SIZE)
+		zero_user_segment(page, len, PAGE_SIZE);
+	if (!nfs_pageio_add_request(&desc->pgio, new)) {
+		nfs_list_remove_request(new);
+		error = desc->pgio.pg_error;
+		nfs_readpage_release(new, error);
+		goto out;
+	}
+	return 0;
+out_error:
+	error = PTR_ERR(new);
+	unlock_page(page);
+out:
+	return error;
+}
+
 /*
  * Read a page over NFS.
  * We read the page synchronously in the following case:
@@ -351,13 +370,20 @@ int nfs_readpage(struct file *file, struct page *page)
 	}
 
 	xchg(&desc.ctx->error, 0);
-	ret = nfs_readpage_async(&desc, inode, page);
+	nfs_pageio_init_read(&desc.pgio, inode, false,
+			     &nfs_async_read_completion_ops);
+
+	ret = readpage_async_filler(&desc, page);
+
+	if (!ret)
+		nfs_pageio_complete_read(&desc.pgio, inode);
+
+	ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0;
 	if (!ret) {
 		ret = wait_on_page_locked_killable(page);
 		if (!PageUptodate(page) && !ret)
 			ret = xchg(&desc.ctx->error, 0);
 	}
-	nfs_add_stats(inode, NFSIOS_READPAGES, 1);
 out:
 	put_nfs_open_context(desc.ctx);
 	return ret;
@@ -366,45 +392,11 @@ out_unlock:
 	return ret;
 }
 
-static int
-readpage_async_filler(void *data, struct page *page)
-{
-	struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
-	struct nfs_page *new;
-	unsigned int len;
-	int error;
-
-	len = nfs_page_length(page);
-	if (len == 0)
-		return nfs_return_empty_page(page);
-
-	new = nfs_create_request(desc->ctx, page, 0, len);
-	if (IS_ERR(new))
-		goto out_error;
-
-	if (len < PAGE_SIZE)
-		zero_user_segment(page, len, PAGE_SIZE);
-	if (!nfs_pageio_add_request(&desc->pgio, new)) {
-		nfs_list_remove_request(new);
-		error = desc->pgio.pg_error;
-		nfs_readpage_release(new, error);
-		goto out;
-	}
-	return 0;
-out_error:
-	error = PTR_ERR(new);
-	unlock_page(page);
-out:
-	return error;
-}
-
 int nfs_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	struct nfs_pgio_mirror *pgm;
 	struct nfs_readdesc desc;
 	struct inode *inode = mapping->host;
-	unsigned long npages;
 	int ret;
 
 	dprintk("NFS: nfs_readpages (%s/%Lu %d)\n",
@@ -437,16 +429,9 @@ int nfs_readpages(struct file *file, struct address_space *mapping,
 			     &nfs_async_read_completion_ops);
 
 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
-	nfs_pageio_complete(&desc.pgio);
 
-	/* It doesn't make sense to do mirrored reads! */
-	WARN_ON_ONCE(desc.pgio.pg_mirror_count != 1);
+	nfs_pageio_complete_read(&desc.pgio, inode);
 
-	pgm = &desc.pgio.pg_mirrors[0];
-	NFS_I(inode)->read_io += pgm->pg_bytes_written;
-	npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >>
-		 PAGE_SHIFT;
-	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
 read_complete:
 	put_nfs_open_context(desc.ctx);
 out:
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index cb0248a34518..3cfcf219e96b 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -570,7 +570,6 @@ nfs_have_writebacks(struct inode *inode)
 extern int  nfs_readpage(struct file *, struct page *);
 extern int  nfs_readpages(struct file *, struct address_space *,
 		struct list_head *, unsigned);
-extern int  nfs_readpage_async(void *, struct inode *, struct page *);
 
 /*
  * inline functions
-- 
cgit v1.2.3


From c98fe7c2a2038af2bc24f039ccdb5a65c22ba11a Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 29 Jan 2021 08:54:04 -0800
Subject: vfio: option to unmap all

For the UNMAP_DMA ioctl, delete all mappings if VFIO_DMA_UNMAP_FLAG_ALL
is set.  Define the corresponding VFIO_UNMAP_ALL extension.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/uapi/linux/vfio.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index d1812777139f..78462599e5ed 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -46,6 +46,9 @@
  */
 #define VFIO_NOIOMMU_IOMMU		8
 
+/* Supports VFIO_DMA_UNMAP_FLAG_ALL */
+#define VFIO_UNMAP_ALL			9
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
@@ -1102,6 +1105,7 @@ struct vfio_bitmap {
  * field.  No guarantee is made to the user that arbitrary unmaps of iova
  * or size different from those used in the original mapping call will
  * succeed.
+ *
  * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap
  * before unmapping IO virtual addresses. When this flag is set, the user must
  * provide a struct vfio_bitmap in data[]. User must provide zero-allocated
@@ -1111,11 +1115,15 @@ struct vfio_bitmap {
  * indicates that the page at that offset from iova is dirty. A Bitmap of the
  * pages in the range of unmapped size is returned in the user-provided
  * vfio_bitmap.data.
+ *
+ * If flags & VFIO_DMA_UNMAP_FLAG_ALL, unmap all addresses.  iova and size
+ * must be 0.  This cannot be combined with the get-dirty-bitmap flag.
  */
 struct vfio_iommu_type1_dma_unmap {
 	__u32	argsz;
 	__u32	flags;
 #define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
+#define VFIO_DMA_UNMAP_FLAG_ALL		     (1 << 1)
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */
 	__u8    data[];
-- 
cgit v1.2.3


From 441e8106a238920f28e2e79cff462044551f60e2 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 29 Jan 2021 08:54:07 -0800
Subject: vfio: interfaces to update vaddr

Define interfaces that allow the underlying memory object of an iova
range to be mapped to a new host virtual address in the host process:

  - VFIO_DMA_UNMAP_FLAG_VADDR for VFIO_IOMMU_UNMAP_DMA
  - VFIO_DMA_MAP_FLAG_VADDR flag for VFIO_IOMMU_MAP_DMA
  - VFIO_UPDATE_VADDR extension for VFIO_CHECK_EXTENSION

Unmap vaddr invalidates the host virtual address in an iova range, and
blocks vfio translation of host virtual addresses.  DMA to already-mapped
pages continues.  Map vaddr updates the base VA and resumes translation.
See comments in uapi/linux/vfio.h for more details.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/uapi/linux/vfio.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 78462599e5ed..8ce36c1d53ca 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -49,6 +49,9 @@
 /* Supports VFIO_DMA_UNMAP_FLAG_ALL */
 #define VFIO_UNMAP_ALL			9
 
+/* Supports the vaddr flag for DMA map and unmap */
+#define VFIO_UPDATE_VADDR		10
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
@@ -1077,12 +1080,22 @@ struct vfio_iommu_type1_info_dma_avail {
  *
  * Map process virtual addresses to IO virtual addresses using the
  * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required.
+ *
+ * If flags & VFIO_DMA_MAP_FLAG_VADDR, update the base vaddr for iova, and
+ * unblock translation of host virtual addresses in the iova range.  The vaddr
+ * must have previously been invalidated with VFIO_DMA_UNMAP_FLAG_VADDR.  To
+ * maintain memory consistency within the user application, the updated vaddr
+ * must address the same memory object as originally mapped.  Failure to do so
+ * will result in user memory corruption and/or device misbehavior.  iova and
+ * size must match those in the original MAP_DMA call.  Protection is not
+ * changed, and the READ & WRITE flags must be 0.
  */
 struct vfio_iommu_type1_dma_map {
 	__u32	argsz;
 	__u32	flags;
 #define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
 #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
+#define VFIO_DMA_MAP_FLAG_VADDR (1 << 2)
 	__u64	vaddr;				/* Process virtual address */
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */
@@ -1118,12 +1131,18 @@ struct vfio_bitmap {
  *
  * If flags & VFIO_DMA_UNMAP_FLAG_ALL, unmap all addresses.  iova and size
  * must be 0.  This cannot be combined with the get-dirty-bitmap flag.
+ *
+ * If flags & VFIO_DMA_UNMAP_FLAG_VADDR, do not unmap, but invalidate host
+ * virtual addresses in the iova range.  Tasks that attempt to translate an
+ * iova's vaddr will block.  DMA to already-mapped pages continues.  This
+ * cannot be combined with the get-dirty-bitmap flag.
  */
 struct vfio_iommu_type1_dma_unmap {
 	__u32	argsz;
 	__u32	flags;
 #define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
 #define VFIO_DMA_UNMAP_FLAG_ALL		     (1 << 1)
+#define VFIO_DMA_UNMAP_FLAG_VADDR	     (1 << 2)
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */
 	__u8    data[];
-- 
cgit v1.2.3


From ec5e32940cc9d2a8a321cb7d756fb6ae45702d03 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 29 Jan 2021 08:54:10 -0800
Subject: vfio: iommu driver notify callback

Define a vfio_iommu_driver_ops notify callback, for sending events to
the driver.  Drivers are not required to provide the callback, and
may ignore any events.  The handling of events is driver specific.

Define the CONTAINER_CLOSE event, called when the container's file
descriptor is closed.  This event signifies that no further state changes
will occur via container ioctl's.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio.c  | 5 +++++
 include/linux/vfio.h | 7 +++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 4ad8a35667a7..38779e6fd80c 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1220,6 +1220,11 @@ static int vfio_fops_open(struct inode *inode, struct file *filep)
 static int vfio_fops_release(struct inode *inode, struct file *filep)
 {
 	struct vfio_container *container = filep->private_data;
+	struct vfio_iommu_driver *driver = container->iommu_driver;
+
+	if (driver && driver->ops->notify)
+		driver->ops->notify(container->iommu_data,
+				    VFIO_IOMMU_CONTAINER_CLOSE);
 
 	filep->private_data = NULL;
 
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index f45940b38a02..b7e18bde5aa8 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -57,6 +57,11 @@ extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
 extern void vfio_device_put(struct vfio_device *device);
 extern void *vfio_device_data(struct vfio_device *device);
 
+/* events for the backend driver notify callback */
+enum vfio_iommu_notify_type {
+	VFIO_IOMMU_CONTAINER_CLOSE = 0,
+};
+
 /**
  * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
  */
@@ -92,6 +97,8 @@ struct vfio_iommu_driver_ops {
 				  void *data, size_t count, bool write);
 	struct iommu_domain *(*group_iommu_domain)(void *iommu_data,
 						   struct iommu_group *group);
+	void		(*notify)(void *iommu_data,
+				  enum vfio_iommu_notify_type event);
 };
 
 extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
-- 
cgit v1.2.3


From 6e736c60a9fe905b3e4f4b07171a31ad602d56d2 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Mon, 1 Feb 2021 11:13:25 -0700
Subject: coresight: Introduce device access abstraction

We are about to introduce support for sysreg access to ETMv4.4+
component. Since there are generic routines that access the
registers (e.g, CS_LOCK/UNLOCK , claim/disclaim operations, timeout)
and in order to preserve the logic of these operations at a
single place we introduce an abstraction layer for the accesses
to a given device.

Link: https://lore.kernel.org/r/20210110224850.1880240-4-suzuki.poulose@arm.com
Cc: Mike Leach <mike.leach@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Link: https://lore.kernel.org/r/20210201181351.1475223-6-mathieu.poirier@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-catu.c       |   1 +
 drivers/hwtracing/coresight/coresight-core.c       |  43 +++++
 drivers/hwtracing/coresight/coresight-cti-core.c   |   1 +
 drivers/hwtracing/coresight/coresight-etb10.c      |   1 +
 drivers/hwtracing/coresight/coresight-etm3x-core.c |   1 +
 drivers/hwtracing/coresight/coresight-etm4x-core.c |   1 +
 drivers/hwtracing/coresight/coresight-funnel.c     |   1 +
 drivers/hwtracing/coresight/coresight-replicator.c |   1 +
 drivers/hwtracing/coresight/coresight-stm.c        |   1 +
 drivers/hwtracing/coresight/coresight-tmc-core.c   |   1 +
 drivers/hwtracing/coresight/coresight-tpiu.c       |   1 +
 include/linux/coresight.h                          | 191 ++++++++++++++++++++-
 12 files changed, 241 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c
index a61313f320bd..867c932c7b26 100644
--- a/drivers/hwtracing/coresight/coresight-catu.c
+++ b/drivers/hwtracing/coresight/coresight-catu.c
@@ -551,6 +551,7 @@ static int catu_probe(struct amba_device *adev, const struct amba_id *id)
 	dev->platform_data = pdata;
 
 	drvdata->base = base;
+	catu_desc.access = CSDEV_ACCESS_IOMEM(base);
 	catu_desc.pdata = pdata;
 	catu_desc.dev = dev;
 	catu_desc.groups = catu_groups;
diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c
index 4ba801dffcb7..a38af8f0831b 100644
--- a/drivers/hwtracing/coresight/coresight-core.c
+++ b/drivers/hwtracing/coresight/coresight-core.c
@@ -1458,6 +1458,48 @@ int coresight_timeout(void __iomem *addr, u32 offset, int position, int value)
 }
 EXPORT_SYMBOL_GPL(coresight_timeout);
 
+u32 coresight_relaxed_read32(struct coresight_device *csdev, u32 offset)
+{
+	return csdev_access_relaxed_read32(&csdev->access, offset);
+}
+
+u32 coresight_read32(struct coresight_device *csdev, u32 offset)
+{
+	return csdev_access_read32(&csdev->access, offset);
+}
+
+void coresight_relaxed_write32(struct coresight_device *csdev,
+			       u32 val, u32 offset)
+{
+	csdev_access_relaxed_write32(&csdev->access, val, offset);
+}
+
+void coresight_write32(struct coresight_device *csdev, u32 val, u32 offset)
+{
+	csdev_access_write32(&csdev->access, val, offset);
+}
+
+u64 coresight_relaxed_read64(struct coresight_device *csdev, u32 offset)
+{
+	return csdev_access_relaxed_read64(&csdev->access, offset);
+}
+
+u64 coresight_read64(struct coresight_device *csdev, u32 offset)
+{
+	return csdev_access_read64(&csdev->access, offset);
+}
+
+void coresight_relaxed_write64(struct coresight_device *csdev,
+			       u64 val, u32 offset)
+{
+	csdev_access_relaxed_write64(&csdev->access, val, offset);
+}
+
+void coresight_write64(struct coresight_device *csdev, u64 val, u32 offset)
+{
+	csdev_access_write64(&csdev->access, val, offset);
+}
+
 /*
  * coresight_release_platform_data: Release references to the devices connected
  * to the output port of this device.
@@ -1522,6 +1564,7 @@ struct coresight_device *coresight_register(struct coresight_desc *desc)
 	csdev->type = desc->type;
 	csdev->subtype = desc->subtype;
 	csdev->ops = desc->ops;
+	csdev->access = desc->access;
 	csdev->orphan = false;
 
 	csdev->dev.type = &coresight_dev_type[desc->type];
diff --git a/drivers/hwtracing/coresight/coresight-cti-core.c b/drivers/hwtracing/coresight/coresight-cti-core.c
index 61dbc1afd8da..b38a8db5f252 100644
--- a/drivers/hwtracing/coresight/coresight-cti-core.c
+++ b/drivers/hwtracing/coresight/coresight-cti-core.c
@@ -870,6 +870,7 @@ static int cti_probe(struct amba_device *adev, const struct amba_id *id)
 		return PTR_ERR(base);
 
 	drvdata->base = base;
+	cti_desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	dev_set_drvdata(dev, drvdata);
 
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index 0cf6f0b947b6..cc742561a986 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -757,6 +757,7 @@ static int etb_probe(struct amba_device *adev, const struct amba_id *id)
 		return PTR_ERR(base);
 
 	drvdata->base = base;
+	desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	spin_lock_init(&drvdata->spinlock);
 
diff --git a/drivers/hwtracing/coresight/coresight-etm3x-core.c b/drivers/hwtracing/coresight/coresight-etm3x-core.c
index 5bf5a5a4ce6d..3b7837cbe376 100644
--- a/drivers/hwtracing/coresight/coresight-etm3x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x-core.c
@@ -839,6 +839,7 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
 		return PTR_ERR(base);
 
 	drvdata->base = base;
+	desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	spin_lock_init(&drvdata->spinlock);
 
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index 0924c376e35a..b9e01357ffad 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c
@@ -1626,6 +1626,7 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
 		return PTR_ERR(base);
 
 	drvdata->base = base;
+	desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	spin_lock_init(&drvdata->spinlock);
 
diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c
index 071c723227db..8f7c40d7d8d6 100644
--- a/drivers/hwtracing/coresight/coresight-funnel.c
+++ b/drivers/hwtracing/coresight/coresight-funnel.c
@@ -242,6 +242,7 @@ static int funnel_probe(struct device *dev, struct resource *res)
 		}
 		drvdata->base = base;
 		desc.groups = coresight_funnel_groups;
+		desc.access = CSDEV_ACCESS_IOMEM(base);
 	}
 
 	dev_set_drvdata(dev, drvdata);
diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c
index 7e2a2b7f503f..205756fab729 100644
--- a/drivers/hwtracing/coresight/coresight-replicator.c
+++ b/drivers/hwtracing/coresight/coresight-replicator.c
@@ -254,6 +254,7 @@ static int replicator_probe(struct device *dev, struct resource *res)
 		}
 		drvdata->base = base;
 		desc.groups = replicator_groups;
+		desc.access = CSDEV_ACCESS_IOMEM(base);
 	}
 
 	if (fwnode_property_present(dev_fwnode(dev),
diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c
index 99791773f682..41d9a922c2d4 100644
--- a/drivers/hwtracing/coresight/coresight-stm.c
+++ b/drivers/hwtracing/coresight/coresight-stm.c
@@ -884,6 +884,7 @@ static int stm_probe(struct amba_device *adev, const struct amba_id *id)
 	if (IS_ERR(base))
 		return PTR_ERR(base);
 	drvdata->base = base;
+	desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	ret = stm_get_stimulus_area(dev, &ch_res);
 	if (ret)
diff --git a/drivers/hwtracing/coresight/coresight-tmc-core.c b/drivers/hwtracing/coresight/coresight-tmc-core.c
index 8169dff5a9f6..e61b75be66b6 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-core.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-core.c
@@ -456,6 +456,7 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
 	}
 
 	drvdata->base = base;
+	desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	spin_lock_init(&drvdata->spinlock);
 
diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c
index d5dfee9ee556..6ca396799883 100644
--- a/drivers/hwtracing/coresight/coresight-tpiu.c
+++ b/drivers/hwtracing/coresight/coresight-tpiu.c
@@ -149,6 +149,7 @@ static int tpiu_probe(struct amba_device *adev, const struct amba_id *id)
 		return PTR_ERR(base);
 
 	drvdata->base = base;
+	desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	/* Disable tpiu to support older devices */
 	tpiu_disable_hw(drvdata);
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 7d3c87e5b97c..6107cf4021d3 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -7,6 +7,7 @@
 #define _LINUX_CORESIGHT_H
 
 #include <linux/device.h>
+#include <linux/io.h>
 #include <linux/perf_event.h>
 #include <linux/sched.h>
 
@@ -114,6 +115,32 @@ struct coresight_platform_data {
 	struct coresight_connection *conns;
 };
 
+/**
+ * struct csdev_access - Abstraction of a CoreSight device access.
+ *
+ * @io_mem	: True if the device has memory mapped I/O
+ * @base	: When io_mem == true, base address of the component
+ * @read	: Read from the given "offset" of the given instance.
+ * @write	: Write "val" to the given "offset".
+ */
+struct csdev_access {
+	bool io_mem;
+	union {
+		void __iomem *base;
+		struct {
+			u64 (*read)(u32 offset, bool relaxed, bool _64bit);
+			void (*write)(u64 val, u32 offset, bool relaxed,
+				      bool _64bit);
+		};
+	};
+};
+
+#define CSDEV_ACCESS_IOMEM(_addr)		\
+	((struct csdev_access)	{		\
+		.io_mem		= true,		\
+		.base		= (_addr),	\
+	})
+
 /**
  * struct coresight_desc - description of a component required from drivers
  * @type:	as defined by @coresight_dev_type.
@@ -125,6 +152,7 @@ struct coresight_platform_data {
  * @groups:	operations specific to this component. These will end up
  *		in the component's sysfs sub-directory.
  * @name:	name for the coresight device, also shown under sysfs.
+ * @access:	Describe access to the device
  */
 struct coresight_desc {
 	enum coresight_dev_type type;
@@ -134,6 +162,7 @@ struct coresight_desc {
 	struct device *dev;
 	const struct attribute_group **groups;
 	const char *name;
+	struct csdev_access access;
 };
 
 /**
@@ -173,7 +202,8 @@ struct coresight_sysfs_link {
  * @type:	as defined by @coresight_dev_type.
  * @subtype:	as defined by @coresight_dev_subtype.
  * @ops:	generic operations for this component, as defined
-		by @coresight_ops.
+ *		by @coresight_ops.
+ * @access:	Device i/o access abstraction for this device.
  * @dev:	The device entity associated to this component.
  * @refcnt:	keep track of what is in use.
  * @orphan:	true if the component has connections that haven't been linked.
@@ -195,6 +225,7 @@ struct coresight_device {
 	enum coresight_dev_type type;
 	union coresight_dev_subtype subtype;
 	const struct coresight_ops *ops;
+	struct csdev_access access;
 	struct device dev;
 	atomic_t *refcnt;
 	bool orphan;
@@ -326,6 +357,104 @@ struct coresight_ops {
 };
 
 #if IS_ENABLED(CONFIG_CORESIGHT)
+
+static inline u32 csdev_access_relaxed_read32(struct csdev_access *csa,
+					      u32 offset)
+{
+	if (likely(csa->io_mem))
+		return readl_relaxed(csa->base + offset);
+
+	return csa->read(offset, true, false);
+}
+
+static inline u32 csdev_access_read32(struct csdev_access *csa, u32 offset)
+{
+	if (likely(csa->io_mem))
+		return readl(csa->base + offset);
+
+	return csa->read(offset, false, false);
+}
+
+static inline void csdev_access_relaxed_write32(struct csdev_access *csa,
+						u32 val, u32 offset)
+{
+	if (likely(csa->io_mem))
+		writel_relaxed(val, csa->base + offset);
+	else
+		csa->write(val, offset, true, false);
+}
+
+static inline void csdev_access_write32(struct csdev_access *csa, u32 val, u32 offset)
+{
+	if (likely(csa->io_mem))
+		writel(val, csa->base + offset);
+	else
+		csa->write(val, offset, false, false);
+}
+
+#ifdef CONFIG_64BIT
+
+static inline u64 csdev_access_relaxed_read64(struct csdev_access *csa,
+					      u32 offset)
+{
+	if (likely(csa->io_mem))
+		return readq_relaxed(csa->base + offset);
+
+	return csa->read(offset, true, true);
+}
+
+static inline u64 csdev_access_read64(struct csdev_access *csa, u32 offset)
+{
+	if (likely(csa->io_mem))
+		return readq(csa->base + offset);
+
+	return csa->read(offset, false, true);
+}
+
+static inline void csdev_access_relaxed_write64(struct csdev_access *csa,
+						u64 val, u32 offset)
+{
+	if (likely(csa->io_mem))
+		writeq_relaxed(val, csa->base + offset);
+	else
+		csa->write(val, offset, true, true);
+}
+
+static inline void csdev_access_write64(struct csdev_access *csa, u64 val, u32 offset)
+{
+	if (likely(csa->io_mem))
+		writeq(val, csa->base + offset);
+	else
+		csa->write(val, offset, false, true);
+}
+
+#else	/* !CONFIG_64BIT */
+
+static inline u64 csdev_access_relaxed_read64(struct csdev_access *csa,
+					      u32 offset)
+{
+	WARN_ON(1);
+	return 0;
+}
+
+static inline u64 csdev_access_read64(struct csdev_access *csa, u32 offset)
+{
+	WARN_ON(1);
+	return 0;
+}
+
+static inline void csdev_access_relaxed_write64(struct csdev_access *csa,
+						u64 val, u32 offset)
+{
+	WARN_ON(1);
+}
+
+static inline void csdev_access_write64(struct csdev_access *csa, u64 val, u32 offset)
+{
+	WARN_ON(1);
+}
+#endif	/* CONFIG_64BIT */
+
 extern struct coresight_device *
 coresight_register(struct coresight_desc *desc);
 extern void coresight_unregister(struct coresight_device *csdev);
@@ -343,6 +472,18 @@ extern char *coresight_alloc_device_name(struct coresight_dev_list *devs,
 					 struct device *dev);
 
 extern bool coresight_loses_context_with_cpu(struct device *dev);
+
+u32 coresight_relaxed_read32(struct coresight_device *csdev, u32 offset);
+u32 coresight_read32(struct coresight_device *csdev, u32 offset);
+void coresight_write32(struct coresight_device *csdev, u32 val, u32 offset);
+void coresight_relaxed_write32(struct coresight_device *csdev,
+			       u32 val, u32 offset);
+u64 coresight_relaxed_read64(struct coresight_device *csdev, u32 offset);
+u64 coresight_read64(struct coresight_device *csdev, u32 offset);
+void coresight_relaxed_write64(struct coresight_device *csdev,
+			       u64 val, u32 offset);
+void coresight_write64(struct coresight_device *csdev, u64 val, u32 offset);
+
 #else
 static inline struct coresight_device *
 coresight_register(struct coresight_desc *desc) { return NULL; }
@@ -369,10 +510,54 @@ static inline bool coresight_loses_context_with_cpu(struct device *dev)
 {
 	return false;
 }
-#endif
+
+static inline u32 coresight_relaxed_read32(struct coresight_device *csdev, u32 offset)
+{
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+static inline u32 coresight_read32(struct coresight_device *csdev, u32 offset)
+{
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+static inline void coresight_write32(struct coresight_device *csdev, u32 val, u32 offset)
+{
+}
+
+static inline void coresight_relaxed_write32(struct coresight_device *csdev,
+					     u32 val, u32 offset)
+{
+}
+
+static inline u64 coresight_relaxed_read64(struct coresight_device *csdev,
+					   u32 offset)
+{
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+static inline u64 coresight_read64(struct coresight_device *csdev, u32 offset)
+{
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+static inline void coresight_relaxed_write64(struct coresight_device *csdev,
+					     u64 val, u32 offset)
+{
+}
+
+static inline void coresight_write64(struct coresight_device *csdev, u64 val, u32 offset)
+{
+}
+
+#endif		/* IS_ENABLED(CONFIG_CORESIGHT) */
 
 extern int coresight_get_cpu(struct device *dev);
 
 struct coresight_platform_data *coresight_get_platform_data(struct device *dev);
 
-#endif
+#endif		/* _LINUX_COREISGHT_H */
-- 
cgit v1.2.3


From 020052825e49128d381d6444d1ce079e8ca82386 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Mon, 1 Feb 2021 11:13:27 -0700
Subject: coresight: Convert coresight_timeout to use access abstraction

Convert the generic routines to use the new access abstraction layer
gradually, starting with coresigth_timeout.

Link: https://lore.kernel.org/r/20210110224850.1880240-6-suzuki.poulose@arm.com
Cc: Mike Leach <mike.leach@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Link: https://lore.kernel.org/r/20210201181351.1475223-8-mathieu.poirier@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-catu.c       |  5 ++--
 drivers/hwtracing/coresight/coresight-core.c       | 13 +++++-----
 drivers/hwtracing/coresight/coresight-etb10.c      |  5 ++--
 drivers/hwtracing/coresight/coresight-etm4x-core.c | 30 ++++++++++++++--------
 drivers/hwtracing/coresight/coresight-stm.c        |  3 ++-
 drivers/hwtracing/coresight/coresight-tmc-core.c   | 15 ++++++-----
 drivers/hwtracing/coresight/coresight-tpiu.c       |  4 +--
 include/linux/coresight.h                          | 11 +++++---
 8 files changed, 54 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c
index 867c932c7b26..d6097454d399 100644
--- a/drivers/hwtracing/coresight/coresight-catu.c
+++ b/drivers/hwtracing/coresight/coresight-catu.c
@@ -401,8 +401,9 @@ static const struct attribute_group *catu_groups[] = {
 
 static inline int catu_wait_for_ready(struct catu_drvdata *drvdata)
 {
-	return coresight_timeout(drvdata->base,
-				 CATU_STATUS, CATU_STATUS_READY, 1);
+	struct csdev_access *csa = &drvdata->csdev->access;
+
+	return coresight_timeout(csa, CATU_STATUS, CATU_STATUS_READY, 1);
 }
 
 static int catu_enable_hw(struct catu_drvdata *drvdata, void *data)
diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c
index a38af8f0831b..74985068f325 100644
--- a/drivers/hwtracing/coresight/coresight-core.c
+++ b/drivers/hwtracing/coresight/coresight-core.c
@@ -1418,23 +1418,24 @@ static void coresight_remove_conns(struct coresight_device *csdev)
 }
 
 /**
- * coresight_timeout - loop until a bit has changed to a specific state.
- * @addr: base address of the area of interest.
- * @offset: address of a register, starting from @addr.
+ * coresight_timeout - loop until a bit has changed to a specific register
+ *			state.
+ * @csa: coresight device access for the device
+ * @offset: Offset of the register from the base of the device.
  * @position: the position of the bit of interest.
  * @value: the value the bit should have.
  *
  * Return: 0 as soon as the bit has taken the desired state or -EAGAIN if
  * TIMEOUT_US has elapsed, which ever happens first.
  */
-
-int coresight_timeout(void __iomem *addr, u32 offset, int position, int value)
+int coresight_timeout(struct csdev_access *csa, u32 offset,
+		      int position, int value)
 {
 	int i;
 	u32 val;
 
 	for (i = TIMEOUT_US; i > 0; i--) {
-		val = __raw_readl(addr + offset);
+		val = csdev_access_read32(csa, offset);
 		/* waiting on the bit to go from 0 to 1 */
 		if (value) {
 			if (val & BIT(position))
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index cc742561a986..0f664aeeda93 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -252,6 +252,7 @@ static void __etb_disable_hw(struct etb_drvdata *drvdata)
 {
 	u32 ffcr;
 	struct device *dev = &drvdata->csdev->dev;
+	struct csdev_access *csa = &drvdata->csdev->access;
 
 	CS_UNLOCK(drvdata->base);
 
@@ -263,7 +264,7 @@ static void __etb_disable_hw(struct etb_drvdata *drvdata)
 	ffcr |= ETB_FFCR_FON_MAN;
 	writel_relaxed(ffcr, drvdata->base + ETB_FFCR);
 
-	if (coresight_timeout(drvdata->base, ETB_FFCR, ETB_FFCR_BIT, 0)) {
+	if (coresight_timeout(csa, ETB_FFCR, ETB_FFCR_BIT, 0)) {
 		dev_err(dev,
 		"timeout while waiting for completion of Manual Flush\n");
 	}
@@ -271,7 +272,7 @@ static void __etb_disable_hw(struct etb_drvdata *drvdata)
 	/* disable trace capture */
 	writel_relaxed(0x0, drvdata->base + ETB_CTL_REG);
 
-	if (coresight_timeout(drvdata->base, ETB_FFSR, ETB_FFSR_BIT, 1)) {
+	if (coresight_timeout(csa, ETB_FFSR, ETB_FFSR_BIT, 1)) {
 		dev_err(dev,
 			"timeout while waiting for Formatter to Stop\n");
 	}
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index b9e01357ffad..180bb6ed9090 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c
@@ -217,7 +217,9 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
 {
 	int i, rc;
 	struct etmv4_config *config = &drvdata->config;
-	struct device *etm_dev = &drvdata->csdev->dev;
+	struct coresight_device *csdev = drvdata->csdev;
+	struct device *etm_dev = &csdev->dev;
+	struct csdev_access *csa = &csdev->access;
 
 	CS_UNLOCK(drvdata->base);
 	etm4_enable_arch_specific(drvdata);
@@ -232,7 +234,7 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
 	writel_relaxed(0, drvdata->base + TRCPRGCTLR);
 
 	/* wait for TRCSTATR.IDLE to go up */
-	if (coresight_timeout(drvdata->base, TRCSTATR, TRCSTATR_IDLE_BIT, 1))
+	if (coresight_timeout(csa, TRCSTATR, TRCSTATR_IDLE_BIT, 1))
 		dev_err(etm_dev,
 			"timeout while waiting for Idle Trace Status\n");
 	if (drvdata->nr_pe)
@@ -323,7 +325,7 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
 	writel_relaxed(1, drvdata->base + TRCPRGCTLR);
 
 	/* wait for TRCSTATR.IDLE to go back down to '0' */
-	if (coresight_timeout(drvdata->base, TRCSTATR, TRCSTATR_IDLE_BIT, 0))
+	if (coresight_timeout(csa, TRCSTATR, TRCSTATR_IDLE_BIT, 0))
 		dev_err(etm_dev,
 			"timeout while waiting for Idle Trace Status\n");
 
@@ -587,7 +589,9 @@ static void etm4_disable_hw(void *info)
 	u32 control;
 	struct etmv4_drvdata *drvdata = info;
 	struct etmv4_config *config = &drvdata->config;
-	struct device *etm_dev = &drvdata->csdev->dev;
+	struct coresight_device *csdev = drvdata->csdev;
+	struct device *etm_dev = &csdev->dev;
+	struct csdev_access *csa = &csdev->access;
 	int i;
 
 	CS_UNLOCK(drvdata->base);
@@ -615,8 +619,7 @@ static void etm4_disable_hw(void *info)
 	writel_relaxed(control, drvdata->base + TRCPRGCTLR);
 
 	/* wait for TRCSTATR.PMSTABLE to go to '1' */
-	if (coresight_timeout(drvdata->base, TRCSTATR,
-			      TRCSTATR_PMSTABLE_BIT, 1))
+	if (coresight_timeout(csa, TRCSTATR, TRCSTATR_PMSTABLE_BIT, 1))
 		dev_err(etm_dev,
 			"timeout while waiting for PM stable Trace Status\n");
 
@@ -1272,7 +1275,15 @@ static int etm4_cpu_save(struct etmv4_drvdata *drvdata)
 {
 	int i, ret = 0;
 	struct etmv4_save_state *state;
-	struct device *etm_dev = &drvdata->csdev->dev;
+	struct coresight_device *csdev = drvdata->csdev;
+	struct csdev_access *csa;
+	struct device *etm_dev;
+
+	if (WARN_ON(!csdev))
+		return -ENODEV;
+
+	etm_dev = &csdev->dev;
+	csa = &csdev->access;
 
 	/*
 	 * As recommended by 3.4.1 ("The procedure when powering down the PE")
@@ -1287,8 +1298,7 @@ static int etm4_cpu_save(struct etmv4_drvdata *drvdata)
 	etm4_os_lock(drvdata);
 
 	/* wait for TRCSTATR.PMSTABLE to go up */
-	if (coresight_timeout(drvdata->base, TRCSTATR,
-			      TRCSTATR_PMSTABLE_BIT, 1)) {
+	if (coresight_timeout(csa, TRCSTATR, TRCSTATR_PMSTABLE_BIT, 1)) {
 		dev_err(etm_dev,
 			"timeout while waiting for PM Stable Status\n");
 		etm4_os_unlock(drvdata);
@@ -1377,7 +1387,7 @@ static int etm4_cpu_save(struct etmv4_drvdata *drvdata)
 		state->trcpdcr = readl(drvdata->base + TRCPDCR);
 
 	/* wait for TRCSTATR.IDLE to go up */
-	if (coresight_timeout(drvdata->base, TRCSTATR, TRCSTATR_IDLE_BIT, 1)) {
+	if (coresight_timeout(csa, TRCSTATR, TRCSTATR_IDLE_BIT, 1)) {
 		dev_err(etm_dev,
 			"timeout while waiting for Idle Trace Status\n");
 		etm4_os_unlock(drvdata);
diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c
index 41d9a922c2d4..5927316d7a03 100644
--- a/drivers/hwtracing/coresight/coresight-stm.c
+++ b/drivers/hwtracing/coresight/coresight-stm.c
@@ -258,6 +258,7 @@ static void stm_disable(struct coresight_device *csdev,
 			struct perf_event *event)
 {
 	struct stm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+	struct csdev_access *csa = &csdev->access;
 
 	/*
 	 * For as long as the tracer isn't disabled another entity can't
@@ -270,7 +271,7 @@ static void stm_disable(struct coresight_device *csdev,
 		spin_unlock(&drvdata->spinlock);
 
 		/* Wait until the engine has completely stopped */
-		coresight_timeout(drvdata->base, STMTCSR, STMTCSR_BUSY_BIT, 0);
+		coresight_timeout(csa, STMTCSR, STMTCSR_BUSY_BIT, 0);
 
 		pm_runtime_put(csdev->dev.parent);
 
diff --git a/drivers/hwtracing/coresight/coresight-tmc-core.c b/drivers/hwtracing/coresight/coresight-tmc-core.c
index e61b75be66b6..4dc1ea2c19b5 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-core.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-core.c
@@ -33,16 +33,20 @@ DEFINE_CORESIGHT_DEVLIST(etr_devs, "tmc_etr");
 
 void tmc_wait_for_tmcready(struct tmc_drvdata *drvdata)
 {
+	struct coresight_device *csdev = drvdata->csdev;
+	struct csdev_access *csa = &csdev->access;
+
 	/* Ensure formatter, unformatter and hardware fifo are empty */
-	if (coresight_timeout(drvdata->base,
-			      TMC_STS, TMC_STS_TMCREADY_BIT, 1)) {
-		dev_err(&drvdata->csdev->dev,
+	if (coresight_timeout(csa, TMC_STS, TMC_STS_TMCREADY_BIT, 1)) {
+		dev_err(&csdev->dev,
 			"timeout while waiting for TMC to be Ready\n");
 	}
 }
 
 void tmc_flush_and_stop(struct tmc_drvdata *drvdata)
 {
+	struct coresight_device *csdev = drvdata->csdev;
+	struct csdev_access *csa = &csdev->access;
 	u32 ffcr;
 
 	ffcr = readl_relaxed(drvdata->base + TMC_FFCR);
@@ -51,9 +55,8 @@ void tmc_flush_and_stop(struct tmc_drvdata *drvdata)
 	ffcr |= BIT(TMC_FFCR_FLUSHMAN_BIT);
 	writel_relaxed(ffcr, drvdata->base + TMC_FFCR);
 	/* Ensure flush completes */
-	if (coresight_timeout(drvdata->base,
-			      TMC_FFCR, TMC_FFCR_FLUSHMAN_BIT, 0)) {
-		dev_err(&drvdata->csdev->dev,
+	if (coresight_timeout(csa, TMC_FFCR, TMC_FFCR_FLUSHMAN_BIT, 0)) {
+		dev_err(&csdev->dev,
 		"timeout while waiting for completion of Manual Flush\n");
 	}
 
diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c
index a12b6ee0a576..2ec057892799 100644
--- a/drivers/hwtracing/coresight/coresight-tpiu.c
+++ b/drivers/hwtracing/coresight/coresight-tpiu.c
@@ -86,9 +86,9 @@ static void tpiu_disable_hw(struct csdev_access *csa)
 	/* Generate manual flush */
 	csdev_access_relaxed_write32(csa, FFCR_STOP_FI | FFCR_FON_MAN, TPIU_FFCR);
 	/* Wait for flush to complete */
-	coresight_timeout(csa->base, TPIU_FFCR, FFCR_FON_MAN_BIT, 0);
+	coresight_timeout(csa, TPIU_FFCR, FFCR_FON_MAN_BIT, 0);
 	/* Wait for formatter to stop */
-	coresight_timeout(csa->base, TPIU_FFSR, FFSR_FT_STOPPED_BIT, 1);
+	coresight_timeout(csa, TPIU_FFSR, FFSR_FT_STOPPED_BIT, 1);
 
 	CS_LOCK(csa->base);
 }
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 6107cf4021d3..18bc7f9fb041 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -460,7 +460,7 @@ coresight_register(struct coresight_desc *desc);
 extern void coresight_unregister(struct coresight_device *csdev);
 extern int coresight_enable(struct coresight_device *csdev);
 extern void coresight_disable(struct coresight_device *csdev);
-extern int coresight_timeout(void __iomem *addr, u32 offset,
+extern int coresight_timeout(struct csdev_access *csa, u32 offset,
 			     int position, int value);
 
 extern int coresight_claim_device(void __iomem *base);
@@ -491,8 +491,13 @@ static inline void coresight_unregister(struct coresight_device *csdev) {}
 static inline int
 coresight_enable(struct coresight_device *csdev) { return -ENOSYS; }
 static inline void coresight_disable(struct coresight_device *csdev) {}
-static inline int coresight_timeout(void __iomem *addr, u32 offset,
-				     int position, int value) { return 1; }
+
+static inline int coresight_timeout(struct csdev_access *csa, u32 offset,
+				    int position, int value)
+{
+	return 1;
+}
+
 static inline int coresight_claim_device_unlocked(void __iomem *base)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From 8ce0029658ba16c52670e869b9ccde02ae7dec6b Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Mon, 1 Feb 2021 11:13:28 -0700
Subject: coresight: Convert claim/disclaim operations to use access wrappers

Convert the generic CLAIM tag management APIs to use the
device access layer abstraction.

Link: https://lore.kernel.org/r/20210110224850.1880240-7-suzuki.poulose@arm.com
Cc: Mike Leach <mike.leach@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Link: https://lore.kernel.org/r/20210201181351.1475223-9-mathieu.poirier@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-catu.c       |  6 +-
 drivers/hwtracing/coresight/coresight-core.c       | 66 +++++++++++++---------
 drivers/hwtracing/coresight/coresight-cti-core.c   | 17 +++---
 drivers/hwtracing/coresight/coresight-etb10.c      |  4 +-
 drivers/hwtracing/coresight/coresight-etm3x-core.c |  8 ++-
 drivers/hwtracing/coresight/coresight-etm4x-core.c |  4 +-
 drivers/hwtracing/coresight/coresight-funnel.c     |  6 +-
 drivers/hwtracing/coresight/coresight-replicator.c | 12 ++--
 drivers/hwtracing/coresight/coresight-tmc-etf.c    | 10 ++--
 drivers/hwtracing/coresight/coresight-tmc-etr.c    |  4 +-
 include/linux/coresight.h                          | 16 +++---
 11 files changed, 91 insertions(+), 62 deletions(-)

(limited to 'include')

diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c
index d6097454d399..9a0b9ce4a7da 100644
--- a/drivers/hwtracing/coresight/coresight-catu.c
+++ b/drivers/hwtracing/coresight/coresight-catu.c
@@ -412,6 +412,7 @@ static int catu_enable_hw(struct catu_drvdata *drvdata, void *data)
 	u32 control, mode;
 	struct etr_buf *etr_buf = data;
 	struct device *dev = &drvdata->csdev->dev;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	if (catu_wait_for_ready(drvdata))
 		dev_warn(dev, "Timeout while waiting for READY\n");
@@ -422,7 +423,7 @@ static int catu_enable_hw(struct catu_drvdata *drvdata, void *data)
 		return -EBUSY;
 	}
 
-	rc = coresight_claim_device_unlocked(drvdata->base);
+	rc = coresight_claim_device_unlocked(csdev);
 	if (rc)
 		return rc;
 
@@ -466,9 +467,10 @@ static int catu_disable_hw(struct catu_drvdata *drvdata)
 {
 	int rc = 0;
 	struct device *dev = &drvdata->csdev->dev;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	catu_write_control(drvdata, 0);
-	coresight_disclaim_device_unlocked(drvdata->base);
+	coresight_disclaim_device_unlocked(csdev);
 	if (catu_wait_for_ready(drvdata)) {
 		dev_info(dev, "Timeout while waiting for READY\n");
 		rc = -EAGAIN;
diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c
index 74985068f325..0062c8935653 100644
--- a/drivers/hwtracing/coresight/coresight-core.c
+++ b/drivers/hwtracing/coresight/coresight-core.c
@@ -145,30 +145,32 @@ static int coresight_find_link_outport(struct coresight_device *csdev,
 	return -ENODEV;
 }
 
-static inline u32 coresight_read_claim_tags(void __iomem *base)
+static inline u32 coresight_read_claim_tags(struct coresight_device *csdev)
 {
-	return readl_relaxed(base + CORESIGHT_CLAIMCLR);
+	return csdev_access_relaxed_read32(&csdev->access, CORESIGHT_CLAIMCLR);
 }
 
-static inline bool coresight_is_claimed_self_hosted(void __iomem *base)
+static inline bool coresight_is_claimed_self_hosted(struct coresight_device *csdev)
 {
-	return coresight_read_claim_tags(base) == CORESIGHT_CLAIM_SELF_HOSTED;
+	return coresight_read_claim_tags(csdev) == CORESIGHT_CLAIM_SELF_HOSTED;
 }
 
-static inline bool coresight_is_claimed_any(void __iomem *base)
+static inline bool coresight_is_claimed_any(struct coresight_device *csdev)
 {
-	return coresight_read_claim_tags(base) != 0;
+	return coresight_read_claim_tags(csdev) != 0;
 }
 
-static inline void coresight_set_claim_tags(void __iomem *base)
+static inline void coresight_set_claim_tags(struct coresight_device *csdev)
 {
-	writel_relaxed(CORESIGHT_CLAIM_SELF_HOSTED, base + CORESIGHT_CLAIMSET);
+	csdev_access_relaxed_write32(&csdev->access, CORESIGHT_CLAIM_SELF_HOSTED,
+				     CORESIGHT_CLAIMSET);
 	isb();
 }
 
-static inline void coresight_clear_claim_tags(void __iomem *base)
+static inline void coresight_clear_claim_tags(struct coresight_device *csdev)
 {
-	writel_relaxed(CORESIGHT_CLAIM_SELF_HOSTED, base + CORESIGHT_CLAIMCLR);
+	csdev_access_relaxed_write32(&csdev->access, CORESIGHT_CLAIM_SELF_HOSTED,
+				     CORESIGHT_CLAIMCLR);
 	isb();
 }
 
@@ -182,27 +184,33 @@ static inline void coresight_clear_claim_tags(void __iomem *base)
  * Called with CS_UNLOCKed for the component.
  * Returns : 0 on success
  */
-int coresight_claim_device_unlocked(void __iomem *base)
+int coresight_claim_device_unlocked(struct coresight_device *csdev)
 {
-	if (coresight_is_claimed_any(base))
+	if (WARN_ON(!csdev))
+		return -EINVAL;
+
+	if (coresight_is_claimed_any(csdev))
 		return -EBUSY;
 
-	coresight_set_claim_tags(base);
-	if (coresight_is_claimed_self_hosted(base))
+	coresight_set_claim_tags(csdev);
+	if (coresight_is_claimed_self_hosted(csdev))
 		return 0;
 	/* There was a race setting the tags, clean up and fail */
-	coresight_clear_claim_tags(base);
+	coresight_clear_claim_tags(csdev);
 	return -EBUSY;
 }
 EXPORT_SYMBOL_GPL(coresight_claim_device_unlocked);
 
-int coresight_claim_device(void __iomem *base)
+int coresight_claim_device(struct coresight_device *csdev)
 {
 	int rc;
 
-	CS_UNLOCK(base);
-	rc = coresight_claim_device_unlocked(base);
-	CS_LOCK(base);
+	if (WARN_ON(!csdev))
+		return -EINVAL;
+
+	CS_UNLOCK(csdev->access.base);
+	rc = coresight_claim_device_unlocked(csdev);
+	CS_LOCK(csdev->access.base);
 
 	return rc;
 }
@@ -212,11 +220,14 @@ EXPORT_SYMBOL_GPL(coresight_claim_device);
  * coresight_disclaim_device_unlocked : Clear the claim tags for the device.
  * Called with CS_UNLOCKed for the component.
  */
-void coresight_disclaim_device_unlocked(void __iomem *base)
+void coresight_disclaim_device_unlocked(struct coresight_device *csdev)
 {
 
-	if (coresight_is_claimed_self_hosted(base))
-		coresight_clear_claim_tags(base);
+	if (WARN_ON(!csdev))
+		return;
+
+	if (coresight_is_claimed_self_hosted(csdev))
+		coresight_clear_claim_tags(csdev);
 	else
 		/*
 		 * The external agent may have not honoured our claim
@@ -227,11 +238,14 @@ void coresight_disclaim_device_unlocked(void __iomem *base)
 }
 EXPORT_SYMBOL_GPL(coresight_disclaim_device_unlocked);
 
-void coresight_disclaim_device(void __iomem *base)
+void coresight_disclaim_device(struct coresight_device *csdev)
 {
-	CS_UNLOCK(base);
-	coresight_disclaim_device_unlocked(base);
-	CS_LOCK(base);
+	if (WARN_ON(!csdev))
+		return;
+
+	CS_UNLOCK(csdev->access.base);
+	coresight_disclaim_device_unlocked(csdev);
+	CS_LOCK(csdev->access.base);
 }
 EXPORT_SYMBOL_GPL(coresight_disclaim_device);
 
diff --git a/drivers/hwtracing/coresight/coresight-cti-core.c b/drivers/hwtracing/coresight/coresight-cti-core.c
index b38a8db5f252..0c81eb9603ae 100644
--- a/drivers/hwtracing/coresight/coresight-cti-core.c
+++ b/drivers/hwtracing/coresight/coresight-cti-core.c
@@ -102,7 +102,7 @@ static int cti_enable_hw(struct cti_drvdata *drvdata)
 		goto cti_state_unchanged;
 
 	/* claim the device */
-	rc = coresight_claim_device(drvdata->base);
+	rc = coresight_claim_device(drvdata->csdev);
 	if (rc)
 		goto cti_err_not_enabled;
 
@@ -136,7 +136,7 @@ static void cti_cpuhp_enable_hw(struct cti_drvdata *drvdata)
 		goto cti_hp_not_enabled;
 
 	/* try to claim the device */
-	if (coresight_claim_device(drvdata->base))
+	if (coresight_claim_device(drvdata->csdev))
 		goto cti_hp_not_enabled;
 
 	cti_write_all_hw_regs(drvdata);
@@ -154,6 +154,7 @@ static int cti_disable_hw(struct cti_drvdata *drvdata)
 {
 	struct cti_config *config = &drvdata->config;
 	struct device *dev = &drvdata->csdev->dev;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	spin_lock(&drvdata->spinlock);
 
@@ -171,7 +172,7 @@ static int cti_disable_hw(struct cti_drvdata *drvdata)
 	writel_relaxed(0, drvdata->base + CTICONTROL);
 	config->hw_enabled = false;
 
-	coresight_disclaim_device_unlocked(drvdata->base);
+	coresight_disclaim_device_unlocked(csdev);
 	CS_LOCK(drvdata->base);
 	spin_unlock(&drvdata->spinlock);
 	pm_runtime_put(dev);
@@ -655,6 +656,7 @@ static int cti_cpu_pm_notify(struct notifier_block *nb, unsigned long cmd,
 			     void *v)
 {
 	struct cti_drvdata *drvdata;
+	struct coresight_device *csdev;
 	unsigned int cpu = smp_processor_id();
 	int notify_res = NOTIFY_OK;
 
@@ -662,6 +664,7 @@ static int cti_cpu_pm_notify(struct notifier_block *nb, unsigned long cmd,
 		return NOTIFY_OK;
 
 	drvdata = cti_cpu_drvdata[cpu];
+	csdev = drvdata->csdev;
 
 	if (WARN_ON_ONCE(drvdata->ctidev.cpu != cpu))
 		return NOTIFY_BAD;
@@ -673,13 +676,13 @@ static int cti_cpu_pm_notify(struct notifier_block *nb, unsigned long cmd,
 		/* CTI regs all static - we have a copy & nothing to save */
 		drvdata->config.hw_powered = false;
 		if (drvdata->config.hw_enabled)
-			coresight_disclaim_device(drvdata->base);
+			coresight_disclaim_device(csdev);
 		break;
 
 	case CPU_PM_ENTER_FAILED:
 		drvdata->config.hw_powered = true;
 		if (drvdata->config.hw_enabled) {
-			if (coresight_claim_device(drvdata->base))
+			if (coresight_claim_device(csdev))
 				drvdata->config.hw_enabled = false;
 		}
 		break;
@@ -692,7 +695,7 @@ static int cti_cpu_pm_notify(struct notifier_block *nb, unsigned long cmd,
 		/* check enable reference count to enable HW */
 		if (atomic_read(&drvdata->config.enable_req_count)) {
 			/* check we can claim the device as we re-power */
-			if (coresight_claim_device(drvdata->base))
+			if (coresight_claim_device(csdev))
 				goto cti_notify_exit;
 
 			drvdata->config.hw_enabled = true;
@@ -736,7 +739,7 @@ static int cti_dying_cpu(unsigned int cpu)
 	spin_lock(&drvdata->spinlock);
 	drvdata->config.hw_powered = false;
 	if (drvdata->config.hw_enabled)
-		coresight_disclaim_device(drvdata->base);
+		coresight_disclaim_device(drvdata->csdev);
 	spin_unlock(&drvdata->spinlock);
 	return 0;
 }
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index 0f664aeeda93..74922c94f4b1 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -132,7 +132,7 @@ static void __etb_enable_hw(struct etb_drvdata *drvdata)
 
 static int etb_enable_hw(struct etb_drvdata *drvdata)
 {
-	int rc = coresight_claim_device(drvdata->base);
+	int rc = coresight_claim_device(drvdata->csdev);
 
 	if (rc)
 		return rc;
@@ -345,7 +345,7 @@ static void etb_disable_hw(struct etb_drvdata *drvdata)
 {
 	__etb_disable_hw(drvdata);
 	etb_dump_hw(drvdata);
-	coresight_disclaim_device(drvdata->base);
+	coresight_disclaim_device(drvdata->csdev);
 }
 
 static int etb_disable(struct coresight_device *csdev)
diff --git a/drivers/hwtracing/coresight/coresight-etm3x-core.c b/drivers/hwtracing/coresight/coresight-etm3x-core.c
index 3b7837cbe376..29d4dba4bee9 100644
--- a/drivers/hwtracing/coresight/coresight-etm3x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x-core.c
@@ -358,10 +358,11 @@ static int etm_enable_hw(struct etm_drvdata *drvdata)
 	int i, rc;
 	u32 etmcr;
 	struct etm_config *config = &drvdata->config;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	CS_UNLOCK(drvdata->base);
 
-	rc = coresight_claim_device_unlocked(drvdata->base);
+	rc = coresight_claim_device_unlocked(csdev);
 	if (rc)
 		goto done;
 
@@ -566,6 +567,7 @@ static void etm_disable_hw(void *info)
 	int i;
 	struct etm_drvdata *drvdata = info;
 	struct etm_config *config = &drvdata->config;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	CS_UNLOCK(drvdata->base);
 	etm_set_prog(drvdata);
@@ -577,7 +579,7 @@ static void etm_disable_hw(void *info)
 		config->cntr_val[i] = etm_readl(drvdata, ETMCNTVRn(i));
 
 	etm_set_pwrdwn(drvdata);
-	coresight_disclaim_device_unlocked(drvdata->base);
+	coresight_disclaim_device_unlocked(csdev);
 
 	CS_LOCK(drvdata->base);
 
@@ -602,7 +604,7 @@ static void etm_disable_perf(struct coresight_device *csdev)
 	 * power down the tracer.
 	 */
 	etm_set_pwrdwn(drvdata);
-	coresight_disclaim_device_unlocked(drvdata->base);
+	coresight_disclaim_device_unlocked(csdev);
 
 	CS_LOCK(drvdata->base);
 }
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index 180bb6ed9090..a041ad52737f 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c
@@ -226,7 +226,7 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
 
 	etm4_os_unlock(drvdata);
 
-	rc = coresight_claim_device_unlocked(drvdata->base);
+	rc = coresight_claim_device_unlocked(csdev);
 	if (rc)
 		goto done;
 
@@ -635,7 +635,7 @@ static void etm4_disable_hw(void *info)
 			readl_relaxed(drvdata->base + TRCCNTVRn(i));
 	}
 
-	coresight_disclaim_device_unlocked(drvdata->base);
+	coresight_disclaim_device_unlocked(csdev);
 
 	CS_LOCK(drvdata->base);
 
diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c
index 8f7c40d7d8d6..38bebdddcbff 100644
--- a/drivers/hwtracing/coresight/coresight-funnel.c
+++ b/drivers/hwtracing/coresight/coresight-funnel.c
@@ -52,13 +52,14 @@ static int dynamic_funnel_enable_hw(struct funnel_drvdata *drvdata, int port)
 {
 	u32 functl;
 	int rc = 0;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	CS_UNLOCK(drvdata->base);
 
 	functl = readl_relaxed(drvdata->base + FUNNEL_FUNCTL);
 	/* Claim the device only when we enable the first slave */
 	if (!(functl & FUNNEL_ENSx_MASK)) {
-		rc = coresight_claim_device_unlocked(drvdata->base);
+		rc = coresight_claim_device_unlocked(csdev);
 		if (rc)
 			goto done;
 	}
@@ -101,6 +102,7 @@ static void dynamic_funnel_disable_hw(struct funnel_drvdata *drvdata,
 				      int inport)
 {
 	u32 functl;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	CS_UNLOCK(drvdata->base);
 
@@ -110,7 +112,7 @@ static void dynamic_funnel_disable_hw(struct funnel_drvdata *drvdata,
 
 	/* Disclaim the device if none of the slaves are now active */
 	if (!(functl & FUNNEL_ENSx_MASK))
-		coresight_disclaim_device_unlocked(drvdata->base);
+		coresight_disclaim_device_unlocked(csdev);
 
 	CS_LOCK(drvdata->base);
 }
diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c
index 205756fab729..a73fea9185b6 100644
--- a/drivers/hwtracing/coresight/coresight-replicator.c
+++ b/drivers/hwtracing/coresight/coresight-replicator.c
@@ -45,12 +45,14 @@ struct replicator_drvdata {
 
 static void dynamic_replicator_reset(struct replicator_drvdata *drvdata)
 {
+	struct coresight_device *csdev = drvdata->csdev;
+
 	CS_UNLOCK(drvdata->base);
 
-	if (!coresight_claim_device_unlocked(drvdata->base)) {
+	if (!coresight_claim_device_unlocked(csdev)) {
 		writel_relaxed(0xff, drvdata->base + REPLICATOR_IDFILTER0);
 		writel_relaxed(0xff, drvdata->base + REPLICATOR_IDFILTER1);
-		coresight_disclaim_device_unlocked(drvdata->base);
+		coresight_disclaim_device_unlocked(csdev);
 	}
 
 	CS_LOCK(drvdata->base);
@@ -70,6 +72,7 @@ static int dynamic_replicator_enable(struct replicator_drvdata *drvdata,
 {
 	int rc = 0;
 	u32 id0val, id1val;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	CS_UNLOCK(drvdata->base);
 
@@ -84,7 +87,7 @@ static int dynamic_replicator_enable(struct replicator_drvdata *drvdata,
 		id0val = id1val = 0xff;
 
 	if (id0val == 0xff && id1val == 0xff)
-		rc = coresight_claim_device_unlocked(drvdata->base);
+		rc = coresight_claim_device_unlocked(csdev);
 
 	if (!rc) {
 		switch (outport) {
@@ -140,6 +143,7 @@ static void dynamic_replicator_disable(struct replicator_drvdata *drvdata,
 				       int inport, int outport)
 {
 	u32 reg;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	switch (outport) {
 	case 0:
@@ -160,7 +164,7 @@ static void dynamic_replicator_disable(struct replicator_drvdata *drvdata,
 
 	if ((readl_relaxed(drvdata->base + REPLICATOR_IDFILTER0) == 0xff) &&
 	    (readl_relaxed(drvdata->base + REPLICATOR_IDFILTER1) == 0xff))
-		coresight_disclaim_device_unlocked(drvdata->base);
+		coresight_disclaim_device_unlocked(csdev);
 	CS_LOCK(drvdata->base);
 }
 
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index 989d965f3d90..45b85edfc690 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -37,7 +37,7 @@ static void __tmc_etb_enable_hw(struct tmc_drvdata *drvdata)
 
 static int tmc_etb_enable_hw(struct tmc_drvdata *drvdata)
 {
-	int rc = coresight_claim_device(drvdata->base);
+	int rc = coresight_claim_device(drvdata->csdev);
 
 	if (rc)
 		return rc;
@@ -88,7 +88,7 @@ static void __tmc_etb_disable_hw(struct tmc_drvdata *drvdata)
 static void tmc_etb_disable_hw(struct tmc_drvdata *drvdata)
 {
 	__tmc_etb_disable_hw(drvdata);
-	coresight_disclaim_device(drvdata->base);
+	coresight_disclaim_device(drvdata->csdev);
 }
 
 static void __tmc_etf_enable_hw(struct tmc_drvdata *drvdata)
@@ -109,7 +109,7 @@ static void __tmc_etf_enable_hw(struct tmc_drvdata *drvdata)
 
 static int tmc_etf_enable_hw(struct tmc_drvdata *drvdata)
 {
-	int rc = coresight_claim_device(drvdata->base);
+	int rc = coresight_claim_device(drvdata->csdev);
 
 	if (rc)
 		return rc;
@@ -120,11 +120,13 @@ static int tmc_etf_enable_hw(struct tmc_drvdata *drvdata)
 
 static void tmc_etf_disable_hw(struct tmc_drvdata *drvdata)
 {
+	struct coresight_device *csdev = drvdata->csdev;
+
 	CS_UNLOCK(drvdata->base);
 
 	tmc_flush_and_stop(drvdata);
 	tmc_disable_hw(drvdata);
-	coresight_disclaim_device_unlocked(drvdata->base);
+	coresight_disclaim_device_unlocked(csdev);
 	CS_LOCK(drvdata->base);
 }
 
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index bf5230e39c5b..acdb59e0e661 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1040,7 +1040,7 @@ static int tmc_etr_enable_hw(struct tmc_drvdata *drvdata,
 	rc = tmc_etr_enable_catu(drvdata, etr_buf);
 	if (rc)
 		return rc;
-	rc = coresight_claim_device(drvdata->base);
+	rc = coresight_claim_device(drvdata->csdev);
 	if (!rc) {
 		drvdata->etr_buf = etr_buf;
 		__tmc_etr_enable_hw(drvdata);
@@ -1134,7 +1134,7 @@ void tmc_etr_disable_hw(struct tmc_drvdata *drvdata)
 	__tmc_etr_disable_hw(drvdata);
 	/* Disable CATU device if this ETR is connected to one */
 	tmc_etr_disable_catu(drvdata);
-	coresight_disclaim_device(drvdata->base);
+	coresight_disclaim_device(drvdata->csdev);
 	/* Reset the ETR buf used by hardware */
 	drvdata->etr_buf = NULL;
 }
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 18bc7f9fb041..976ec2697610 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -463,11 +463,11 @@ extern void coresight_disable(struct coresight_device *csdev);
 extern int coresight_timeout(struct csdev_access *csa, u32 offset,
 			     int position, int value);
 
-extern int coresight_claim_device(void __iomem *base);
-extern int coresight_claim_device_unlocked(void __iomem *base);
+extern int coresight_claim_device(struct coresight_device *csdev);
+extern int coresight_claim_device_unlocked(struct coresight_device *csdev);
 
-extern void coresight_disclaim_device(void __iomem *base);
-extern void coresight_disclaim_device_unlocked(void __iomem *base);
+extern void coresight_disclaim_device(struct coresight_device *csdev);
+extern void coresight_disclaim_device_unlocked(struct coresight_device *csdev);
 extern char *coresight_alloc_device_name(struct coresight_dev_list *devs,
 					 struct device *dev);
 
@@ -498,18 +498,18 @@ static inline int coresight_timeout(struct csdev_access *csa, u32 offset,
 	return 1;
 }
 
-static inline int coresight_claim_device_unlocked(void __iomem *base)
+static inline int coresight_claim_device_unlocked(struct coresight_device *csdev)
 {
 	return -EINVAL;
 }
 
-static inline int coresight_claim_device(void __iomem *base)
+static inline int coresight_claim_device(struct coresight_device *csdev)
 {
 	return -EINVAL;
 }
 
-static inline void coresight_disclaim_device(void __iomem *base) {}
-static inline void coresight_disclaim_device_unlocked(void __iomem *base) {}
+static inline void coresight_disclaim_device(struct coresight_device *csdev) {}
+static inline void coresight_disclaim_device_unlocked(struct coresight_device *csdev) {}
 
 static inline bool coresight_loses_context_with_cpu(struct device *dev)
 {
-- 
cgit v1.2.3


From c30f259a213802d5f4440fbb02e73b3b03403049 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Feb 2021 11:59:32 -0500
Subject: rpcrdma: Capture bytes received in Receive completion tracepoints

Make it easier to spot messages of an unusual size.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Acked-by: Tom Talpey <tom@talpey.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h | 50 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 76e85e16854b..c838e7ac1c2d 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -60,6 +60,51 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class,
 				),					\
 				TP_ARGS(wc, cid))
 
+DECLARE_EVENT_CLASS(rpcrdma_receive_completion_class,
+	TP_PROTO(
+		const struct ib_wc *wc,
+		const struct rpc_rdma_cid *cid
+	),
+
+	TP_ARGS(wc, cid),
+
+	TP_STRUCT__entry(
+		__field(u32, cq_id)
+		__field(int, completion_id)
+		__field(u32, received)
+		__field(unsigned long, status)
+		__field(unsigned int, vendor_err)
+	),
+
+	TP_fast_assign(
+		__entry->cq_id = cid->ci_queue_id;
+		__entry->completion_id = cid->ci_completion_id;
+		__entry->status = wc->status;
+		if (wc->status) {
+			__entry->received = 0;
+			__entry->vendor_err = wc->vendor_err;
+		} else {
+			__entry->received = wc->byte_len;
+			__entry->vendor_err = 0;
+		}
+	),
+
+	TP_printk("cq.id=%u cid=%d status=%s (%lu/0x%x) received=%u",
+		__entry->cq_id, __entry->completion_id,
+		rdma_show_wc_status(__entry->status),
+		__entry->status, __entry->vendor_err,
+		__entry->received
+	)
+);
+
+#define DEFINE_RECEIVE_COMPLETION_EVENT(name)				\
+		DEFINE_EVENT(rpcrdma_receive_completion_class, name,	\
+				TP_PROTO(				\
+					const struct ib_wc *wc,		\
+					const struct rpc_rdma_cid *cid	\
+				),					\
+				TP_ARGS(wc, cid))
+
 DECLARE_EVENT_CLASS(xprtrdma_reply_class,
 	TP_PROTO(
 		const struct rpcrdma_rep *rep
@@ -838,7 +883,8 @@ TRACE_EVENT(xprtrdma_post_linv_err,
  ** Completion events
  **/
 
-DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive);
+DEFINE_RECEIVE_COMPLETION_EVENT(xprtrdma_wc_receive);
+
 DEFINE_COMPLETION_EVENT(xprtrdma_wc_send);
 DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg);
 DEFINE_COMPLETION_EVENT(xprtrdma_wc_li);
@@ -1790,7 +1836,7 @@ TRACE_EVENT(svcrdma_post_recv,
 	)
 );
 
-DEFINE_COMPLETION_EVENT(svcrdma_wc_receive);
+DEFINE_RECEIVE_COMPLETION_EVENT(svcrdma_wc_receive);
 
 TRACE_EVENT(svcrdma_rq_post_err,
 	TP_PROTO(
-- 
cgit v1.2.3


From 167790abb90fa073d8341ee0e408ccad3d2109cd Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Fri, 22 Jan 2021 15:06:29 +0800
Subject: soundwire: export sdw_write/read_no_pm functions

sdw_write_no_pm and sdw_read_no_pm are useful when we want to do IO
without touching PM.

Fixes: 0231453bc08f ('soundwire: bus: add clock stop helpers')
Fixes: 60ee9be25571 ('soundwire: bus: add PM/no-PM versions of read/write functions')
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20210122070634.12825-5-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/bus.c       | 7 ++++---
 include/linux/soundwire/sdw.h | 2 ++
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
index 675d028e923d..45e9fc9f472a 100644
--- a/drivers/soundwire/bus.c
+++ b/drivers/soundwire/bus.c
@@ -407,10 +407,11 @@ sdw_nwrite_no_pm(struct sdw_slave *slave, u32 addr, size_t count, u8 *val)
 	return sdw_transfer(slave->bus, &msg);
 }
 
-static int sdw_write_no_pm(struct sdw_slave *slave, u32 addr, u8 value)
+int sdw_write_no_pm(struct sdw_slave *slave, u32 addr, u8 value)
 {
 	return sdw_nwrite_no_pm(slave, addr, 1, &value);
 }
+EXPORT_SYMBOL(sdw_write_no_pm);
 
 static int
 sdw_bread_no_pm(struct sdw_bus *bus, u16 dev_num, u32 addr)
@@ -478,8 +479,7 @@ int sdw_bwrite_no_pm_unlocked(struct sdw_bus *bus, u16 dev_num, u32 addr, u8 val
 }
 EXPORT_SYMBOL(sdw_bwrite_no_pm_unlocked);
 
-static int
-sdw_read_no_pm(struct sdw_slave *slave, u32 addr)
+int sdw_read_no_pm(struct sdw_slave *slave, u32 addr)
 {
 	u8 buf;
 	int ret;
@@ -490,6 +490,7 @@ sdw_read_no_pm(struct sdw_slave *slave, u32 addr)
 	else
 		return buf;
 }
+EXPORT_SYMBOL(sdw_read_no_pm);
 
 static int sdw_update_no_pm(struct sdw_slave *slave, u32 addr, u8 mask, u8 val)
 {
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index f0b01b728640..d08039d65825 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -1005,6 +1005,8 @@ int sdw_bus_exit_clk_stop(struct sdw_bus *bus);
 
 int sdw_read(struct sdw_slave *slave, u32 addr);
 int sdw_write(struct sdw_slave *slave, u32 addr, u8 value);
+int sdw_write_no_pm(struct sdw_slave *slave, u32 addr, u8 value);
+int sdw_read_no_pm(struct sdw_slave *slave, u32 addr);
 int sdw_nread(struct sdw_slave *slave, u32 addr, size_t count, u8 *val);
 int sdw_nwrite(struct sdw_slave *slave, u32 addr, size_t count, u8 *val);
 
-- 
cgit v1.2.3


From a006050575745ca2be25118b90f1c37f454ac542 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 13:13:25 +0100
Subject: module: use RCU to synchronize find_module

Allow for a RCU-sched critical section around find_module, following
the lower level find_module_all helper, and switch the two callers
outside of module.c to use such a RCU-sched critical section instead
of module_mutex.

Reviewed-by: Petr Mladek <pmladek@suse.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h      | 2 +-
 kernel/livepatch/core.c     | 5 +++--
 kernel/module.c             | 1 -
 kernel/trace/trace_kprobe.c | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 7a0bcb5b1ffc..a64aa84d1b18 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -586,7 +586,7 @@ static inline bool within_module(unsigned long addr, const struct module *mod)
 	return within_module_init(addr, mod) || within_module_core(addr, mod);
 }
 
-/* Search for module by name: must hold module_mutex. */
+/* Search for module by name: must be in a RCU-sched critical section. */
 struct module *find_module(const char *name);
 
 struct symsearch {
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index f76fdb925532..262cd9b003b9 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -19,6 +19,7 @@
 #include <linux/moduleloader.h>
 #include <linux/completion.h>
 #include <linux/memory.h>
+#include <linux/rcupdate.h>
 #include <asm/cacheflush.h>
 #include "core.h"
 #include "patch.h"
@@ -57,7 +58,7 @@ static void klp_find_object_module(struct klp_object *obj)
 	if (!klp_is_module(obj))
 		return;
 
-	mutex_lock(&module_mutex);
+	rcu_read_lock_sched();
 	/*
 	 * We do not want to block removal of patched modules and therefore
 	 * we do not take a reference here. The patches are removed by
@@ -74,7 +75,7 @@ static void klp_find_object_module(struct klp_object *obj)
 	if (mod && mod->klp_alive)
 		obj->mod = mod;
 
-	mutex_unlock(&module_mutex);
+	rcu_read_unlock_sched();
 }
 
 static bool klp_initialized(void)
diff --git a/kernel/module.c b/kernel/module.c
index 8fb16e704b89..63cc03393a07 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -668,7 +668,6 @@ static struct module *find_module_all(const char *name, size_t len,
 
 struct module *find_module(const char *name)
 {
-	module_assert_mutex();
 	return find_module_all(name, strlen(name), false);
 }
 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index e6fba1798771..3137992baa5e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -124,9 +124,9 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
 	if (!p)
 		return true;
 	*p = '\0';
-	mutex_lock(&module_mutex);
+	rcu_read_lock_sched();
 	ret = !!find_module(tk->symbol);
-	mutex_unlock(&module_mutex);
+	rcu_read_unlock_sched();
 	*p = ':';
 
 	return ret;
-- 
cgit v1.2.3


From 3e3552056ab42f883d7723eeb42fed712b66bacf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 13:13:27 +0100
Subject: kallsyms: only build {,module_}kallsyms_on_each_symbol when required

kallsyms_on_each_symbol and module_kallsyms_on_each_symbol are only used
by the livepatching code, so don't build them if livepatching is not
enabled.

Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/kallsyms.h | 17 ++++-------------
 include/linux/module.h   | 16 ++++------------
 kernel/kallsyms.c        |  2 ++
 kernel/module.c          |  2 ++
 4 files changed, 12 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index 481273f0c72d..465060acc981 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -71,15 +71,14 @@ static inline void *dereference_symbol_descriptor(void *ptr)
 	return ptr;
 }
 
-#ifdef CONFIG_KALLSYMS
-/* Lookup the address for a symbol. Returns 0 if not found. */
-unsigned long kallsyms_lookup_name(const char *name);
-
-/* Call a function on each kallsyms symbol in the core kernel */
 int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
 				      unsigned long),
 			    void *data);
 
+#ifdef CONFIG_KALLSYMS
+/* Lookup the address for a symbol. Returns 0 if not found. */
+unsigned long kallsyms_lookup_name(const char *name);
+
 extern int kallsyms_lookup_size_offset(unsigned long addr,
 				  unsigned long *symbolsize,
 				  unsigned long *offset);
@@ -108,14 +107,6 @@ static inline unsigned long kallsyms_lookup_name(const char *name)
 	return 0;
 }
 
-static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *,
-						    struct module *,
-						    unsigned long),
-					  void *data)
-{
-	return 0;
-}
-
 static inline int kallsyms_lookup_size_offset(unsigned long addr,
 					      unsigned long *symbolsize,
 					      unsigned long *offset)
diff --git a/include/linux/module.h b/include/linux/module.h
index a64aa84d1b18..3ea4ffae608f 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -608,10 +608,6 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 /* Look for this name: can be of form module:name. */
 unsigned long module_kallsyms_lookup_name(const char *name);
 
-int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
-					     struct module *, unsigned long),
-				   void *data);
-
 extern void __noreturn __module_put_and_exit(struct module *mod,
 			long code);
 #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code)
@@ -795,14 +791,6 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name)
 	return 0;
 }
 
-static inline int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
-							   struct module *,
-							   unsigned long),
-						 void *data)
-{
-	return 0;
-}
-
 static inline int register_module_notifier(struct notifier_block *nb)
 {
 	/* no events will happen anyway, so this can always succeed */
@@ -891,4 +879,8 @@ static inline bool module_sig_ok(struct module *module)
 }
 #endif	/* CONFIG_MODULE_SIG */
 
+int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+					     struct module *, unsigned long),
+				   void *data);
+
 #endif /* _LINUX_MODULE_H */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index a0d3f0865916..8043a90aa50e 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -177,6 +177,7 @@ unsigned long kallsyms_lookup_name(const char *name)
 	return module_kallsyms_lookup_name(name);
 }
 
+#ifdef CONFIG_LIVEPATCH
 /*
  * Iterate over all symbols in vmlinux.  For symbols from modules use
  * module_kallsyms_on_each_symbol instead.
@@ -198,6 +199,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
 	}
 	return 0;
 }
+#endif /* CONFIG_LIVEPATCH */
 
 static unsigned long get_symbol_pos(unsigned long addr,
 				    unsigned long *symbolsize,
diff --git a/kernel/module.c b/kernel/module.c
index 481eb348564e..06155c454779 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4488,6 +4488,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
 	return ret;
 }
 
+#ifdef CONFIG_LIVEPATCH
 int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 					     struct module *, unsigned long),
 				   void *data)
@@ -4518,6 +4519,7 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 	mutex_unlock(&module_mutex);
 	return ret;
 }
+#endif /* CONFIG_LIVEPATCH */
 #endif /* CONFIG_KALLSYMS */
 
 /* Maximum number of characters written by module_flags() */
-- 
cgit v1.2.3


From 922f2a7c822bf76dffb218331bd95b1eea3cf637 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 13:13:28 +0100
Subject: module: mark module_mutex static

Except for two lockdep asserts module_mutex is only used in module.c.
Remove the two asserts given that the functions they are in are not
exported and just called from the module code, and mark module_mutex
static.

Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h | 2 --
 kernel/module.c        | 2 +-
 lib/bug.c              | 3 ---
 3 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 3ea4ffae608f..0f360c48fe92 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -550,8 +550,6 @@ static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym)
 }
 #endif
 
-extern struct mutex module_mutex;
-
 /* FIXME: It'd be nice to isolate modules during init, too, so they
    aren't used before they (may) fail.  But presently too much code
    (IDE & SCSI) require entry into the module during init.*/
diff --git a/kernel/module.c b/kernel/module.c
index 06155c454779..9befd793997e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -87,7 +87,7 @@
  * 3) module_addr_min/module_addr_max.
  * (delete and add uses RCU list operations).
  */
-DEFINE_MUTEX(module_mutex);
+static DEFINE_MUTEX(module_mutex);
 static LIST_HEAD(modules);
 
 /* Work queue for freeing init sections in success case */
diff --git a/lib/bug.c b/lib/bug.c
index 7103440c0ee1..8f9d537bfb2a 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -91,8 +91,6 @@ void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
 	char *secstrings;
 	unsigned int i;
 
-	lockdep_assert_held(&module_mutex);
-
 	mod->bug_table = NULL;
 	mod->num_bugs = 0;
 
@@ -118,7 +116,6 @@ void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
 
 void module_bug_cleanup(struct module *mod)
 {
-	lockdep_assert_held(&module_mutex);
 	list_del_rcu(&mod->bug_list);
 }
 
-- 
cgit v1.2.3


From 00cc2c1cd34f81f777085cb2d65267edcd403fd0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 13:13:32 +0100
Subject: module: move struct symsearch to module.c

struct symsearch is only used inside of module.h, so move the definition
out of module.h.

Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h | 11 -----------
 kernel/module.c        | 11 +++++++++++
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 0f360c48fe92..da0f5966ee80 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -587,17 +587,6 @@ static inline bool within_module(unsigned long addr, const struct module *mod)
 /* Search for module by name: must be in a RCU-sched critical section. */
 struct module *find_module(const char *name);
 
-struct symsearch {
-	const struct kernel_symbol *start, *stop;
-	const s32 *crcs;
-	enum mod_license {
-		NOT_GPL_ONLY,
-		GPL_ONLY,
-		WILL_BE_GPL_ONLY,
-	} license;
-	bool unused;
-};
-
 /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if
    symnum out of range. */
 int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
diff --git a/kernel/module.c b/kernel/module.c
index 2e7ddf544338..9cd6814405b9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -428,6 +428,17 @@ extern const s32 __start___kcrctab_unused_gpl[];
 #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
 #endif
 
+struct symsearch {
+	const struct kernel_symbol *start, *stop;
+	const s32 *crcs;
+	enum mod_license {
+		NOT_GPL_ONLY,
+		GPL_ONLY,
+		WILL_BE_GPL_ONLY,
+	} license;
+	bool unused;
+};
+
 struct find_symbol_arg {
 	/* Input */
 	const char *name;
-- 
cgit v1.2.3


From f1c3d73e973cfad85ff5d3d86086503e742d8c62 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 13:13:33 +0100
Subject: module: remove EXPORT_SYMBOL_GPL_FUTURE

As far as I can tell this has never been used at all, and certainly
not any time recently.

Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 arch/x86/tools/relocs.c           |  4 ++--
 include/asm-generic/vmlinux.lds.h | 14 --------------
 include/linux/export.h            |  1 -
 include/linux/module.h            |  5 -----
 kernel/module.c                   | 29 ++---------------------------
 scripts/mod/modpost.c             | 13 +------------
 scripts/mod/modpost.h             |  1 -
 scripts/module.lds.S              |  2 --
 tools/include/linux/export.h      |  1 -
 9 files changed, 5 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index ce7188cbdae5..0d210d0e83e2 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -61,8 +61,8 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
 	"(__iommu_table|__apicdrivers|__smp_locks)(|_end)|"
 	"__(start|end)_pci_.*|"
 	"__(start|end)_builtin_fw|"
-	"__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
-	"__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
+	"__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl)|"
+	"__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl)|"
 	"__(start|stop)___param|"
 	"__(start|stop)___modver|"
 	"__(start|stop)___bug_table|"
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index b2b3d81b1535..83243506e68b 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -495,13 +495,6 @@
 		__stop___ksymtab_unused_gpl = .;			\
 	}								\
 									\
-	/* Kernel symbol table: GPL-future-only symbols */		\
-	__ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - LOAD_OFFSET) { \
-		__start___ksymtab_gpl_future = .;			\
-		KEEP(*(SORT(___ksymtab_gpl_future+*)))			\
-		__stop___ksymtab_gpl_future = .;			\
-	}								\
-									\
 	/* Kernel symbol table: Normal symbols */			\
 	__kcrctab         : AT(ADDR(__kcrctab) - LOAD_OFFSET) {		\
 		__start___kcrctab = .;					\
@@ -530,13 +523,6 @@
 		__stop___kcrctab_unused_gpl = .;			\
 	}								\
 									\
-	/* Kernel symbol table: GPL-future-only symbols */		\
-	__kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - LOAD_OFFSET) { \
-		__start___kcrctab_gpl_future = .;			\
-		KEEP(*(SORT(___kcrctab_gpl_future+*)))			\
-		__stop___kcrctab_gpl_future = .;			\
-	}								\
-									\
 	/* Kernel symbol table: strings */				\
         __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) {	\
 		*(__ksymtab_strings)					\
diff --git a/include/linux/export.h b/include/linux/export.h
index fceb5e855717..362b64f8d4a7 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -157,7 +157,6 @@ struct kernel_symbol {
 
 #define EXPORT_SYMBOL(sym)		_EXPORT_SYMBOL(sym, "")
 #define EXPORT_SYMBOL_GPL(sym)		_EXPORT_SYMBOL(sym, "_gpl")
-#define EXPORT_SYMBOL_GPL_FUTURE(sym)	_EXPORT_SYMBOL(sym, "_gpl_future")
 #define EXPORT_SYMBOL_NS(sym, ns)	__EXPORT_SYMBOL(sym, "", #ns)
 #define EXPORT_SYMBOL_NS_GPL(sym, ns)	__EXPORT_SYMBOL(sym, "_gpl", #ns)
 
diff --git a/include/linux/module.h b/include/linux/module.h
index da0f5966ee80..e6e50ee30412 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -411,11 +411,6 @@ struct module {
 
 	bool async_probe_requested;
 
-	/* symbols that will be GPL-only in the near future. */
-	const struct kernel_symbol *gpl_future_syms;
-	const s32 *gpl_future_crcs;
-	unsigned int num_gpl_future_syms;
-
 	/* Exception table */
 	unsigned int num_exentries;
 	struct exception_table_entry *extable;
diff --git a/kernel/module.c b/kernel/module.c
index 9cd6814405b9..95186c9d81ea 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -408,11 +408,8 @@ extern const struct kernel_symbol __start___ksymtab[];
 extern const struct kernel_symbol __stop___ksymtab[];
 extern const struct kernel_symbol __start___ksymtab_gpl[];
 extern const struct kernel_symbol __stop___ksymtab_gpl[];
-extern const struct kernel_symbol __start___ksymtab_gpl_future[];
-extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
 extern const s32 __start___kcrctab[];
 extern const s32 __start___kcrctab_gpl[];
-extern const s32 __start___kcrctab_gpl_future[];
 #ifdef CONFIG_UNUSED_SYMBOLS
 extern const struct kernel_symbol __start___ksymtab_unused[];
 extern const struct kernel_symbol __stop___ksymtab_unused[];
@@ -434,7 +431,6 @@ struct symsearch {
 	enum mod_license {
 		NOT_GPL_ONLY,
 		GPL_ONLY,
-		WILL_BE_GPL_ONLY,
 	} license;
 	bool unused;
 };
@@ -458,15 +454,8 @@ static bool check_exported_symbol(const struct symsearch *syms,
 {
 	struct find_symbol_arg *fsa = data;
 
-	if (!fsa->gplok) {
-		if (syms->license == GPL_ONLY)
-			return false;
-		if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) {
-			pr_warn("Symbol %s is being used by a non-GPL module, "
-				"which will not be allowed in the future\n",
-				fsa->name);
-		}
-	}
+	if (!fsa->gplok && syms->license == GPL_ONLY)
+		return false;
 
 #ifdef CONFIG_UNUSED_SYMBOLS
 	if (syms->unused && fsa->warn) {
@@ -550,9 +539,6 @@ static bool find_symbol(struct find_symbol_arg *fsa)
 		{ __start___ksymtab_gpl, __stop___ksymtab_gpl,
 		  __start___kcrctab_gpl,
 		  GPL_ONLY, false },
-		{ __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
-		  __start___kcrctab_gpl_future,
-		  WILL_BE_GPL_ONLY, false },
 #ifdef CONFIG_UNUSED_SYMBOLS
 		{ __start___ksymtab_unused, __stop___ksymtab_unused,
 		  __start___kcrctab_unused,
@@ -579,10 +565,6 @@ static bool find_symbol(struct find_symbol_arg *fsa)
 			{ mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
 			  mod->gpl_crcs,
 			  GPL_ONLY, false },
-			{ mod->gpl_future_syms,
-			  mod->gpl_future_syms + mod->num_gpl_future_syms,
-			  mod->gpl_future_crcs,
-			  WILL_BE_GPL_ONLY, false },
 #ifdef CONFIG_UNUSED_SYMBOLS
 			{ mod->unused_syms,
 			  mod->unused_syms + mod->num_unused_syms,
@@ -2292,7 +2274,6 @@ static int verify_exported_symbols(struct module *mod)
 	} arr[] = {
 		{ mod->syms, mod->num_syms },
 		{ mod->gpl_syms, mod->num_gpl_syms },
-		{ mod->gpl_future_syms, mod->num_gpl_future_syms },
 #ifdef CONFIG_UNUSED_SYMBOLS
 		{ mod->unused_syms, mod->num_unused_syms },
 		{ mod->unused_gpl_syms, mod->num_unused_gpl_syms },
@@ -3308,11 +3289,6 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 				     sizeof(*mod->gpl_syms),
 				     &mod->num_gpl_syms);
 	mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
-	mod->gpl_future_syms = section_objs(info,
-					    "__ksymtab_gpl_future",
-					    sizeof(*mod->gpl_future_syms),
-					    &mod->num_gpl_future_syms);
-	mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
 
 #ifdef CONFIG_UNUSED_SYMBOLS
 	mod->unused_syms = section_objs(info, "__ksymtab_unused",
@@ -3506,7 +3482,6 @@ static int check_module_license_and_versions(struct module *mod)
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
 	    || (mod->num_gpl_syms && !mod->gpl_crcs)
-	    || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
 #ifdef CONFIG_UNUSED_SYMBOLS
 	    || (mod->num_unused_syms && !mod->unused_crcs)
 	    || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index d6c81657d695..25c1446055d1 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -44,7 +44,7 @@ static bool error_occurred;
 
 enum export {
 	export_plain,      export_unused,     export_gpl,
-	export_unused_gpl, export_gpl_future, export_unknown
+	export_unused_gpl, export_unknown
 };
 
 /* In kernel, this size is defined in linux/module.h;
@@ -304,7 +304,6 @@ static const struct {
 	{ .str = "EXPORT_UNUSED_SYMBOL",     .export = export_unused },
 	{ .str = "EXPORT_SYMBOL_GPL",        .export = export_gpl },
 	{ .str = "EXPORT_UNUSED_SYMBOL_GPL", .export = export_unused_gpl },
-	{ .str = "EXPORT_SYMBOL_GPL_FUTURE", .export = export_gpl_future },
 	{ .str = "(unknown)",                .export = export_unknown },
 };
 
@@ -369,8 +368,6 @@ static enum export export_from_secname(struct elf_info *elf, unsigned int sec)
 		return export_gpl;
 	else if (strstarts(secname, "___ksymtab_unused_gpl+"))
 		return export_unused_gpl;
-	else if (strstarts(secname, "___ksymtab_gpl_future+"))
-		return export_gpl_future;
 	else
 		return export_unknown;
 }
@@ -385,8 +382,6 @@ static enum export export_from_sec(struct elf_info *elf, unsigned int sec)
 		return export_gpl;
 	else if (sec == elf->export_unused_gpl_sec)
 		return export_unused_gpl;
-	else if (sec == elf->export_gpl_future_sec)
-		return export_gpl_future;
 	else
 		return export_unknown;
 }
@@ -596,8 +591,6 @@ static int parse_elf(struct elf_info *info, const char *filename)
 			info->export_gpl_sec = i;
 		else if (strcmp(secname, "__ksymtab_unused_gpl") == 0)
 			info->export_unused_gpl_sec = i;
-		else if (strcmp(secname, "__ksymtab_gpl_future") == 0)
-			info->export_gpl_future_sec = i;
 
 		if (sechdrs[i].sh_type == SHT_SYMTAB) {
 			unsigned int sh_link_idx;
@@ -2152,10 +2145,6 @@ static void check_for_gpl_usage(enum export exp, const char *m, const char *s)
 		error("GPL-incompatible module %s.ko uses GPL-only symbol marked UNUSED '%s'\n",
 		      m, s);
 		break;
-	case export_gpl_future:
-		warn("GPL-incompatible module %s.ko uses future GPL-only symbol '%s'\n",
-		     m, s);
-		break;
 	case export_plain:
 	case export_unused:
 	case export_unknown:
diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h
index e6f46eee0af0..834220de002b 100644
--- a/scripts/mod/modpost.h
+++ b/scripts/mod/modpost.h
@@ -142,7 +142,6 @@ struct elf_info {
 	Elf_Section  export_unused_sec;
 	Elf_Section  export_gpl_sec;
 	Elf_Section  export_unused_gpl_sec;
-	Elf_Section  export_gpl_future_sec;
 	char         *strtab;
 	char	     *modinfo;
 	unsigned int modinfo_len;
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index 69b9b71a6a47..d82b452e8a71 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -13,12 +13,10 @@ SECTIONS {
 	__ksymtab_gpl		0 : { *(SORT(___ksymtab_gpl+*)) }
 	__ksymtab_unused	0 : { *(SORT(___ksymtab_unused+*)) }
 	__ksymtab_unused_gpl	0 : { *(SORT(___ksymtab_unused_gpl+*)) }
-	__ksymtab_gpl_future	0 : { *(SORT(___ksymtab_gpl_future+*)) }
 	__kcrctab		0 : { *(SORT(___kcrctab+*)) }
 	__kcrctab_gpl		0 : { *(SORT(___kcrctab_gpl+*)) }
 	__kcrctab_unused	0 : { *(SORT(___kcrctab_unused+*)) }
 	__kcrctab_unused_gpl	0 : { *(SORT(___kcrctab_unused_gpl+*)) }
-	__kcrctab_gpl_future	0 : { *(SORT(___kcrctab_gpl_future+*)) }
 
 	.init_array		0 : ALIGN(8) { *(SORT(.init_array.*)) *(.init_array) }
 
diff --git a/tools/include/linux/export.h b/tools/include/linux/export.h
index d07e586b9ba0..9f61349a8944 100644
--- a/tools/include/linux/export.h
+++ b/tools/include/linux/export.h
@@ -3,7 +3,6 @@
 
 #define EXPORT_SYMBOL(sym)
 #define EXPORT_SYMBOL_GPL(sym)
-#define EXPORT_SYMBOL_GPL_FUTURE(sym)
 #define EXPORT_UNUSED_SYMBOL(sym)
 #define EXPORT_UNUSED_SYMBOL_GPL(sym)
 
-- 
cgit v1.2.3


From 367948220fcefcad1bf0d3d595a06efe0694acae Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 13:13:34 +0100
Subject: module: remove EXPORT_UNUSED_SYMBOL*

EXPORT_UNUSED_SYMBOL* is not actually used anywhere.  Remove the
unused functionality as we generally just remove unused code anyway.

Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 arch/arm/configs/bcm2835_defconfig          |  1 -
 arch/arm/configs/mxs_defconfig              |  1 -
 arch/mips/configs/nlm_xlp_defconfig         |  1 -
 arch/mips/configs/nlm_xlr_defconfig         |  1 -
 arch/parisc/configs/generic-32bit_defconfig |  1 -
 arch/parisc/configs/generic-64bit_defconfig |  1 -
 arch/powerpc/configs/ppc6xx_defconfig       |  1 -
 arch/s390/configs/debug_defconfig           |  1 -
 arch/s390/configs/defconfig                 |  1 -
 arch/sh/configs/edosk7760_defconfig         |  1 -
 arch/sh/configs/sdk7780_defconfig           |  1 -
 arch/x86/configs/i386_defconfig             |  1 -
 arch/x86/configs/x86_64_defconfig           |  1 -
 arch/x86/tools/relocs.c                     |  4 +-
 include/asm-generic/vmlinux.lds.h           | 28 ------------
 include/linux/export.h                      |  8 ----
 include/linux/module.h                      | 12 -----
 init/Kconfig                                | 17 -------
 kernel/module.c                             | 71 +++--------------------------
 scripts/checkpatch.pl                       |  6 +--
 scripts/mod/modpost.c                       | 39 ++--------------
 scripts/mod/modpost.h                       |  2 -
 scripts/module.lds.S                        |  4 --
 tools/include/linux/export.h                |  2 -
 24 files changed, 13 insertions(+), 193 deletions(-)

(limited to 'include')

diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig
index 44ff9cd88d81..d6c6c2e031c4 100644
--- a/arch/arm/configs/bcm2835_defconfig
+++ b/arch/arm/configs/bcm2835_defconfig
@@ -177,7 +177,6 @@ CONFIG_BOOT_PRINTK_DELAY=y
 CONFIG_DYNAMIC_DEBUG=y
 CONFIG_DEBUG_INFO=y
 # CONFIG_ENABLE_MUST_CHECK is not set
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_LOCKUP_DETECTOR=y
 CONFIG_SCHED_TRACER=y
diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig
index a9c6f32a9b1c..ca32446b187f 100644
--- a/arch/arm/configs/mxs_defconfig
+++ b/arch/arm/configs/mxs_defconfig
@@ -164,7 +164,6 @@ CONFIG_FONTS=y
 CONFIG_PRINTK_TIME=y
 CONFIG_DEBUG_INFO=y
 CONFIG_FRAME_WARN=2048
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_SOFTLOCKUP_DETECTOR=y
diff --git a/arch/mips/configs/nlm_xlp_defconfig b/arch/mips/configs/nlm_xlp_defconfig
index 72a211d2d556..32c290611723 100644
--- a/arch/mips/configs/nlm_xlp_defconfig
+++ b/arch/mips/configs/nlm_xlp_defconfig
@@ -549,7 +549,6 @@ CONFIG_PRINTK_TIME=y
 CONFIG_DEBUG_INFO=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_FRAME_WARN=1024
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_SCHEDSTATS=y
diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig
index 4ecb157e56d4..bf9b9244929e 100644
--- a/arch/mips/configs/nlm_xlr_defconfig
+++ b/arch/mips/configs/nlm_xlr_defconfig
@@ -500,7 +500,6 @@ CONFIG_CRC7=m
 CONFIG_PRINTK_TIME=y
 CONFIG_DEBUG_INFO=y
 # CONFIG_ENABLE_MUST_CHECK is not set
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_SCHEDSTATS=y
diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig
index 3cbcfad5f724..7611d48c599e 100644
--- a/arch/parisc/configs/generic-32bit_defconfig
+++ b/arch/parisc/configs/generic-32bit_defconfig
@@ -22,7 +22,6 @@ CONFIG_PCI_LBA=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-CONFIG_UNUSED_SYMBOLS=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_BINFMT_MISC=m
diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig
index 8f81fcbf04c4..53054b81461a 100644
--- a/arch/parisc/configs/generic-64bit_defconfig
+++ b/arch/parisc/configs/generic-64bit_defconfig
@@ -31,7 +31,6 @@ CONFIG_MODULE_FORCE_LOAD=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
 CONFIG_MODVERSIONS=y
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_BLK_DEV_INTEGRITY=y
 CONFIG_BINFMT_MISC=m
 # CONFIG_COMPACTION is not set
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index ef09f3cce1fa..34c3859040f9 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -1072,7 +1072,6 @@ CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_KOI8_R=m
 CONFIG_NLS_KOI8_U=m
 CONFIG_DEBUG_INFO=y
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_HEADERS_INSTALL=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index c4f6ff98a612..58e54d17e315 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -71,7 +71,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 CONFIG_MODVERSIONS=y
 CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_MODULE_SIG_SHA256=y
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_BLK_DEV_INTEGRITY=y
 CONFIG_BLK_DEV_THROTTLING=y
 CONFIG_BLK_WBT=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 51135893cffe..b5e62c0d3e23 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -66,7 +66,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 CONFIG_MODVERSIONS=y
 CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_MODULE_SIG_SHA256=y
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_BLK_DEV_THROTTLING=y
 CONFIG_BLK_WBT=y
 CONFIG_BLK_CGROUP_IOLATENCY=y
diff --git a/arch/sh/configs/edosk7760_defconfig b/arch/sh/configs/edosk7760_defconfig
index 02ba62298576..d77f54e906fd 100644
--- a/arch/sh/configs/edosk7760_defconfig
+++ b/arch/sh/configs/edosk7760_defconfig
@@ -102,7 +102,6 @@ CONFIG_NLS_UTF8=y
 CONFIG_PRINTK_TIME=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_MAGIC_SYSRQ=y
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/sh/configs/sdk7780_defconfig b/arch/sh/configs/sdk7780_defconfig
index d10a0414123a..d53c4595fb2e 100644
--- a/arch/sh/configs/sdk7780_defconfig
+++ b/arch/sh/configs/sdk7780_defconfig
@@ -130,7 +130,6 @@ CONFIG_NLS_ISO8859_15=y
 CONFIG_NLS_UTF8=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_MAGIC_SYSRQ=y
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
 # CONFIG_SCHED_DEBUG is not set
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 78210793d357..9c9c4a888b1d 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -50,7 +50,6 @@ CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_UNUSED_SYMBOLS is not set
 CONFIG_BINFMT_MISC=y
 CONFIG_NET=y
 CONFIG_PACKET=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9936528e1939..b60bd2d86034 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -48,7 +48,6 @@ CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_UNUSED_SYMBOLS is not set
 CONFIG_BINFMT_MISC=y
 CONFIG_NET=y
 CONFIG_PACKET=y
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 0d210d0e83e2..b9c577a3cacc 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -61,8 +61,8 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
 	"(__iommu_table|__apicdrivers|__smp_locks)(|_end)|"
 	"__(start|end)_pci_.*|"
 	"__(start|end)_builtin_fw|"
-	"__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl)|"
-	"__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl)|"
+	"__(start|stop)___ksymtab(|_gpl)|"
+	"__(start|stop)___kcrctab(|_gpl)|"
 	"__(start|stop)___param|"
 	"__(start|stop)___modver|"
 	"__(start|stop)___bug_table|"
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 83243506e68b..1fa338ac6a54 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -481,20 +481,6 @@
 		__stop___ksymtab_gpl = .;				\
 	}								\
 									\
-	/* Kernel symbol table: Normal unused symbols */		\
-	__ksymtab_unused  : AT(ADDR(__ksymtab_unused) - LOAD_OFFSET) {	\
-		__start___ksymtab_unused = .;				\
-		KEEP(*(SORT(___ksymtab_unused+*)))			\
-		__stop___ksymtab_unused = .;				\
-	}								\
-									\
-	/* Kernel symbol table: GPL-only unused symbols */		\
-	__ksymtab_unused_gpl : AT(ADDR(__ksymtab_unused_gpl) - LOAD_OFFSET) { \
-		__start___ksymtab_unused_gpl = .;			\
-		KEEP(*(SORT(___ksymtab_unused_gpl+*)))			\
-		__stop___ksymtab_unused_gpl = .;			\
-	}								\
-									\
 	/* Kernel symbol table: Normal symbols */			\
 	__kcrctab         : AT(ADDR(__kcrctab) - LOAD_OFFSET) {		\
 		__start___kcrctab = .;					\
@@ -509,20 +495,6 @@
 		__stop___kcrctab_gpl = .;				\
 	}								\
 									\
-	/* Kernel symbol table: Normal unused symbols */		\
-	__kcrctab_unused  : AT(ADDR(__kcrctab_unused) - LOAD_OFFSET) {	\
-		__start___kcrctab_unused = .;				\
-		KEEP(*(SORT(___kcrctab_unused+*)))			\
-		__stop___kcrctab_unused = .;				\
-	}								\
-									\
-	/* Kernel symbol table: GPL-only unused symbols */		\
-	__kcrctab_unused_gpl : AT(ADDR(__kcrctab_unused_gpl) - LOAD_OFFSET) { \
-		__start___kcrctab_unused_gpl = .;			\
-		KEEP(*(SORT(___kcrctab_unused_gpl+*)))			\
-		__stop___kcrctab_unused_gpl = .;			\
-	}								\
-									\
 	/* Kernel symbol table: strings */				\
         __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) {	\
 		*(__ksymtab_strings)					\
diff --git a/include/linux/export.h b/include/linux/export.h
index 362b64f8d4a7..6271a5d9c988 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -160,14 +160,6 @@ struct kernel_symbol {
 #define EXPORT_SYMBOL_NS(sym, ns)	__EXPORT_SYMBOL(sym, "", #ns)
 #define EXPORT_SYMBOL_NS_GPL(sym, ns)	__EXPORT_SYMBOL(sym, "_gpl", #ns)
 
-#ifdef CONFIG_UNUSED_SYMBOLS
-#define EXPORT_UNUSED_SYMBOL(sym)	_EXPORT_SYMBOL(sym, "_unused")
-#define EXPORT_UNUSED_SYMBOL_GPL(sym)	_EXPORT_SYMBOL(sym, "_unused_gpl")
-#else
-#define EXPORT_UNUSED_SYMBOL(sym)
-#define EXPORT_UNUSED_SYMBOL_GPL(sym)
-#endif
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _LINUX_EXPORT_H */
diff --git a/include/linux/module.h b/include/linux/module.h
index e6e50ee30412..59f094fa6f74 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -392,18 +392,6 @@ struct module {
 	const s32 *gpl_crcs;
 	bool using_gplonly_symbols;
 
-#ifdef CONFIG_UNUSED_SYMBOLS
-	/* unused exported symbols. */
-	const struct kernel_symbol *unused_syms;
-	const s32 *unused_crcs;
-	unsigned int num_unused_syms;
-
-	/* GPL-only, unused exported symbols. */
-	unsigned int num_unused_gpl_syms;
-	const struct kernel_symbol *unused_gpl_syms;
-	const s32 *unused_gpl_crcs;
-#endif
-
 #ifdef CONFIG_MODULE_SIG
 	/* Signature was verified. */
 	bool sig_ok;
diff --git a/init/Kconfig b/init/Kconfig
index b77c60f8b963..11b803b45c19 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2262,25 +2262,8 @@ config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
 
 	  If unsure, say N.
 
-config UNUSED_SYMBOLS
-	bool "Enable unused/obsolete exported symbols"
-	default y if X86
-	help
-	  Unused but exported symbols make the kernel needlessly bigger.  For
-	  that reason most of these unused exports will soon be removed.  This
-	  option is provided temporarily to provide a transition period in case
-	  some external kernel module needs one of these symbols anyway. If you
-	  encounter such a case in your module, consider if you are actually
-	  using the right API.  (rationale: since nobody in the kernel is using
-	  this in a module, there is a pretty good chance it's actually the
-	  wrong interface to use).  If you really need the symbol, please send a
-	  mail to the linux kernel mailing list mentioning the symbol and why
-	  you really need it, and what the merge plan to the mainline kernel for
-	  your module is.
-
 config TRIM_UNUSED_KSYMS
 	bool "Trim unused exported kernel symbols"
-	depends on !UNUSED_SYMBOLS
 	help
 	  The kernel and some modules make many symbols available for
 	  other modules to use via EXPORT_SYMBOL() and variants. Depending
diff --git a/kernel/module.c b/kernel/module.c
index 95186c9d81ea..93f360250bcb 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -410,14 +410,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
 extern const struct kernel_symbol __stop___ksymtab_gpl[];
 extern const s32 __start___kcrctab[];
 extern const s32 __start___kcrctab_gpl[];
-#ifdef CONFIG_UNUSED_SYMBOLS
-extern const struct kernel_symbol __start___ksymtab_unused[];
-extern const struct kernel_symbol __stop___ksymtab_unused[];
-extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
-extern const s32 __start___kcrctab_unused[];
-extern const s32 __start___kcrctab_unused_gpl[];
-#endif
 
 #ifndef CONFIG_MODVERSIONS
 #define symversion(base, idx) NULL
@@ -432,7 +424,6 @@ struct symsearch {
 		NOT_GPL_ONLY,
 		GPL_ONLY,
 	} license;
-	bool unused;
 };
 
 struct find_symbol_arg {
@@ -456,19 +447,6 @@ static bool check_exported_symbol(const struct symsearch *syms,
 
 	if (!fsa->gplok && syms->license == GPL_ONLY)
 		return false;
-
-#ifdef CONFIG_UNUSED_SYMBOLS
-	if (syms->unused && fsa->warn) {
-		pr_warn("Symbol %s is marked as UNUSED, however this module is "
-			"using it.\n", fsa->name);
-		pr_warn("This symbol will go away in the future.\n");
-		pr_warn("Please evaluate if this is the right api to use and "
-			"if it really is, submit a report to the linux kernel "
-			"mailing list together with submitting your code for "
-			"inclusion.\n");
-	}
-#endif
-
 	fsa->owner = owner;
 	fsa->crc = symversion(syms->crcs, symnum);
 	fsa->sym = &syms->start[symnum];
@@ -535,18 +513,10 @@ static bool find_symbol(struct find_symbol_arg *fsa)
 {
 	static const struct symsearch arr[] = {
 		{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
-		  NOT_GPL_ONLY, false },
+		  NOT_GPL_ONLY },
 		{ __start___ksymtab_gpl, __stop___ksymtab_gpl,
 		  __start___kcrctab_gpl,
-		  GPL_ONLY, false },
-#ifdef CONFIG_UNUSED_SYMBOLS
-		{ __start___ksymtab_unused, __stop___ksymtab_unused,
-		  __start___kcrctab_unused,
-		  NOT_GPL_ONLY, true },
-		{ __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
-		  __start___kcrctab_unused_gpl,
-		  GPL_ONLY, true },
-#endif
+		  GPL_ONLY },
 	};
 	struct module *mod;
 	unsigned int i;
@@ -561,20 +531,10 @@ static bool find_symbol(struct find_symbol_arg *fsa)
 				lockdep_is_held(&module_mutex)) {
 		struct symsearch arr[] = {
 			{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
-			  NOT_GPL_ONLY, false },
+			  NOT_GPL_ONLY },
 			{ mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
 			  mod->gpl_crcs,
-			  GPL_ONLY, false },
-#ifdef CONFIG_UNUSED_SYMBOLS
-			{ mod->unused_syms,
-			  mod->unused_syms + mod->num_unused_syms,
-			  mod->unused_crcs,
-			  NOT_GPL_ONLY, true },
-			{ mod->unused_gpl_syms,
-			  mod->unused_gpl_syms + mod->num_unused_gpl_syms,
-			  mod->unused_gpl_crcs,
-			  GPL_ONLY, true },
-#endif
+			  GPL_ONLY },
 		};
 
 		if (mod->state == MODULE_STATE_UNFORMED)
@@ -2274,10 +2234,6 @@ static int verify_exported_symbols(struct module *mod)
 	} arr[] = {
 		{ mod->syms, mod->num_syms },
 		{ mod->gpl_syms, mod->num_gpl_syms },
-#ifdef CONFIG_UNUSED_SYMBOLS
-		{ mod->unused_syms, mod->num_unused_syms },
-		{ mod->unused_gpl_syms, mod->num_unused_gpl_syms },
-#endif
 	};
 
 	for (i = 0; i < ARRAY_SIZE(arr); i++) {
@@ -3290,16 +3246,6 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 				     &mod->num_gpl_syms);
 	mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
 
-#ifdef CONFIG_UNUSED_SYMBOLS
-	mod->unused_syms = section_objs(info, "__ksymtab_unused",
-					sizeof(*mod->unused_syms),
-					&mod->num_unused_syms);
-	mod->unused_crcs = section_addr(info, "__kcrctab_unused");
-	mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
-					    sizeof(*mod->unused_gpl_syms),
-					    &mod->num_unused_gpl_syms);
-	mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
-#endif
 #ifdef CONFIG_CONSTRUCTORS
 	mod->ctors = section_objs(info, ".ctors",
 				  sizeof(*mod->ctors), &mod->num_ctors);
@@ -3480,13 +3426,8 @@ static int check_module_license_and_versions(struct module *mod)
 		pr_warn("%s: module license taints kernel.\n", mod->name);
 
 #ifdef CONFIG_MODVERSIONS
-	if ((mod->num_syms && !mod->crcs)
-	    || (mod->num_gpl_syms && !mod->gpl_crcs)
-#ifdef CONFIG_UNUSED_SYMBOLS
-	    || (mod->num_unused_syms && !mod->unused_crcs)
-	    || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
-#endif
-		) {
+	if ((mod->num_syms && !mod->crcs) ||
+	    (mod->num_gpl_syms && !mod->gpl_crcs)) {
 		return try_to_force_load(mod,
 					 "no versions for exported symbols");
 	}
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 92e888ed939f..eabd2d5467b1 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -4290,8 +4290,7 @@ sub process {
 		if (defined $realline_next &&
 		    exists $lines[$realline_next - 1] &&
 		    !defined $suppress_export{$realline_next} &&
-		    ($lines[$realline_next - 1] =~ /EXPORT_SYMBOL.*\((.*)\)/ ||
-		     $lines[$realline_next - 1] =~ /EXPORT_UNUSED_SYMBOL.*\((.*)\)/)) {
+		    ($lines[$realline_next - 1] =~ /EXPORT_SYMBOL.*\((.*)\)/)) {
 			# Handle definitions which produce identifiers with
 			# a prefix:
 			#   XXX(foo);
@@ -4318,8 +4317,7 @@ sub process {
 		}
 		if (!defined $suppress_export{$linenr} &&
 		    $prevline =~ /^.\s*$/ &&
-		    ($line =~ /EXPORT_SYMBOL.*\((.*)\)/ ||
-		     $line =~ /EXPORT_UNUSED_SYMBOL.*\((.*)\)/)) {
+		    ($line =~ /EXPORT_SYMBOL.*\((.*)\)/)) {
 #print "FOO B <$lines[$linenr - 1]>\n";
 			$suppress_export{$linenr} = 2;
 		}
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 25c1446055d1..20fc57837881 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -43,8 +43,9 @@ static int allow_missing_ns_imports;
 static bool error_occurred;
 
 enum export {
-	export_plain,      export_unused,     export_gpl,
-	export_unused_gpl, export_unknown
+	export_plain,
+	export_gpl,
+	export_unknown
 };
 
 /* In kernel, this size is defined in linux/module.h;
@@ -301,9 +302,7 @@ static const struct {
 	enum export export;
 } export_list[] = {
 	{ .str = "EXPORT_SYMBOL",            .export = export_plain },
-	{ .str = "EXPORT_UNUSED_SYMBOL",     .export = export_unused },
 	{ .str = "EXPORT_SYMBOL_GPL",        .export = export_gpl },
-	{ .str = "EXPORT_UNUSED_SYMBOL_GPL", .export = export_unused_gpl },
 	{ .str = "(unknown)",                .export = export_unknown },
 };
 
@@ -362,12 +361,8 @@ static enum export export_from_secname(struct elf_info *elf, unsigned int sec)
 
 	if (strstarts(secname, "___ksymtab+"))
 		return export_plain;
-	else if (strstarts(secname, "___ksymtab_unused+"))
-		return export_unused;
 	else if (strstarts(secname, "___ksymtab_gpl+"))
 		return export_gpl;
-	else if (strstarts(secname, "___ksymtab_unused_gpl+"))
-		return export_unused_gpl;
 	else
 		return export_unknown;
 }
@@ -376,12 +371,8 @@ static enum export export_from_sec(struct elf_info *elf, unsigned int sec)
 {
 	if (sec == elf->export_sec)
 		return export_plain;
-	else if (sec == elf->export_unused_sec)
-		return export_unused;
 	else if (sec == elf->export_gpl_sec)
 		return export_gpl;
-	else if (sec == elf->export_unused_gpl_sec)
-		return export_unused_gpl;
 	else
 		return export_unknown;
 }
@@ -585,12 +576,8 @@ static int parse_elf(struct elf_info *info, const char *filename)
 			info->modinfo_len = sechdrs[i].sh_size;
 		} else if (strcmp(secname, "__ksymtab") == 0)
 			info->export_sec = i;
-		else if (strcmp(secname, "__ksymtab_unused") == 0)
-			info->export_unused_sec = i;
 		else if (strcmp(secname, "__ksymtab_gpl") == 0)
 			info->export_gpl_sec = i;
-		else if (strcmp(secname, "__ksymtab_unused_gpl") == 0)
-			info->export_unused_gpl_sec = i;
 
 		if (sechdrs[i].sh_type == SHT_SYMTAB) {
 			unsigned int sh_link_idx;
@@ -2141,32 +2128,13 @@ static void check_for_gpl_usage(enum export exp, const char *m, const char *s)
 		error("GPL-incompatible module %s.ko uses GPL-only symbol '%s'\n",
 		      m, s);
 		break;
-	case export_unused_gpl:
-		error("GPL-incompatible module %s.ko uses GPL-only symbol marked UNUSED '%s'\n",
-		      m, s);
-		break;
 	case export_plain:
-	case export_unused:
 	case export_unknown:
 		/* ignore */
 		break;
 	}
 }
 
-static void check_for_unused(enum export exp, const char *m, const char *s)
-{
-	switch (exp) {
-	case export_unused:
-	case export_unused_gpl:
-		warn("module %s.ko uses symbol '%s' marked UNUSED\n",
-		     m, s);
-		break;
-	default:
-		/* ignore */
-		break;
-	}
-}
-
 static void check_exports(struct module *mod)
 {
 	struct symbol *s, *exp;
@@ -2197,7 +2165,6 @@ static void check_exports(struct module *mod)
 
 		if (!mod->gpl_compatible)
 			check_for_gpl_usage(exp->export, basename, exp->name);
-		check_for_unused(exp->export, basename, exp->name);
 	}
 }
 
diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h
index 834220de002b..0c47ff95c0e2 100644
--- a/scripts/mod/modpost.h
+++ b/scripts/mod/modpost.h
@@ -139,9 +139,7 @@ struct elf_info {
 	Elf_Sym      *symtab_start;
 	Elf_Sym      *symtab_stop;
 	Elf_Section  export_sec;
-	Elf_Section  export_unused_sec;
 	Elf_Section  export_gpl_sec;
-	Elf_Section  export_unused_gpl_sec;
 	char         *strtab;
 	char	     *modinfo;
 	unsigned int modinfo_len;
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index d82b452e8a71..24e8af579ce3 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -11,12 +11,8 @@ SECTIONS {
 
 	__ksymtab		0 : { *(SORT(___ksymtab+*)) }
 	__ksymtab_gpl		0 : { *(SORT(___ksymtab_gpl+*)) }
-	__ksymtab_unused	0 : { *(SORT(___ksymtab_unused+*)) }
-	__ksymtab_unused_gpl	0 : { *(SORT(___ksymtab_unused_gpl+*)) }
 	__kcrctab		0 : { *(SORT(___kcrctab+*)) }
 	__kcrctab_gpl		0 : { *(SORT(___kcrctab_gpl+*)) }
-	__kcrctab_unused	0 : { *(SORT(___kcrctab_unused+*)) }
-	__kcrctab_unused_gpl	0 : { *(SORT(___kcrctab_unused_gpl+*)) }
 
 	.init_array		0 : ALIGN(8) { *(SORT(.init_array.*)) *(.init_array) }
 
diff --git a/tools/include/linux/export.h b/tools/include/linux/export.h
index 9f61349a8944..acb6f4daa2f0 100644
--- a/tools/include/linux/export.h
+++ b/tools/include/linux/export.h
@@ -3,7 +3,5 @@
 
 #define EXPORT_SYMBOL(sym)
 #define EXPORT_SYMBOL_GPL(sym)
-#define EXPORT_UNUSED_SYMBOL(sym)
-#define EXPORT_UNUSED_SYMBOL_GPL(sym)
 
 #endif
-- 
cgit v1.2.3


From 866eef48d80234e1ea3a2f78b54afc563be3ea4a Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Fri, 5 Feb 2021 17:10:18 +0000
Subject: gfs2: Add trusted xattr support

Add support for an additional filesystem version (sb_fs_format = 1802).
When a filesystem with the new version is mounted, the filesystem
supports "trusted.*" xattrs.

In addition, version 1802 filesystems implement a form of forward
compatibility for xattrs: when xattrs with an unknown prefix (ea_type)
are found on a version 1802 filesystem, those attributes are not shown
by listxattr, and they are not accessible by getxattr, setxattr, or
removexattr.

This mechanism might turn out to be what we need in the future, but if
not, we can always bump the filesystem version and break compatibility
instead.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Andrew Price <anprice@redhat.com>
---
 fs/gfs2/ops_fstype.c             | 14 +++++++++++-
 fs/gfs2/super.h                  |  4 +++-
 fs/gfs2/xattr.c                  | 48 ++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/gfs2_ondisk.h |  5 +++--
 4 files changed, 63 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c6537e94f4d8..d89764856f21 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -489,6 +489,19 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
 		goto out;
 	}
 
+	switch(sdp->sd_sb.sb_fs_format) {
+	case GFS2_FS_FORMAT_MAX:
+		sb->s_xattr = gfs2_xattr_handlers_max;
+		break;
+
+	case GFS2_FS_FORMAT_MIN:
+		sb->s_xattr = gfs2_xattr_handlers_min;
+		break;
+
+	default:
+		BUG();
+	}
+
 	/* Set up the buffer cache and SB for real */
 	if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) {
 		ret = -EINVAL;
@@ -1110,7 +1123,6 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_op = &gfs2_super_ops;
 	sb->s_d_op = &gfs2_dops;
 	sb->s_export_op = &gfs2_export_ops;
-	sb->s_xattr = gfs2_xattr_handlers;
 	sb->s_qcop = &gfs2_quotactl_ops;
 	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
 	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 977079693bdc..08e502dec7ec 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -58,7 +58,9 @@ extern struct file_system_type gfs2meta_fs_type;
 extern const struct export_operations gfs2_export_ops;
 extern const struct super_operations gfs2_super_ops;
 extern const struct dentry_operations gfs2_dops;
-extern const struct xattr_handler *gfs2_xattr_handlers[];
+
+extern const struct xattr_handler *gfs2_xattr_handlers_max[];
+extern const struct xattr_handler **gfs2_xattr_handlers_min;
 
 #endif /* __SUPER_DOT_H__ */
 
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 9d7667bc4292..2b520ac49d5f 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -70,6 +70,20 @@ static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
 	return 0;
 }
 
+static bool gfs2_eatype_valid(struct gfs2_sbd *sdp, u8 type)
+{
+	switch(sdp->sd_sb.sb_fs_format) {
+	case GFS2_FS_FORMAT_MAX:
+		return true;
+
+	case GFS2_FS_FORMAT_MIN:
+		return type <= GFS2_EATYPE_SECURITY;
+
+	default:
+		return false;
+	}
+}
+
 typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
 			  struct gfs2_ea_header *ea,
 			  struct gfs2_ea_header *prev, void *private);
@@ -77,6 +91,7 @@ typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
 static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
 			ea_call_t ea_call, void *data)
 {
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_ea_header *ea, *prev = NULL;
 	int error = 0;
 
@@ -89,9 +104,8 @@ static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
 		if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
 						  bh->b_data + bh->b_size))
 			goto fail;
-		if (!GFS2_EATYPE_VALID(ea->ea_type))
+		if (!gfs2_eatype_valid(sdp, ea->ea_type))
 			goto fail;
-
 		error = ea_call(ip, bh, ea, prev, data);
 		if (error)
 			return error;
@@ -344,6 +358,7 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
 		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
 		     void *private)
 {
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct ea_list *ei = private;
 	struct gfs2_ea_request *er = ei->ei_er;
 	unsigned int ea_size;
@@ -353,6 +368,8 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
 	if (ea->ea_type == GFS2_EATYPE_UNUSED)
 		return 0;
 
+	BUG_ON(ea->ea_type > GFS2_EATYPE_SECURITY &&
+	       sdp->sd_sb.sb_fs_format == GFS2_FS_FORMAT_MIN);
 	switch (ea->ea_type) {
 	case GFS2_EATYPE_USR:
 		prefix = "user.";
@@ -366,8 +383,12 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
 		prefix = "security.";
 		l = 9;
 		break;
+	case GFS2_EATYPE_TRUSTED:
+		prefix = "trusted.";
+		l = 8;
+		break;
 	default:
-		BUG();
+		return 0;
 	}
 
 	ea_size = l + ea->ea_name_len + 1;
@@ -1463,7 +1484,25 @@ static const struct xattr_handler gfs2_xattr_security_handler = {
 	.set    = gfs2_xattr_set,
 };
 
-const struct xattr_handler *gfs2_xattr_handlers[] = {
+static bool
+gfs2_xattr_trusted_list(struct dentry *dentry)
+{
+	return capable(CAP_SYS_ADMIN);
+}
+
+static const struct xattr_handler gfs2_xattr_trusted_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,
+	.flags  = GFS2_EATYPE_TRUSTED,
+	.list	= gfs2_xattr_trusted_list,
+	.get    = gfs2_xattr_get,
+	.set    = gfs2_xattr_set,
+};
+
+const struct xattr_handler *gfs2_xattr_handlers_max[] = {
+	/* GFS2_FS_FORMAT_MAX */
+	&gfs2_xattr_trusted_handler,
+
+	/* GFS2_FS_FORMAT_MIN */
 	&gfs2_xattr_user_handler,
 	&gfs2_xattr_security_handler,
 	&posix_acl_access_xattr_handler,
@@ -1471,3 +1510,4 @@ const struct xattr_handler *gfs2_xattr_handlers[] = {
 	NULL,
 };
 
+const struct xattr_handler **gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1;
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 07e508e6691b..6ec4291bcc7a 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -47,7 +47,7 @@
 #define GFS2_FORMAT_DE		1200
 #define GFS2_FORMAT_QU		1500
 /* These are part of the superblock */
-#define GFS2_FORMAT_FS		1801
+#define GFS2_FORMAT_FS		1802
 #define GFS2_FORMAT_MULTI	1900
 
 /*
@@ -389,8 +389,9 @@ struct gfs2_leaf {
 #define GFS2_EATYPE_USR		1
 #define GFS2_EATYPE_SYS		2
 #define GFS2_EATYPE_SECURITY	3
+#define GFS2_EATYPE_TRUSTED	4
 
-#define GFS2_EATYPE_LAST	3
+#define GFS2_EATYPE_LAST	4
 #define GFS2_EATYPE_VALID(x)	((x) <= GFS2_EATYPE_LAST)
 
 #define GFS2_EAFLAG_LAST	0x01	/* last ea in block */
-- 
cgit v1.2.3


From e52606d2f5363f4900cfe8419e391644b0229c6f Mon Sep 17 00:00:00 2001
From: Ofir Bitton <obitton@habana.ai>
Date: Wed, 27 Jan 2021 16:34:37 +0200
Subject: habanalabs: support fetching first available user CQ

User must be aware of the available CQs when it needs to use them.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h       | 2 ++
 drivers/misc/habanalabs/common/habanalabs_ioctl.c | 3 ++-
 drivers/misc/habanalabs/gaudi/gaudi.c             | 3 +++
 drivers/misc/habanalabs/goya/goya.c               | 3 +++
 include/uapi/misc/habanalabs.h                    | 3 +++
 5 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 3c54010f7ab9..98163317ec43 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -411,6 +411,7 @@ struct hl_mmu_properties {
  * @first_available_user_mon: first monitor available for the user
  * @first_available_user_msix_interrupt: first available msix interrupt
  *                                       reserved for the user
+ * @first_available_cq: first available CQ for the user.
  * @tpc_enabled_mask: which TPCs are enabled.
  * @completion_queues_count: number of completion queues.
  * @fw_security_disabled: true if security measures are disabled in firmware,
@@ -475,6 +476,7 @@ struct asic_fixed_properties {
 	u16				first_available_user_sob[HL_MAX_DCORES];
 	u16				first_available_user_mon[HL_MAX_DCORES];
 	u16				first_available_user_msix_interrupt;
+	u16				first_available_cq[HL_MAX_DCORES];
 	u8				tpc_enabled_mask;
 	u8				completion_queues_count;
 	u8				fw_security_disabled;
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index e86f46d4b613..03af61cecd37 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -397,7 +397,8 @@ static int sync_manager_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 			prop->first_available_user_sob[args->dcore_id];
 	sm_info.first_available_monitor =
 			prop->first_available_user_mon[args->dcore_id];
-
+	sm_info.first_available_cq =
+			prop->first_available_cq[args->dcore_id];
 
 	return copy_to_user(out, &sm_info, min_t(size_t, (size_t) max_size,
 			sizeof(sm_info))) ? -EFAULT : 0;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 006c34ae35c2..8fc0de3cf3a9 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -529,6 +529,9 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
 
 	prop->first_available_user_msix_interrupt = USHRT_MAX;
 
+	for (i = 0 ; i < HL_MAX_DCORES ; i++)
+		prop->first_available_cq[i] = USHRT_MAX;
+
 	/* disable fw security for now, set it in a later stage */
 	prop->fw_security_disabled = true;
 	prop->fw_security_status_valid = false;
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 53db7e966866..d26b405f0c17 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -457,6 +457,9 @@ int goya_get_fixed_properties(struct hl_device *hdev)
 
 	prop->first_available_user_msix_interrupt = USHRT_MAX;
 
+	for (i = 0 ; i < HL_MAX_DCORES ; i++)
+		prop->first_available_cq[i] = USHRT_MAX;
+
 	/* disable fw security for now, set it in a later stage */
 	prop->fw_security_disabled = true;
 	prop->fw_security_status_valid = false;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index ebde42b37b43..64ae83b5f8e5 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -414,10 +414,13 @@ struct hl_pll_frequency_info {
  * struct hl_info_sync_manager - sync manager information
  * @first_available_sync_object: first available sob
  * @first_available_monitor: first available monitor
+ * @first_available_cq: first available cq
  */
 struct hl_info_sync_manager {
 	__u32 first_available_sync_object;
 	__u32 first_available_monitor;
+	__u32 first_available_cq;
+	__u32 reserved;
 };
 
 /**
-- 
cgit v1.2.3


From 6df50d274363aa189a31435024339b781a6e32a9 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Fri, 5 Feb 2021 16:04:34 +0200
Subject: habanalabs: return block size + block ID

When user gives us a block address to get its ID to mmap it, he also
needs to get from us the block size to pass to the driver in the mmap
function.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs.h |  4 +++-
 drivers/misc/habanalabs/common/memory.c     | 19 +++++++++++--------
 drivers/misc/habanalabs/gaudi/gaudi.c       |  2 +-
 drivers/misc/habanalabs/goya/goya.c         |  2 +-
 include/uapi/misc/habanalabs.h              | 27 +++++++++++++++++++++------
 5 files changed, 37 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 46c37b0c704a..dc9f5a83dfc9 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -862,6 +862,8 @@ enum div_select_defs {
  *                   showing it to users.
  * @ack_protection_bits_errors: ack and dump all security violations
  * @get_hw_block_id: retrieve a HW block id to be used by the user to mmap it.
+ *                   also returns the size of the block if caller supplies
+ *                   a valid pointer for it
  * @hw_block_mmap: mmap a HW block with a given id.
  * @enable_events_from_fw: send interrupt to firmware to notify them the
  *                         driver is ready to receive asynchronous events. This
@@ -980,7 +982,7 @@ struct hl_asic_funcs {
 	u64 (*descramble_addr)(struct hl_device *hdev, u64 addr);
 	void (*ack_protection_bits_errors)(struct hl_device *hdev);
 	int (*get_hw_block_id)(struct hl_device *hdev, u64 block_addr,
-			u32 *block_id);
+				u32 *block_size, u32 *block_id);
 	int (*hw_block_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
 			u32 block_id, u32 block_size);
 	void (*enable_events_from_fw)(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index 7171e8820a2d..7cadf75ebb81 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -1289,12 +1289,13 @@ vm_type_err:
 	return rc;
 }
 
-static int map_block(struct hl_device *hdev, u64 address, u64 *handle)
+static int map_block(struct hl_device *hdev, u64 address, u64 *handle,
+			u32 *size)
 {
 	u32 block_id = 0;
 	int rc;
 
-	rc = hdev->asic_funcs->get_hw_block_id(hdev, address, &block_id);
+	rc = hdev->asic_funcs->get_hw_block_id(hdev, address, size, &block_id);
 
 	*handle = block_id | HL_MMAP_TYPE_BLOCK;
 	*handle <<= PAGE_SHIFT;
@@ -1371,7 +1372,7 @@ static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_ctx *ctx = hpriv->ctx;
 	u64 block_handle, device_addr = 0;
-	u32 handle = 0;
+	u32 handle = 0, block_size;
 	int rc;
 
 	switch (args->in.op) {
@@ -1416,8 +1417,9 @@ static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
 
 	case HL_MEM_OP_MAP_BLOCK:
 		rc = map_block(hdev, args->in.map_block.block_addr,
-							&block_handle);
-		args->out.handle = block_handle;
+				&block_handle, &block_size);
+		args->out.block_handle = block_handle;
+		args->out.block_size = block_size;
 		break;
 
 	default:
@@ -1437,7 +1439,7 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_ctx *ctx = hpriv->ctx;
 	u64 block_handle, device_addr = 0;
-	u32 handle = 0;
+	u32 handle = 0, block_size;
 	int rc;
 
 	if (!hl_device_operational(hdev, &status)) {
@@ -1524,8 +1526,9 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 
 	case HL_MEM_OP_MAP_BLOCK:
 		rc = map_block(hdev, args->in.map_block.block_addr,
-							&block_handle);
-		args->out.handle = block_handle;
+				&block_handle, &block_size);
+		args->out.block_handle = block_handle;
+		args->out.block_size = block_size;
 		break;
 
 	default:
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index f937d90786b2..35342edd4a02 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -8490,7 +8490,7 @@ static u64 gaudi_get_device_time(struct hl_device *hdev)
 }
 
 static int gaudi_get_hw_block_id(struct hl_device *hdev, u64 block_addr,
-					u32 *block_id)
+				u32 *block_size, u32 *block_id)
 {
 	return -EPERM;
 }
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index c30524660761..ed566c52ccaa 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5390,7 +5390,7 @@ static void goya_ctx_fini(struct hl_ctx *ctx)
 }
 
 static int goya_get_hw_block_id(struct hl_device *hdev, u64 block_addr,
-				u32 *block_id)
+			u32 *block_size, u32 *block_id)
 {
 	return -EPERM;
 }
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 64ae83b5f8e5..5a86b521a450 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -782,10 +782,10 @@ struct hl_mem_in {
 		/* HL_MEM_OP_MAP_BLOCK - map a hw block */
 		struct {
 			/*
-			 * HW block address to map, a handle will be returned
-			 * to the user and will be used to mmap the relevant
-			 * block. Only addresses from configuration space are
-			 * allowed.
+			 * HW block address to map, a handle and size will be
+			 * returned to the user and will be used to mmap the
+			 * relevant block. Only addresses from configuration
+			 * space are allowed.
 			 */
 			__u64 block_addr;
 		} map_block;
@@ -816,11 +816,26 @@ struct hl_mem_out {
 		__u64 device_virt_addr;
 
 		/*
-		 * Used for HL_MEM_OP_ALLOC and HL_MEM_OP_MAP_BLOCK.
+		 * Used in HL_MEM_OP_ALLOC
 		 * This is the assigned handle for the allocated memory
-		 * or mapped block
 		 */
 		__u64 handle;
+
+		struct {
+			/*
+			 * Used in HL_MEM_OP_MAP_BLOCK.
+			 * This is the assigned handle for the mapped block
+			 */
+			__u64 block_handle;
+
+			/*
+			 * Used in HL_MEM_OP_MAP_BLOCK
+			 * This is the size of the mapped block
+			 */
+			__u32 block_size;
+
+			__u32 pad;
+		};
 	};
 };
 
-- 
cgit v1.2.3


From 28aa2f9e73e762dbaa28fdca20cccb59c74cc139 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 8 Feb 2021 08:55:47 -0500
Subject: NFS: Always clear an invalid mapping when attempting a buffered write

If the page cache is invalid, then we can't do read-modify-write, so
ensure that we do clear it when we know it is invalid.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/file.c          |  2 ++
 fs/nfs/inode.c         | 97 ++++++++++++++++++++++++++++----------------------
 include/linux/nfs_fs.h |  1 +
 3 files changed, 57 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 02795a01c7ef..03fd1dcc96bd 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -632,6 +632,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 			goto out;
 	}
 
+	nfs_clear_invalid_mapping(file->f_mapping);
+
 	since = filemap_sample_wb_err(file->f_mapping);
 	nfs_start_io_write(inode);
 	result = generic_write_checks(iocb, from);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 522aa10a1a3e..2533053d764a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1257,55 +1257,19 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 	return 0;
 }
 
-bool nfs_mapping_need_revalidate_inode(struct inode *inode)
-{
-	return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) ||
-		NFS_STALE(inode);
-}
-
-int nfs_revalidate_mapping_rcu(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-	unsigned long *bitlock = &nfsi->flags;
-	int ret = 0;
-
-	if (IS_SWAPFILE(inode))
-		goto out;
-	if (nfs_mapping_need_revalidate_inode(inode)) {
-		ret = -ECHILD;
-		goto out;
-	}
-	spin_lock(&inode->i_lock);
-	if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
-	    (nfsi->cache_validity & NFS_INO_INVALID_DATA))
-		ret = -ECHILD;
-	spin_unlock(&inode->i_lock);
-out:
-	return ret;
-}
-
 /**
- * nfs_revalidate_mapping - Revalidate the pagecache
- * @inode: pointer to host inode
+ * nfs_clear_invalid_mapping - Conditionally clear a mapping
  * @mapping: pointer to mapping
+ *
+ * If the NFS_INO_INVALID_DATA inode flag is set, clear the mapping.
  */
-int nfs_revalidate_mapping(struct inode *inode,
-		struct address_space *mapping)
+int nfs_clear_invalid_mapping(struct address_space *mapping)
 {
+	struct inode *inode = mapping->host;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	unsigned long *bitlock = &nfsi->flags;
 	int ret = 0;
 
-	/* swapfiles are not supposed to be shared. */
-	if (IS_SWAPFILE(inode))
-		goto out;
-
-	if (nfs_mapping_need_revalidate_inode(inode)) {
-		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
-		if (ret < 0)
-			goto out;
-	}
-
 	/*
 	 * We must clear NFS_INO_INVALID_DATA first to ensure that
 	 * invalidations that come in while we're shooting down the mappings
@@ -1336,8 +1300,8 @@ int nfs_revalidate_mapping(struct inode *inode,
 
 	set_bit(NFS_INO_INVALIDATING, bitlock);
 	smp_wmb();
-	nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA|
-			NFS_INO_DATA_INVAL_DEFER);
+	nfsi->cache_validity &=
+		~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER);
 	spin_unlock(&inode->i_lock);
 	trace_nfs_invalidate_mapping_enter(inode);
 	ret = nfs_invalidate_mapping(inode, mapping);
@@ -1350,6 +1314,53 @@ out:
 	return ret;
 }
 
+bool nfs_mapping_need_revalidate_inode(struct inode *inode)
+{
+	return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) ||
+		NFS_STALE(inode);
+}
+
+int nfs_revalidate_mapping_rcu(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long *bitlock = &nfsi->flags;
+	int ret = 0;
+
+	if (IS_SWAPFILE(inode))
+		goto out;
+	if (nfs_mapping_need_revalidate_inode(inode)) {
+		ret = -ECHILD;
+		goto out;
+	}
+	spin_lock(&inode->i_lock);
+	if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
+	    (nfsi->cache_validity & NFS_INO_INVALID_DATA))
+		ret = -ECHILD;
+	spin_unlock(&inode->i_lock);
+out:
+	return ret;
+}
+
+/**
+ * nfs_revalidate_mapping - Revalidate the pagecache
+ * @inode: pointer to host inode
+ * @mapping: pointer to mapping
+ */
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+	/* swapfiles are not supposed to be shared. */
+	if (IS_SWAPFILE(inode))
+		return 0;
+
+	if (nfs_mapping_need_revalidate_inode(inode)) {
+		int ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+		if (ret < 0)
+			return ret;
+	}
+
+	return nfs_clear_invalid_mapping(mapping);
+}
+
 static bool nfs_file_has_writers(struct nfs_inode *nfsi)
 {
 	struct inode *inode = &nfsi->vfs_inode;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 3cfcf219e96b..2c662857247a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -387,6 +387,7 @@ extern int nfs_open(struct inode *, struct file *);
 extern int nfs_attribute_cache_expired(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
+extern int nfs_clear_invalid_mapping(struct address_space *mapping);
 extern bool nfs_mapping_need_revalidate_inode(struct inode *inode);
 extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_revalidate_mapping_rcu(struct inode *inode);
-- 
cgit v1.2.3


From 5923d64b7ab63dcc6f0df946098f50902f9540d1 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Sat, 6 Feb 2021 22:46:01 -0600
Subject: scsi: libiscsi: Drop taskqueuelock

The purpose of the taskqueuelock was to handle the issue where a bad target
decides to send a R2T and before its data has been sent decides to send a
cmd response to complete the cmd. The following patches fix up the
frwd/back locks so they are taken from the queue/xmit (frwd) and completion
(back) paths again. To get there this patch removes the taskqueuelock which
for iSCSI xmit wq based drivers was taken in the queue, xmit and completion
paths.

Instead of the lock, we just make sure we have a ref to the task when we
queue a R2T, and then we always remove the task from the requeue list in
the xmit path or the forced cleanup paths.

Link: https://lore.kernel.org/r/20210207044608.27585-3-michael.christie@oracle.com
Reviewed-by: Lee Duncan <lduncan@suse.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/libiscsi.c     | 178 ++++++++++++++++++++++++++++----------------
 drivers/scsi/libiscsi_tcp.c |  86 +++++++++++++--------
 include/scsi/libiscsi.h     |   4 +-
 3 files changed, 172 insertions(+), 96 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index cee1dbaa1b93..3d74fdd9f31a 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -523,16 +523,6 @@ static void iscsi_complete_task(struct iscsi_task *task, int state)
 	WARN_ON_ONCE(task->state == ISCSI_TASK_FREE);
 	task->state = state;
 
-	spin_lock_bh(&conn->taskqueuelock);
-	if (!list_empty(&task->running)) {
-		pr_debug_once("%s while task on list", __func__);
-		list_del_init(&task->running);
-	}
-	spin_unlock_bh(&conn->taskqueuelock);
-
-	if (conn->task == task)
-		conn->task = NULL;
-
 	if (READ_ONCE(conn->ping_task) == task)
 		WRITE_ONCE(conn->ping_task, NULL);
 
@@ -564,9 +554,40 @@ void iscsi_complete_scsi_task(struct iscsi_task *task,
 }
 EXPORT_SYMBOL_GPL(iscsi_complete_scsi_task);
 
+/*
+ * Must be called with back and frwd lock
+ */
+static bool cleanup_queued_task(struct iscsi_task *task)
+{
+	struct iscsi_conn *conn = task->conn;
+	bool early_complete = false;
+
+	/* Bad target might have completed task while it was still running */
+	if (task->state == ISCSI_TASK_COMPLETED)
+		early_complete = true;
+
+	if (!list_empty(&task->running)) {
+		list_del_init(&task->running);
+		/*
+		 * If it's on a list but still running, this could be from
+		 * a bad target sending a rsp early, cleanup from a TMF, or
+		 * session recovery.
+		 */
+		if (task->state == ISCSI_TASK_RUNNING ||
+		    task->state == ISCSI_TASK_COMPLETED)
+			__iscsi_put_task(task);
+	}
+
+	if (conn->task == task) {
+		conn->task = NULL;
+		__iscsi_put_task(task);
+	}
+
+	return early_complete;
+}
 
 /*
- * session back_lock must be held and if not called for a task that is
+ * session frwd_lock must be held and if not called for a task that is
  * still pending or from the xmit thread, then xmit thread must
  * be suspended.
  */
@@ -585,6 +606,13 @@ static void fail_scsi_task(struct iscsi_task *task, int err)
 	if (!sc)
 		return;
 
+	/* regular RX path uses back_lock */
+	spin_lock_bh(&conn->session->back_lock);
+	if (cleanup_queued_task(task)) {
+		spin_unlock_bh(&conn->session->back_lock);
+		return;
+	}
+
 	if (task->state == ISCSI_TASK_PENDING) {
 		/*
 		 * cmd never made it to the xmit thread, so we should not count
@@ -600,9 +628,6 @@ static void fail_scsi_task(struct iscsi_task *task, int err)
 
 	sc->result = err << 16;
 	scsi_set_resid(sc, scsi_bufflen(sc));
-
-	/* regular RX path uses back_lock */
-	spin_lock_bh(&conn->session->back_lock);
 	iscsi_complete_task(task, state);
 	spin_unlock_bh(&conn->session->back_lock);
 }
@@ -748,9 +773,7 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
 		if (session->tt->xmit_task(task))
 			goto free_task;
 	} else {
-		spin_lock_bh(&conn->taskqueuelock);
 		list_add_tail(&task->running, &conn->mgmtqueue);
-		spin_unlock_bh(&conn->taskqueuelock);
 		iscsi_conn_queue_work(conn);
 	}
 
@@ -1411,31 +1434,61 @@ static int iscsi_check_cmdsn_window_closed(struct iscsi_conn *conn)
 	return 0;
 }
 
-static int iscsi_xmit_task(struct iscsi_conn *conn)
+static int iscsi_xmit_task(struct iscsi_conn *conn, struct iscsi_task *task,
+			   bool was_requeue)
 {
-	struct iscsi_task *task = conn->task;
 	int rc;
 
-	if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx))
-		return -ENODATA;
-
 	spin_lock_bh(&conn->session->back_lock);
-	if (conn->task == NULL) {
+
+	if (!conn->task) {
+		/* Take a ref so we can access it after xmit_task() */
+		__iscsi_get_task(task);
+	} else {
+		/* Already have a ref from when we failed to send it last call */
+		conn->task = NULL;
+	}
+
+	/*
+	 * If this was a requeue for a R2T we have an extra ref on the task in
+	 * case a bad target sends a cmd rsp before we have handled the task.
+	 */
+	if (was_requeue)
+		__iscsi_put_task(task);
+
+	/*
+	 * Do this after dropping the extra ref because if this was a requeue
+	 * it's removed from that list and cleanup_queued_task would miss it.
+	 */
+	if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) {
+		/*
+		 * Save the task and ref in case we weren't cleaning up this
+		 * task and get woken up again.
+		 */
+		conn->task = task;
 		spin_unlock_bh(&conn->session->back_lock);
 		return -ENODATA;
 	}
-	__iscsi_get_task(task);
 	spin_unlock_bh(&conn->session->back_lock);
+
 	spin_unlock_bh(&conn->session->frwd_lock);
 	rc = conn->session->tt->xmit_task(task);
 	spin_lock_bh(&conn->session->frwd_lock);
 	if (!rc) {
 		/* done with this task */
 		task->last_xfer = jiffies;
-		conn->task = NULL;
 	}
 	/* regular RX path uses back_lock */
 	spin_lock(&conn->session->back_lock);
+	if (rc && task->state == ISCSI_TASK_RUNNING) {
+		/*
+		 * get an extra ref that is released next time we access it
+		 * as conn->task above.
+		 */
+		__iscsi_get_task(task);
+		conn->task = task;
+	}
+
 	__iscsi_put_task(task);
 	spin_unlock(&conn->session->back_lock);
 	return rc;
@@ -1445,9 +1498,7 @@ static int iscsi_xmit_task(struct iscsi_conn *conn)
  * iscsi_requeue_task - requeue task to run from session workqueue
  * @task: task to requeue
  *
- * LLDs that need to run a task from the session workqueue should call
- * this. The session frwd_lock must be held. This should only be called
- * by software drivers.
+ * Callers must have taken a ref to the task that is going to be requeued.
  */
 void iscsi_requeue_task(struct iscsi_task *task)
 {
@@ -1457,11 +1508,18 @@ void iscsi_requeue_task(struct iscsi_task *task)
 	 * this may be on the requeue list already if the xmit_task callout
 	 * is handling the r2ts while we are adding new ones
 	 */
-	spin_lock_bh(&conn->taskqueuelock);
-	if (list_empty(&task->running))
+	spin_lock_bh(&conn->session->frwd_lock);
+	if (list_empty(&task->running)) {
 		list_add_tail(&task->running, &conn->requeue);
-	spin_unlock_bh(&conn->taskqueuelock);
+	} else {
+		/*
+		 * Don't need the extra ref since it's already requeued and
+		 * has a ref.
+		 */
+		iscsi_put_task(task);
+	}
 	iscsi_conn_queue_work(conn);
+	spin_unlock_bh(&conn->session->frwd_lock);
 }
 EXPORT_SYMBOL_GPL(iscsi_requeue_task);
 
@@ -1487,7 +1545,7 @@ static int iscsi_data_xmit(struct iscsi_conn *conn)
 	}
 
 	if (conn->task) {
-		rc = iscsi_xmit_task(conn);
+		rc = iscsi_xmit_task(conn, conn->task, false);
 	        if (rc)
 		        goto done;
 	}
@@ -1497,49 +1555,41 @@ static int iscsi_data_xmit(struct iscsi_conn *conn)
 	 * only have one nop-out as a ping from us and targets should not
 	 * overflow us with nop-ins
 	 */
-	spin_lock_bh(&conn->taskqueuelock);
 check_mgmt:
 	while (!list_empty(&conn->mgmtqueue)) {
-		conn->task = list_entry(conn->mgmtqueue.next,
-					 struct iscsi_task, running);
-		list_del_init(&conn->task->running);
-		spin_unlock_bh(&conn->taskqueuelock);
-		if (iscsi_prep_mgmt_task(conn, conn->task)) {
+		task = list_entry(conn->mgmtqueue.next, struct iscsi_task,
+				  running);
+		list_del_init(&task->running);
+		if (iscsi_prep_mgmt_task(conn, task)) {
 			/* regular RX path uses back_lock */
 			spin_lock_bh(&conn->session->back_lock);
-			__iscsi_put_task(conn->task);
+			__iscsi_put_task(task);
 			spin_unlock_bh(&conn->session->back_lock);
-			conn->task = NULL;
-			spin_lock_bh(&conn->taskqueuelock);
 			continue;
 		}
-		rc = iscsi_xmit_task(conn);
+		rc = iscsi_xmit_task(conn, task, false);
 		if (rc)
 			goto done;
-		spin_lock_bh(&conn->taskqueuelock);
 	}
 
 	/* process pending command queue */
 	while (!list_empty(&conn->cmdqueue)) {
-		conn->task = list_entry(conn->cmdqueue.next, struct iscsi_task,
-					running);
-		list_del_init(&conn->task->running);
-		spin_unlock_bh(&conn->taskqueuelock);
+		task = list_entry(conn->cmdqueue.next, struct iscsi_task,
+				  running);
+		list_del_init(&task->running);
 		if (conn->session->state == ISCSI_STATE_LOGGING_OUT) {
-			fail_scsi_task(conn->task, DID_IMM_RETRY);
-			spin_lock_bh(&conn->taskqueuelock);
+			fail_scsi_task(task, DID_IMM_RETRY);
 			continue;
 		}
-		rc = iscsi_prep_scsi_cmd_pdu(conn->task);
+		rc = iscsi_prep_scsi_cmd_pdu(task);
 		if (rc) {
 			if (rc == -ENOMEM || rc == -EACCES)
-				fail_scsi_task(conn->task, DID_IMM_RETRY);
+				fail_scsi_task(task, DID_IMM_RETRY);
 			else
-				fail_scsi_task(conn->task, DID_ABORT);
-			spin_lock_bh(&conn->taskqueuelock);
+				fail_scsi_task(task, DID_ABORT);
 			continue;
 		}
-		rc = iscsi_xmit_task(conn);
+		rc = iscsi_xmit_task(conn, task, false);
 		if (rc)
 			goto done;
 		/*
@@ -1547,7 +1597,6 @@ check_mgmt:
 		 * we need to check the mgmt queue for nops that need to
 		 * be sent to aviod starvation
 		 */
-		spin_lock_bh(&conn->taskqueuelock);
 		if (!list_empty(&conn->mgmtqueue))
 			goto check_mgmt;
 	}
@@ -1561,21 +1610,17 @@ check_mgmt:
 
 		task = list_entry(conn->requeue.next, struct iscsi_task,
 				  running);
+
 		if (iscsi_check_tmf_restrictions(task, ISCSI_OP_SCSI_DATA_OUT))
 			break;
 
-		conn->task = task;
-		list_del_init(&conn->task->running);
-		conn->task->state = ISCSI_TASK_RUNNING;
-		spin_unlock_bh(&conn->taskqueuelock);
-		rc = iscsi_xmit_task(conn);
+		list_del_init(&task->running);
+		rc = iscsi_xmit_task(conn, task, true);
 		if (rc)
 			goto done;
-		spin_lock_bh(&conn->taskqueuelock);
 		if (!list_empty(&conn->mgmtqueue))
 			goto check_mgmt;
 	}
-	spin_unlock_bh(&conn->taskqueuelock);
 	spin_unlock_bh(&conn->session->frwd_lock);
 	return -ENODATA;
 
@@ -1741,9 +1786,7 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
 			goto prepd_reject;
 		}
 	} else {
-		spin_lock_bh(&conn->taskqueuelock);
 		list_add_tail(&task->running, &conn->cmdqueue);
-		spin_unlock_bh(&conn->taskqueuelock);
 		iscsi_conn_queue_work(conn);
 	}
 
@@ -2914,7 +2957,6 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size,
 	INIT_LIST_HEAD(&conn->mgmtqueue);
 	INIT_LIST_HEAD(&conn->cmdqueue);
 	INIT_LIST_HEAD(&conn->requeue);
-	spin_lock_init(&conn->taskqueuelock);
 	INIT_WORK(&conn->xmitwork, iscsi_xmitworker);
 
 	/* allocate login_task used for the login/text sequences */
@@ -3080,10 +3122,16 @@ fail_mgmt_tasks(struct iscsi_session *session, struct iscsi_conn *conn)
 		ISCSI_DBG_SESSION(conn->session,
 				  "failing mgmt itt 0x%x state %d\n",
 				  task->itt, task->state);
+
+		spin_lock_bh(&session->back_lock);
+		if (cleanup_queued_task(task)) {
+			spin_unlock_bh(&session->back_lock);
+			continue;
+		}
+
 		state = ISCSI_TASK_ABRT_SESS_RECOV;
 		if (task->state == ISCSI_TASK_PENDING)
 			state = ISCSI_TASK_COMPLETED;
-		spin_lock_bh(&session->back_lock);
 		iscsi_complete_task(task, state);
 		spin_unlock_bh(&session->back_lock);
 	}
diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c
index 83f14b2c8804..2e9ffe3d1a55 100644
--- a/drivers/scsi/libiscsi_tcp.c
+++ b/drivers/scsi/libiscsi_tcp.c
@@ -524,48 +524,79 @@ static int iscsi_tcp_data_in(struct iscsi_conn *conn, struct iscsi_task *task)
 /**
  * iscsi_tcp_r2t_rsp - iSCSI R2T Response processing
  * @conn: iscsi connection
- * @task: scsi command task
+ * @hdr: PDU header
  */
-static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task)
+static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_hdr *hdr)
 {
 	struct iscsi_session *session = conn->session;
-	struct iscsi_tcp_task *tcp_task = task->dd_data;
-	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
-	struct iscsi_r2t_rsp *rhdr = (struct iscsi_r2t_rsp *)tcp_conn->in.hdr;
+	struct iscsi_tcp_task *tcp_task;
+	struct iscsi_tcp_conn *tcp_conn;
+	struct iscsi_r2t_rsp *rhdr;
 	struct iscsi_r2t_info *r2t;
-	int r2tsn = be32_to_cpu(rhdr->r2tsn);
+	struct iscsi_task *task;
 	u32 data_length;
 	u32 data_offset;
+	int r2tsn;
 	int rc;
 
+	spin_lock(&session->back_lock);
+	task = iscsi_itt_to_ctask(conn, hdr->itt);
+	if (!task) {
+		spin_unlock(&session->back_lock);
+		return ISCSI_ERR_BAD_ITT;
+	} else if (task->sc->sc_data_direction != DMA_TO_DEVICE) {
+		spin_unlock(&session->back_lock);
+		return ISCSI_ERR_PROTO;
+	}
+	/*
+	 * A bad target might complete the cmd before we have handled R2Ts
+	 * so get a ref to the task that will be dropped in the xmit path.
+	 */
+	if (task->state != ISCSI_TASK_RUNNING) {
+		spin_unlock(&session->back_lock);
+		/* Let the path that got the early rsp complete it */
+		return 0;
+	}
+	task->last_xfer = jiffies;
+	__iscsi_get_task(task);
+
+	tcp_conn = conn->dd_data;
+	rhdr = (struct iscsi_r2t_rsp *)tcp_conn->in.hdr;
+	/* fill-in new R2T associated with the task */
+	iscsi_update_cmdsn(session, (struct iscsi_nopin *)rhdr);
+	spin_unlock(&session->back_lock);
+
 	if (tcp_conn->in.datalen) {
 		iscsi_conn_printk(KERN_ERR, conn,
 				  "invalid R2t with datalen %d\n",
 				  tcp_conn->in.datalen);
-		return ISCSI_ERR_DATALEN;
+		rc = ISCSI_ERR_DATALEN;
+		goto put_task;
 	}
 
+	tcp_task = task->dd_data;
+	r2tsn = be32_to_cpu(rhdr->r2tsn);
 	if (tcp_task->exp_datasn != r2tsn){
 		ISCSI_DBG_TCP(conn, "task->exp_datasn(%d) != rhdr->r2tsn(%d)\n",
 			      tcp_task->exp_datasn, r2tsn);
-		return ISCSI_ERR_R2TSN;
+		rc = ISCSI_ERR_R2TSN;
+		goto put_task;
 	}
 
-	/* fill-in new R2T associated with the task */
-	iscsi_update_cmdsn(session, (struct iscsi_nopin*)rhdr);
-
-	if (!task->sc || session->state != ISCSI_STATE_LOGGED_IN) {
+	if (session->state != ISCSI_STATE_LOGGED_IN) {
 		iscsi_conn_printk(KERN_INFO, conn,
 				  "dropping R2T itt %d in recovery.\n",
 				  task->itt);
-		return 0;
+		rc = 0;
+		goto put_task;
 	}
 
 	data_length = be32_to_cpu(rhdr->data_length);
 	if (data_length == 0) {
 		iscsi_conn_printk(KERN_ERR, conn,
 				  "invalid R2T with zero data len\n");
-		return ISCSI_ERR_DATALEN;
+		rc = ISCSI_ERR_DATALEN;
+		goto put_task;
 	}
 
 	if (data_length > session->max_burst)
@@ -579,7 +610,8 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task)
 				  "invalid R2T with data len %u at offset %u "
 				  "and total length %d\n", data_length,
 				  data_offset, task->sc->sdb.length);
-		return ISCSI_ERR_DATALEN;
+		rc = ISCSI_ERR_DATALEN;
+		goto put_task;
 	}
 
 	spin_lock(&tcp_task->pool2queue);
@@ -589,7 +621,8 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task)
 				  "Target has sent more R2Ts than it "
 				  "negotiated for or driver has leaked.\n");
 		spin_unlock(&tcp_task->pool2queue);
-		return ISCSI_ERR_PROTO;
+		rc = ISCSI_ERR_PROTO;
+		goto put_task;
 	}
 
 	r2t->exp_statsn = rhdr->statsn;
@@ -607,6 +640,10 @@ static int iscsi_tcp_r2t_rsp(struct iscsi_conn *conn, struct iscsi_task *task)
 
 	iscsi_requeue_task(task);
 	return 0;
+
+put_task:
+	iscsi_put_task(task);
+	return rc;
 }
 
 /*
@@ -730,20 +767,11 @@ iscsi_tcp_hdr_dissect(struct iscsi_conn *conn, struct iscsi_hdr *hdr)
 		rc = iscsi_complete_pdu(conn, hdr, NULL, 0);
 		break;
 	case ISCSI_OP_R2T:
-		spin_lock(&conn->session->back_lock);
-		task = iscsi_itt_to_ctask(conn, hdr->itt);
-		spin_unlock(&conn->session->back_lock);
-		if (!task)
-			rc = ISCSI_ERR_BAD_ITT;
-		else if (ahslen)
+		if (ahslen) {
 			rc = ISCSI_ERR_AHSLEN;
-		else if (task->sc->sc_data_direction == DMA_TO_DEVICE) {
-			task->last_xfer = jiffies;
-			spin_lock(&conn->session->frwd_lock);
-			rc = iscsi_tcp_r2t_rsp(conn, task);
-			spin_unlock(&conn->session->frwd_lock);
-		} else
-			rc = ISCSI_ERR_PROTO;
+			break;
+		}
+		rc = iscsi_tcp_r2t_rsp(conn, hdr);
 		break;
 	case ISCSI_OP_LOGIN_RSP:
 	case ISCSI_OP_TEXT_RSP:
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index b3bbd10eb3f0..44a9554aea62 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -187,7 +187,7 @@ struct iscsi_conn {
 	struct iscsi_task	*task;		/* xmit task in progress */
 
 	/* xmit */
-	spinlock_t		taskqueuelock;  /* protects the next three lists */
+	/* items must be added/deleted under frwd lock */
 	struct list_head	mgmtqueue;	/* mgmt (control) xmit queue */
 	struct list_head	cmdqueue;	/* data-path cmd queue */
 	struct list_head	requeue;	/* tasks needing another run */
@@ -332,7 +332,7 @@ struct iscsi_session {
 						 * cmdsn, queued_cmdsn     *
 						 * session resources:      *
 						 * - cmdpool kfifo_out ,   *
-						 * - mgmtpool,		   */
+						 * - mgmtpool, queues	   */
 	spinlock_t		back_lock;	/* protects cmdsn_exp      *
 						 * cmdsn_max,              *
 						 * cmdpool kfifo_in        */
-- 
cgit v1.2.3


From b4046922b3c0740ad50a6e9c59e12f4dc43946d4 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Sat, 6 Feb 2021 22:46:04 -0600
Subject: scsi: libiscsi: Add helper to calculate max SCSI cmds per session

This patch just breaks out the code that calculates the number of SCSI cmds
that will be used for a SCSI session. It also adds a check that we don't go
over the host's can_queue value.

Link: https://lore.kernel.org/r/20210207044608.27585-6-michael.christie@oracle.com
Reviewed-by: Lee Duncan <lduncan@suse.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/libiscsi.c | 86 +++++++++++++++++++++++++++++++------------------
 include/scsi/libiscsi.h |  2 ++
 2 files changed, 56 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index b271d3accd2a..f64e2077754c 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -2648,6 +2648,56 @@ void iscsi_pool_free(struct iscsi_pool *q)
 }
 EXPORT_SYMBOL_GPL(iscsi_pool_free);
 
+int iscsi_host_get_max_scsi_cmds(struct Scsi_Host *shost,
+				 uint16_t requested_cmds_max)
+{
+	int scsi_cmds, total_cmds = requested_cmds_max;
+
+check:
+	if (!total_cmds)
+		total_cmds = ISCSI_DEF_XMIT_CMDS_MAX;
+	/*
+	 * The iscsi layer needs some tasks for nop handling and tmfs,
+	 * so the cmds_max must at least be greater than ISCSI_MGMT_CMDS_MAX
+	 * + 1 command for scsi IO.
+	 */
+	if (total_cmds < ISCSI_TOTAL_CMDS_MIN) {
+		printk(KERN_ERR "iscsi: invalid max cmds of %d. Must be a power of two that is at least %d.\n",
+		       total_cmds, ISCSI_TOTAL_CMDS_MIN);
+		return -EINVAL;
+	}
+
+	if (total_cmds > ISCSI_TOTAL_CMDS_MAX) {
+		printk(KERN_INFO "iscsi: invalid max cmds of %d. Must be a power of 2 less than or equal to %d. Using %d.\n",
+		       requested_cmds_max, ISCSI_TOTAL_CMDS_MAX,
+		       ISCSI_TOTAL_CMDS_MAX);
+		total_cmds = ISCSI_TOTAL_CMDS_MAX;
+	}
+
+	if (!is_power_of_2(total_cmds)) {
+		total_cmds = rounddown_pow_of_two(total_cmds);
+		if (total_cmds < ISCSI_TOTAL_CMDS_MIN) {
+			printk(KERN_ERR "iscsi: invalid max cmds of %d. Must be a power of 2 greater than %d.\n", requested_cmds_max, ISCSI_TOTAL_CMDS_MIN);
+			return -EINVAL;
+		}
+
+		printk(KERN_INFO "iscsi: invalid max cmds %d. Must be a power of 2. Rounding max cmds down to %d.\n",
+		       requested_cmds_max, total_cmds);
+	}
+
+	scsi_cmds = total_cmds - ISCSI_MGMT_CMDS_MAX;
+	if (shost->can_queue && scsi_cmds > shost->can_queue) {
+		total_cmds = shost->can_queue;
+
+		printk(KERN_INFO "iscsi: requested max cmds %u is higher than driver limit. Using driver limit %u\n",
+		       requested_cmds_max, shost->can_queue);
+		goto check;
+	}
+
+	return scsi_cmds;
+}
+EXPORT_SYMBOL_GPL(iscsi_host_get_max_scsi_cmds);
+
 /**
  * iscsi_host_add - add host to system
  * @shost: scsi host
@@ -2801,7 +2851,7 @@ iscsi_session_setup(struct iscsi_transport *iscsit, struct Scsi_Host *shost,
 	struct iscsi_host *ihost = shost_priv(shost);
 	struct iscsi_session *session;
 	struct iscsi_cls_session *cls_session;
-	int cmd_i, scsi_cmds, total_cmds = cmds_max;
+	int cmd_i, scsi_cmds;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ihost->lock, flags);
@@ -2812,37 +2862,9 @@ iscsi_session_setup(struct iscsi_transport *iscsit, struct Scsi_Host *shost,
 	ihost->num_sessions++;
 	spin_unlock_irqrestore(&ihost->lock, flags);
 
-	if (!total_cmds)
-		total_cmds = ISCSI_DEF_XMIT_CMDS_MAX;
-	/*
-	 * The iscsi layer needs some tasks for nop handling and tmfs,
-	 * so the cmds_max must at least be greater than ISCSI_MGMT_CMDS_MAX
-	 * + 1 command for scsi IO.
-	 */
-	if (total_cmds < ISCSI_TOTAL_CMDS_MIN) {
-		printk(KERN_ERR "iscsi: invalid can_queue of %d. can_queue "
-		       "must be a power of two that is at least %d.\n",
-		       total_cmds, ISCSI_TOTAL_CMDS_MIN);
+	scsi_cmds = iscsi_host_get_max_scsi_cmds(shost, cmds_max);
+	if (scsi_cmds < 0)
 		goto dec_session_count;
-	}
-
-	if (total_cmds > ISCSI_TOTAL_CMDS_MAX) {
-		printk(KERN_ERR "iscsi: invalid can_queue of %d. can_queue "
-		       "must be a power of 2 less than or equal to %d.\n",
-		       cmds_max, ISCSI_TOTAL_CMDS_MAX);
-		total_cmds = ISCSI_TOTAL_CMDS_MAX;
-	}
-
-	if (!is_power_of_2(total_cmds)) {
-		printk(KERN_ERR "iscsi: invalid can_queue of %d. can_queue "
-		       "must be a power of 2.\n", total_cmds);
-		total_cmds = rounddown_pow_of_two(total_cmds);
-		if (total_cmds < ISCSI_TOTAL_CMDS_MIN)
-			goto dec_session_count;
-		printk(KERN_INFO "iscsi: Rounding can_queue to %d.\n",
-		       total_cmds);
-	}
-	scsi_cmds = total_cmds - ISCSI_MGMT_CMDS_MAX;
 
 	cls_session = iscsi_alloc_session(shost, iscsit,
 					  sizeof(struct iscsi_session) +
@@ -2858,7 +2880,7 @@ iscsi_session_setup(struct iscsi_transport *iscsit, struct Scsi_Host *shost,
 	session->lu_reset_timeout = 15;
 	session->abort_timeout = 10;
 	session->scsi_cmds_max = scsi_cmds;
-	session->cmds_max = total_cmds;
+	session->cmds_max = scsi_cmds + ISCSI_MGMT_CMDS_MAX;
 	session->queued_cmdsn = session->cmdsn = initial_cmdsn;
 	session->exp_cmdsn = initial_cmdsn + 1;
 	session->max_cmdsn = initial_cmdsn + 1;
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index 44a9554aea62..02f966e9358f 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -395,6 +395,8 @@ extern struct Scsi_Host *iscsi_host_alloc(struct scsi_host_template *sht,
 extern void iscsi_host_remove(struct Scsi_Host *shost);
 extern void iscsi_host_free(struct Scsi_Host *shost);
 extern int iscsi_target_alloc(struct scsi_target *starget);
+extern int iscsi_host_get_max_scsi_cmds(struct Scsi_Host *shost,
+					uint16_t requested_cmds_max);
 
 /*
  * session management
-- 
cgit v1.2.3


From bf5c9cc8ad7fffd1f72df3baa5870449e4c16d1b Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <uwe@kleine-koenig.org>
Date: Mon, 8 Feb 2021 08:37:05 +0100
Subject: mei: bus: change remove callback to return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver core ignores the return value of mei_cl_device_remove() so
passing an error value doesn't solve any problem. As most mei drivers'
remove callbacks return 0 unconditionally and returning a different value
doesn't have any effect, change this prototype to return void and return 0
unconditionally in mei_cl_device_remove(). The only driver that could
return an error value is modified to emit an explicit warning in the error
case.

Acked-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Uwe Kleine-König <uwe@kleine-koenig.org>
Link: https://lore.kernel.org/r/20210208073705.428185-3-uwe@kleine-koenig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/mei/bus.c           | 5 ++---
 drivers/misc/mei/hdcp/mei_hdcp.c | 7 +++++--
 drivers/nfc/microread/mei.c      | 4 +---
 drivers/nfc/pn544/mei.c          | 4 +---
 drivers/watchdog/mei_wdt.c       | 4 +---
 include/linux/mei_cl_bus.h       | 2 +-
 6 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index 393a41330a6b..580074e32599 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -881,17 +881,16 @@ static int mei_cl_device_remove(struct device *dev)
 {
 	struct mei_cl_device *cldev = to_mei_cl_device(dev);
 	struct mei_cl_driver *cldrv = to_mei_cl_driver(dev->driver);
-	int ret = 0;
 
 	if (cldrv->remove)
-		ret = cldrv->remove(cldev);
+		cldrv->remove(cldev);
 
 	mei_cldev_unregister_callbacks(cldev);
 
 	mei_cl_bus_module_put(cldev);
 	module_put(THIS_MODULE);
 
-	return ret;
+	return 0;
 }
 
 static ssize_t name_show(struct device *dev, struct device_attribute *a,
diff --git a/drivers/misc/mei/hdcp/mei_hdcp.c b/drivers/misc/mei/hdcp/mei_hdcp.c
index 9ae9669e46ea..8447ad4b7d47 100644
--- a/drivers/misc/mei/hdcp/mei_hdcp.c
+++ b/drivers/misc/mei/hdcp/mei_hdcp.c
@@ -845,16 +845,19 @@ enable_err_exit:
 	return ret;
 }
 
-static int mei_hdcp_remove(struct mei_cl_device *cldev)
+static void mei_hdcp_remove(struct mei_cl_device *cldev)
 {
 	struct i915_hdcp_comp_master *comp_master =
 						mei_cldev_get_drvdata(cldev);
+	int ret;
 
 	component_master_del(&cldev->dev, &mei_component_master_ops);
 	kfree(comp_master);
 	mei_cldev_set_drvdata(cldev, NULL);
 
-	return mei_cldev_disable(cldev);
+	ret = mei_cldev_disable(cldev);
+	if (ret)
+		dev_warn(&cldev->dev, "mei_cldev_disable() failed\n");
 }
 
 #define MEI_UUID_HDCP GUID_INIT(0xB638AB7E, 0x94E2, 0x4EA2, 0xA5, \
diff --git a/drivers/nfc/microread/mei.c b/drivers/nfc/microread/mei.c
index 5dad8847a9b3..8fa7771085eb 100644
--- a/drivers/nfc/microread/mei.c
+++ b/drivers/nfc/microread/mei.c
@@ -44,15 +44,13 @@ static int microread_mei_probe(struct mei_cl_device *cldev,
 	return 0;
 }
 
-static int microread_mei_remove(struct mei_cl_device *cldev)
+static void microread_mei_remove(struct mei_cl_device *cldev)
 {
 	struct nfc_mei_phy *phy = mei_cldev_get_drvdata(cldev);
 
 	microread_remove(phy->hdev);
 
 	nfc_mei_phy_free(phy);
-
-	return 0;
 }
 
 static struct mei_cl_device_id microread_mei_tbl[] = {
diff --git a/drivers/nfc/pn544/mei.c b/drivers/nfc/pn544/mei.c
index 579bc599f545..5c10aac085a4 100644
--- a/drivers/nfc/pn544/mei.c
+++ b/drivers/nfc/pn544/mei.c
@@ -42,7 +42,7 @@ static int pn544_mei_probe(struct mei_cl_device *cldev,
 	return 0;
 }
 
-static int pn544_mei_remove(struct mei_cl_device *cldev)
+static void pn544_mei_remove(struct mei_cl_device *cldev)
 {
 	struct nfc_mei_phy *phy = mei_cldev_get_drvdata(cldev);
 
@@ -51,8 +51,6 @@ static int pn544_mei_remove(struct mei_cl_device *cldev)
 	pn544_hci_remove(phy->hdev);
 
 	nfc_mei_phy_free(phy);
-
-	return 0;
 }
 
 static struct mei_cl_device_id pn544_mei_tbl[] = {
diff --git a/drivers/watchdog/mei_wdt.c b/drivers/watchdog/mei_wdt.c
index 5391bf3e6b11..53165e49c298 100644
--- a/drivers/watchdog/mei_wdt.c
+++ b/drivers/watchdog/mei_wdt.c
@@ -619,7 +619,7 @@ err_out:
 	return ret;
 }
 
-static int mei_wdt_remove(struct mei_cl_device *cldev)
+static void mei_wdt_remove(struct mei_cl_device *cldev)
 {
 	struct mei_wdt *wdt = mei_cldev_get_drvdata(cldev);
 
@@ -636,8 +636,6 @@ static int mei_wdt_remove(struct mei_cl_device *cldev)
 	dbgfs_unregister(wdt);
 
 	kfree(wdt);
-
-	return 0;
 }
 
 #define MEI_UUID_WD UUID_LE(0x05B79A6F, 0x4628, 0x4D7F, \
diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index 959ad7d850b4..07f5ef8fc456 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -68,7 +68,7 @@ struct mei_cl_driver {
 
 	int (*probe)(struct mei_cl_device *cldev,
 		     const struct mei_cl_device_id *id);
-	int (*remove)(struct mei_cl_device *cldev);
+	void (*remove)(struct mei_cl_device *cldev);
 };
 
 int __mei_cldev_driver_register(struct mei_cl_driver *cldrv,
-- 
cgit v1.2.3


From 9c5137aedd112f78a968bdd2325de2ea06df46c0 Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:28 +0800
Subject: virt: acrn: Introduce VM management interfaces

The VM management interfaces expose several VM operations to ACRN
userspace via ioctls. For example, creating VM, starting VM, destroying
VM and so on.

The ACRN Hypervisor needs to exchange data with the ACRN userspace
during the VM operations. HSM provides VM operation ioctls to the ACRN
userspace and communicates with the ACRN Hypervisor for VM operations
via hypercalls.

HSM maintains a list of User VM. Each User VM will be bound to an
existing file descriptor of /dev/acrn_hsm. The User VM will be
destroyed when the file descriptor is closed.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-7-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |  1 +
 MAINTAINERS                                        |  1 +
 drivers/virt/acrn/Makefile                         |  2 +-
 drivers/virt/acrn/acrn_drv.h                       | 21 +++++-
 drivers/virt/acrn/hsm.c                            | 76 ++++++++++++++++++++-
 drivers/virt/acrn/hypercall.h                      | 78 ++++++++++++++++++++++
 drivers/virt/acrn/vm.c                             | 68 +++++++++++++++++++
 include/uapi/linux/acrn.h                          | 58 ++++++++++++++++
 8 files changed, 301 insertions(+), 4 deletions(-)
 create mode 100644 drivers/virt/acrn/hypercall.h
 create mode 100644 drivers/virt/acrn/vm.c
 create mode 100644 include/uapi/linux/acrn.h

(limited to 'include')

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 19d730f9d136..76c27dbae96b 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -320,6 +320,7 @@ Code  Seq#    Include File                                           Comments
 0xA0  all    linux/sdp/sdp.h                                         Industrial Device Project
                                                                      <mailto:kenji@bitgate.com>
 0xA1  0      linux/vtpm_proxy.h                                      TPM Emulator Proxy Driver
+0xA2  all    uapi/linux/acrn.h                                       ACRN hypervisor
 0xA3  80-8F                                                          Port ACL  in development:
                                                                      <mailto:tlewis@mindspring.com>
 0xA3  90-9F  linux/dtlk.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 4ac781da514f..6b5990a28b44 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -443,6 +443,7 @@ S:	Supported
 W:	https://projectacrn.org
 F:	Documentation/virt/acrn/
 F:	drivers/virt/acrn/
+F:	include/uapi/linux/acrn.h
 
 AD1889 ALSA SOUND DRIVER
 L:	linux-parisc@vger.kernel.org
diff --git a/drivers/virt/acrn/Makefile b/drivers/virt/acrn/Makefile
index 6920ed798aaf..cf8b4ed5e74e 100644
--- a/drivers/virt/acrn/Makefile
+++ b/drivers/virt/acrn/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ACRN_HSM)	:= acrn.o
-acrn-y := hsm.o
+acrn-y := hsm.o vm.o
diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index 29eedd696327..e5aba86cad8c 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -3,16 +3,35 @@
 #ifndef __ACRN_HSM_DRV_H
 #define __ACRN_HSM_DRV_H
 
+#include <linux/acrn.h>
+#include <linux/dev_printk.h>
+#include <linux/miscdevice.h>
 #include <linux/types.h>
 
+#include "hypercall.h"
+
+extern struct miscdevice acrn_dev;
+
 #define ACRN_INVALID_VMID (0xffffU)
 
+#define ACRN_VM_FLAG_DESTROYED		0U
 /**
  * struct acrn_vm - Properties of ACRN User VM.
+ * @list:	Entry within global list of all VMs
  * @vmid:	User VM ID
+ * @vcpu_num:	Number of virtual CPUs in the VM
+ * @flags:	Flags (ACRN_VM_FLAG_*) of the VM. This is VM flag management
+ *		in HSM which is different from the &acrn_vm_creation.vm_flag.
  */
 struct acrn_vm {
-	u16	vmid;
+	struct list_head	list;
+	u16			vmid;
+	int			vcpu_num;
+	unsigned long		flags;
 };
 
+struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
+			       struct acrn_vm_creation *vm_param);
+int acrn_vm_destroy(struct acrn_vm *vm);
+
 #endif /* __ACRN_HSM_DRV_H */
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index a8dcb250649d..5fd933471683 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -9,7 +9,6 @@
  *	Yakui Zhao <yakui.zhao@intel.com>
  */
 
-#include <linux/miscdevice.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -38,10 +37,82 @@ static int acrn_dev_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+/*
+ * HSM relies on hypercall layer of the ACRN hypervisor to do the
+ * sanity check against the input parameters.
+ */
+static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
+			   unsigned long ioctl_param)
+{
+	struct acrn_vm *vm = filp->private_data;
+	struct acrn_vm_creation *vm_param;
+	int ret = 0;
+
+	if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) {
+		dev_dbg(acrn_dev.this_device,
+			"ioctl 0x%x: Invalid VM state!\n", cmd);
+		return -EINVAL;
+	}
+
+	switch (cmd) {
+	case ACRN_IOCTL_CREATE_VM:
+		vm_param = memdup_user((void __user *)ioctl_param,
+				       sizeof(struct acrn_vm_creation));
+		if (IS_ERR(vm_param))
+			return PTR_ERR(vm_param);
+
+		if ((vm_param->reserved0 | vm_param->reserved1) != 0)
+			return -EINVAL;
+
+		vm = acrn_vm_create(vm, vm_param);
+		if (!vm) {
+			ret = -EINVAL;
+			kfree(vm_param);
+			break;
+		}
+
+		if (copy_to_user((void __user *)ioctl_param, vm_param,
+				 sizeof(struct acrn_vm_creation))) {
+			acrn_vm_destroy(vm);
+			ret = -EFAULT;
+		}
+
+		kfree(vm_param);
+		break;
+	case ACRN_IOCTL_START_VM:
+		ret = hcall_start_vm(vm->vmid);
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to start VM %u!\n", vm->vmid);
+		break;
+	case ACRN_IOCTL_PAUSE_VM:
+		ret = hcall_pause_vm(vm->vmid);
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to pause VM %u!\n", vm->vmid);
+		break;
+	case ACRN_IOCTL_RESET_VM:
+		ret = hcall_reset_vm(vm->vmid);
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to restart VM %u!\n", vm->vmid);
+		break;
+	case ACRN_IOCTL_DESTROY_VM:
+		ret = acrn_vm_destroy(vm);
+		break;
+	default:
+		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
+		ret = -ENOTTY;
+	}
+
+	return ret;
+}
+
 static int acrn_dev_release(struct inode *inode, struct file *filp)
 {
 	struct acrn_vm *vm = filp->private_data;
 
+	acrn_vm_destroy(vm);
 	kfree(vm);
 	return 0;
 }
@@ -50,9 +121,10 @@ static const struct file_operations acrn_fops = {
 	.owner		= THIS_MODULE,
 	.open		= acrn_dev_open,
 	.release	= acrn_dev_release,
+	.unlocked_ioctl = acrn_dev_ioctl,
 };
 
-static struct miscdevice acrn_dev = {
+struct miscdevice acrn_dev = {
 	.minor	= MISC_DYNAMIC_MINOR,
 	.name	= "acrn_hsm",
 	.fops	= &acrn_fops,
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
new file mode 100644
index 000000000000..426b66cadb1f
--- /dev/null
+++ b/drivers/virt/acrn/hypercall.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ACRN HSM: hypercalls of ACRN Hypervisor
+ */
+#ifndef __ACRN_HSM_HYPERCALL_H
+#define __ACRN_HSM_HYPERCALL_H
+#include <asm/acrn.h>
+
+/*
+ * Hypercall IDs of the ACRN Hypervisor
+ */
+#define _HC_ID(x, y) (((x) << 24) | (y))
+
+#define HC_ID 0x80UL
+
+#define HC_ID_VM_BASE			0x10UL
+#define HC_CREATE_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x00)
+#define HC_DESTROY_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x01)
+#define HC_START_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x02)
+#define HC_PAUSE_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x03)
+#define HC_RESET_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x05)
+
+/**
+ * hcall_create_vm() - Create a User VM
+ * @vminfo:	Service VM GPA of info of User VM creation
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_create_vm(u64 vminfo)
+{
+	return acrn_hypercall1(HC_CREATE_VM, vminfo);
+}
+
+/**
+ * hcall_start_vm() - Start a User VM
+ * @vmid:	User VM ID
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_start_vm(u64 vmid)
+{
+	return acrn_hypercall1(HC_START_VM, vmid);
+}
+
+/**
+ * hcall_pause_vm() - Pause a User VM
+ * @vmid:	User VM ID
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_pause_vm(u64 vmid)
+{
+	return acrn_hypercall1(HC_PAUSE_VM, vmid);
+}
+
+/**
+ * hcall_destroy_vm() - Destroy a User VM
+ * @vmid:	User VM ID
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_destroy_vm(u64 vmid)
+{
+	return acrn_hypercall1(HC_DESTROY_VM, vmid);
+}
+
+/**
+ * hcall_reset_vm() - Reset a User VM
+ * @vmid:	User VM ID
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_reset_vm(u64 vmid)
+{
+	return acrn_hypercall1(HC_RESET_VM, vmid);
+}
+
+#endif /* __ACRN_HSM_HYPERCALL_H */
diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c
new file mode 100644
index 000000000000..3f667ac8ac1e
--- /dev/null
+++ b/drivers/virt/acrn/vm.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN_HSM: Virtual Machine management
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * Authors:
+ *	Jason Chen CJ <jason.cj.chen@intel.com>
+ *	Yakui Zhao <yakui.zhao@intel.com>
+ */
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "acrn_drv.h"
+
+/* List of VMs */
+static LIST_HEAD(acrn_vm_list);
+/* To protect acrn_vm_list */
+static DEFINE_MUTEX(acrn_vm_list_lock);
+
+struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
+			       struct acrn_vm_creation *vm_param)
+{
+	int ret;
+
+	ret = hcall_create_vm(virt_to_phys(vm_param));
+	if (ret < 0 || vm_param->vmid == ACRN_INVALID_VMID) {
+		dev_err(acrn_dev.this_device,
+			"Failed to create VM! Error: %d\n", ret);
+		return NULL;
+	}
+
+	vm->vmid = vm_param->vmid;
+	vm->vcpu_num = vm_param->vcpu_num;
+
+	mutex_lock(&acrn_vm_list_lock);
+	list_add(&vm->list, &acrn_vm_list);
+	mutex_unlock(&acrn_vm_list_lock);
+
+	dev_dbg(acrn_dev.this_device, "VM %u created.\n", vm->vmid);
+	return vm;
+}
+
+int acrn_vm_destroy(struct acrn_vm *vm)
+{
+	int ret;
+
+	if (vm->vmid == ACRN_INVALID_VMID ||
+	    test_and_set_bit(ACRN_VM_FLAG_DESTROYED, &vm->flags))
+		return 0;
+
+	/* Remove from global VM list */
+	mutex_lock(&acrn_vm_list_lock);
+	list_del_init(&vm->list);
+	mutex_unlock(&acrn_vm_list_lock);
+
+	ret = hcall_destroy_vm(vm->vmid);
+	if (ret < 0) {
+		dev_err(acrn_dev.this_device,
+			"Failed to destroy VM %u\n", vm->vmid);
+		clear_bit(ACRN_VM_FLAG_DESTROYED, &vm->flags);
+		return ret;
+	}
+	dev_dbg(acrn_dev.this_device, "VM %u destroyed.\n", vm->vmid);
+	vm->vmid = ACRN_INVALID_VMID;
+	return 0;
+}
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
new file mode 100644
index 000000000000..32521168075f
--- /dev/null
+++ b/include/uapi/linux/acrn.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Userspace interface for /dev/acrn_hsm - ACRN Hypervisor Service Module
+ *
+ * This file can be used by applications that need to communicate with the HSM
+ * via the ioctl interface.
+ *
+ * Copyright (C) 2021 Intel Corporation. All rights reserved.
+ */
+
+#ifndef _UAPI_ACRN_H
+#define _UAPI_ACRN_H
+
+#include <linux/types.h>
+#include <linux/uuid.h>
+
+/**
+ * struct acrn_vm_creation - Info to create a User VM
+ * @vmid:		User VM ID returned from the hypervisor
+ * @reserved0:		Reserved and must be 0
+ * @vcpu_num:		Number of vCPU in the VM. Return from hypervisor.
+ * @reserved1:		Reserved and must be 0
+ * @uuid:		UUID of the VM. Pass to hypervisor directly.
+ * @vm_flag:		Flag of the VM creating. Pass to hypervisor directly.
+ * @ioreq_buf:		Service VM GPA of I/O request buffer. Pass to
+ *			hypervisor directly.
+ * @cpu_affinity:	CPU affinity of the VM. Pass to hypervisor directly.
+ * 			It's a bitmap which indicates CPUs used by the VM.
+ */
+struct acrn_vm_creation {
+	__u16	vmid;
+	__u16	reserved0;
+	__u16	vcpu_num;
+	__u16	reserved1;
+	guid_t	uuid;
+	__u64	vm_flag;
+	__u64	ioreq_buf;
+	__u64	cpu_affinity;
+};
+
+/* The ioctl type, documented in ioctl-number.rst */
+#define ACRN_IOCTL_TYPE			0xA2
+
+/*
+ * Common IOCTL IDs definition for ACRN userspace
+ */
+#define ACRN_IOCTL_CREATE_VM		\
+	_IOWR(ACRN_IOCTL_TYPE, 0x10, struct acrn_vm_creation)
+#define ACRN_IOCTL_DESTROY_VM		\
+	_IO(ACRN_IOCTL_TYPE, 0x11)
+#define ACRN_IOCTL_START_VM		\
+	_IO(ACRN_IOCTL_TYPE, 0x12)
+#define ACRN_IOCTL_PAUSE_VM		\
+	_IO(ACRN_IOCTL_TYPE, 0x13)
+#define ACRN_IOCTL_RESET_VM		\
+	_IO(ACRN_IOCTL_TYPE, 0x15)
+
+#endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3


From 2ad2aaee1bc9568d0c146463483d2c926ef20055 Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:29 +0800
Subject: virt: acrn: Introduce an ioctl to set vCPU registers state

A virtual CPU of User VM has different context due to the different
registers state. ACRN userspace needs to set the virtual CPU
registers state (e.g. giving a initial registers state to a virtual
BSP of a User VM).

HSM provides an ioctl ACRN_IOCTL_SET_VCPU_REGS to do the virtual CPU
registers state setting. The ioctl passes the registers state from ACRN
userspace to the hypervisor directly.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-8-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/hsm.c       |  34 +++++++++++-
 drivers/virt/acrn/hypercall.h |  13 +++++
 include/uapi/linux/acrn.h     | 119 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 165 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 5fd933471683..ee5cc7413239 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -9,6 +9,7 @@
  *	Yakui Zhao <yakui.zhao@intel.com>
  */
 
+#include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -46,7 +47,8 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 {
 	struct acrn_vm *vm = filp->private_data;
 	struct acrn_vm_creation *vm_param;
-	int ret = 0;
+	struct acrn_vcpu_regs *cpu_regs;
+	int i, ret = 0;
 
 	if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) {
 		dev_dbg(acrn_dev.this_device,
@@ -100,6 +102,36 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	case ACRN_IOCTL_DESTROY_VM:
 		ret = acrn_vm_destroy(vm);
 		break;
+	case ACRN_IOCTL_SET_VCPU_REGS:
+		cpu_regs = memdup_user((void __user *)ioctl_param,
+				       sizeof(struct acrn_vcpu_regs));
+		if (IS_ERR(cpu_regs))
+			return PTR_ERR(cpu_regs);
+
+		for (i = 0; i < ARRAY_SIZE(cpu_regs->reserved); i++)
+			if (cpu_regs->reserved[i])
+				return -EINVAL;
+
+		for (i = 0; i < ARRAY_SIZE(cpu_regs->vcpu_regs.reserved_32); i++)
+			if (cpu_regs->vcpu_regs.reserved_32[i])
+				return -EINVAL;
+
+		for (i = 0; i < ARRAY_SIZE(cpu_regs->vcpu_regs.reserved_64); i++)
+			if (cpu_regs->vcpu_regs.reserved_64[i])
+				return -EINVAL;
+
+		for (i = 0; i < ARRAY_SIZE(cpu_regs->vcpu_regs.gdt.reserved); i++)
+			if (cpu_regs->vcpu_regs.gdt.reserved[i] |
+			    cpu_regs->vcpu_regs.idt.reserved[i])
+				return -EINVAL;
+
+		ret = hcall_set_vcpu_regs(vm->vmid, virt_to_phys(cpu_regs));
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to set regs state of VM%u!\n",
+				vm->vmid);
+		kfree(cpu_regs);
+		break;
 	default:
 		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
 		ret = -ENOTTY;
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
index 426b66cadb1f..f29cfae08862 100644
--- a/drivers/virt/acrn/hypercall.h
+++ b/drivers/virt/acrn/hypercall.h
@@ -19,6 +19,7 @@
 #define HC_START_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x02)
 #define HC_PAUSE_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x03)
 #define HC_RESET_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x05)
+#define HC_SET_VCPU_REGS		_HC_ID(HC_ID, HC_ID_VM_BASE + 0x06)
 
 /**
  * hcall_create_vm() - Create a User VM
@@ -75,4 +76,16 @@ static inline long hcall_reset_vm(u64 vmid)
 	return acrn_hypercall1(HC_RESET_VM, vmid);
 }
 
+/**
+ * hcall_set_vcpu_regs() - Set up registers of virtual CPU of a User VM
+ * @vmid:	User VM ID
+ * @regs_state:	Service VM GPA of registers state
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_set_vcpu_regs(u64 vmid, u64 regs_state)
+{
+	return acrn_hypercall2(HC_SET_VCPU_REGS, vmid, regs_state);
+}
+
 #endif /* __ACRN_HSM_HYPERCALL_H */
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index 32521168075f..775c58bad026 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -38,6 +38,123 @@ struct acrn_vm_creation {
 	__u64	cpu_affinity;
 };
 
+/**
+ * struct acrn_gp_regs - General registers of a User VM
+ * @rax:	Value of register RAX
+ * @rcx:	Value of register RCX
+ * @rdx:	Value of register RDX
+ * @rbx:	Value of register RBX
+ * @rsp:	Value of register RSP
+ * @rbp:	Value of register RBP
+ * @rsi:	Value of register RSI
+ * @rdi:	Value of register RDI
+ * @r8:		Value of register R8
+ * @r9:		Value of register R9
+ * @r10:	Value of register R10
+ * @r11:	Value of register R11
+ * @r12:	Value of register R12
+ * @r13:	Value of register R13
+ * @r14:	Value of register R14
+ * @r15:	Value of register R15
+ */
+struct acrn_gp_regs {
+	__le64	rax;
+	__le64	rcx;
+	__le64	rdx;
+	__le64	rbx;
+	__le64	rsp;
+	__le64	rbp;
+	__le64	rsi;
+	__le64	rdi;
+	__le64	r8;
+	__le64	r9;
+	__le64	r10;
+	__le64	r11;
+	__le64	r12;
+	__le64	r13;
+	__le64	r14;
+	__le64	r15;
+};
+
+/**
+ * struct acrn_descriptor_ptr - Segment descriptor table of a User VM.
+ * @limit:	Limit field.
+ * @base:	Base field.
+ * @reserved:	Reserved and must be 0.
+ */
+struct acrn_descriptor_ptr {
+	__le16	limit;
+	__le64	base;
+	__le16	reserved[3];
+} __attribute__ ((__packed__));
+
+/**
+ * struct acrn_regs - Registers structure of a User VM
+ * @gprs:		General registers
+ * @gdt:		Global Descriptor Table
+ * @idt:		Interrupt Descriptor Table
+ * @rip:		Value of register RIP
+ * @cs_base:		Base of code segment selector
+ * @cr0:		Value of register CR0
+ * @cr4:		Value of register CR4
+ * @cr3:		Value of register CR3
+ * @ia32_efer:		Value of IA32_EFER MSR
+ * @rflags:		Value of regsiter RFLAGS
+ * @reserved_64:	Reserved and must be 0
+ * @cs_ar:		Attribute field of code segment selector
+ * @cs_limit:		Limit field of code segment selector
+ * @reserved_32:	Reserved and must be 0
+ * @cs_sel:		Value of code segment selector
+ * @ss_sel:		Value of stack segment selector
+ * @ds_sel:		Value of data segment selector
+ * @es_sel:		Value of extra segment selector
+ * @fs_sel:		Value of FS selector
+ * @gs_sel:		Value of GS selector
+ * @ldt_sel:		Value of LDT descriptor selector
+ * @tr_sel:		Value of TSS descriptor selector
+ */
+struct acrn_regs {
+	struct acrn_gp_regs		gprs;
+	struct acrn_descriptor_ptr	gdt;
+	struct acrn_descriptor_ptr	idt;
+
+	__le64				rip;
+	__le64				cs_base;
+	__le64				cr0;
+	__le64				cr4;
+	__le64				cr3;
+	__le64				ia32_efer;
+	__le64				rflags;
+	__le64				reserved_64[4];
+
+	__le32				cs_ar;
+	__le32				cs_limit;
+	__le32				reserved_32[3];
+
+	__le16				cs_sel;
+	__le16				ss_sel;
+	__le16				ds_sel;
+	__le16				es_sel;
+	__le16				fs_sel;
+	__le16				gs_sel;
+	__le16				ldt_sel;
+	__le16				tr_sel;
+};
+
+/**
+ * struct acrn_vcpu_regs - Info of vCPU registers state
+ * @vcpu_id:	vCPU ID
+ * @reserved:	Reserved and must be 0
+ * @vcpu_regs:	vCPU registers state
+ *
+ * This structure will be passed to hypervisor directly.
+ */
+struct acrn_vcpu_regs {
+	__u16			vcpu_id;
+	__u16			reserved[3];
+	struct acrn_regs	vcpu_regs;
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -54,5 +171,7 @@ struct acrn_vm_creation {
 	_IO(ACRN_IOCTL_TYPE, 0x13)
 #define ACRN_IOCTL_RESET_VM		\
 	_IO(ACRN_IOCTL_TYPE, 0x15)
+#define ACRN_IOCTL_SET_VCPU_REGS	\
+	_IOW(ACRN_IOCTL_TYPE, 0x16, struct acrn_vcpu_regs)
 
 #endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3


From 88f537d5e8ddc89c2622f4a2bc1eb28455e8339c Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:30 +0800
Subject: virt: acrn: Introduce EPT mapping management

The HSM provides hypervisor services to the ACRN userspace. While
launching a User VM, ACRN userspace needs to allocate memory and request
the ACRN Hypervisor to set up the EPT mapping for the VM.

A mapping cache is introduced for accelerating the translation between
the Service VM kernel virtual address and User VM physical address.

>From the perspective of the hypervisor, the types of GPA of User VM can be
listed as following:
   1) RAM region, which is used by User VM as system ram.
   2) MMIO region, which is recognized by User VM as MMIO. MMIO region is
      used to be utilized for devices emulation.

Generally, User VM RAM regions mapping is set up before VM started and
is released in the User VM destruction. MMIO regions mapping may be set
and unset dynamically during User VM running.

To achieve this, ioctls ACRN_IOCTL_SET_MEMSEG and ACRN_IOCTL_UNSET_MEMSEG
are introduced in HSM.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-9-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/Makefile    |   2 +-
 drivers/virt/acrn/acrn_drv.h  |  96 +++++++++++--
 drivers/virt/acrn/hsm.c       |  15 +++
 drivers/virt/acrn/hypercall.h |  14 ++
 drivers/virt/acrn/mm.c        | 306 ++++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/vm.c        |   4 +
 include/uapi/linux/acrn.h     |  49 +++++++
 7 files changed, 476 insertions(+), 10 deletions(-)
 create mode 100644 drivers/virt/acrn/mm.c

(limited to 'include')

diff --git a/drivers/virt/acrn/Makefile b/drivers/virt/acrn/Makefile
index cf8b4ed5e74e..38bc44b6edcd 100644
--- a/drivers/virt/acrn/Makefile
+++ b/drivers/virt/acrn/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ACRN_HSM)	:= acrn.o
-acrn-y := hsm.o vm.o
+acrn-y := hsm.o vm.o mm.o
diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index e5aba86cad8c..e47a45280eea 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -12,26 +12,104 @@
 
 extern struct miscdevice acrn_dev;
 
+#define ACRN_MEM_MAPPING_MAX	256
+
+#define ACRN_MEM_REGION_ADD	0
+#define ACRN_MEM_REGION_DEL	2
+/**
+ * struct vm_memory_region_op - Hypervisor memory operation
+ * @type:		Operation type (ACRN_MEM_REGION_*)
+ * @attr:		Memory attribute (ACRN_MEM_TYPE_* | ACRN_MEM_ACCESS_*)
+ * @user_vm_pa:		Physical address of User VM to be mapped.
+ * @service_vm_pa:	Physical address of Service VM to be mapped.
+ * @size:		Size of this region.
+ *
+ * Structure containing needed information that is provided to ACRN Hypervisor
+ * to manage the EPT mappings of a single memory region of the User VM. Several
+ * &struct vm_memory_region_op can be batched to ACRN Hypervisor, see &struct
+ * vm_memory_region_batch.
+ */
+struct vm_memory_region_op {
+	u32	type;
+	u32	attr;
+	u64	user_vm_pa;
+	u64	service_vm_pa;
+	u64	size;
+};
+
+/**
+ * struct vm_memory_region_batch - A batch of vm_memory_region_op.
+ * @vmid:		A User VM ID.
+ * @reserved:		Reserved.
+ * @regions_num:	The number of vm_memory_region_op.
+ * @regions_gpa:	Physical address of a vm_memory_region_op array.
+ *
+ * HC_VM_SET_MEMORY_REGIONS uses this structure to manage EPT mappings of
+ * multiple memory regions of a User VM. A &struct vm_memory_region_batch
+ * contains multiple &struct vm_memory_region_op for batch processing in the
+ * ACRN Hypervisor.
+ */
+struct vm_memory_region_batch {
+	u16	vmid;
+	u16	reserved[3];
+	u32	regions_num;
+	u64	regions_gpa;
+};
+
+/**
+ * struct vm_memory_mapping - Memory map between a User VM and the Service VM
+ * @pages:		Pages in Service VM kernel.
+ * @npages:		Number of pages.
+ * @service_vm_va:	Virtual address in Service VM kernel.
+ * @user_vm_pa:		Physical address in User VM.
+ * @size:		Size of this memory region.
+ *
+ * HSM maintains memory mappings between a User VM GPA and the Service VM
+ * kernel VA for accelerating the User VM GPA translation.
+ */
+struct vm_memory_mapping {
+	struct page	**pages;
+	int		npages;
+	void		*service_vm_va;
+	u64		user_vm_pa;
+	size_t		size;
+};
+
 #define ACRN_INVALID_VMID (0xffffU)
 
 #define ACRN_VM_FLAG_DESTROYED		0U
 /**
  * struct acrn_vm - Properties of ACRN User VM.
- * @list:	Entry within global list of all VMs
- * @vmid:	User VM ID
- * @vcpu_num:	Number of virtual CPUs in the VM
- * @flags:	Flags (ACRN_VM_FLAG_*) of the VM. This is VM flag management
- *		in HSM which is different from the &acrn_vm_creation.vm_flag.
+ * @list:			Entry within global list of all VMs.
+ * @vmid:			User VM ID.
+ * @vcpu_num:			Number of virtual CPUs in the VM.
+ * @flags:			Flags (ACRN_VM_FLAG_*) of the VM. This is VM
+ *				flag management in HSM which is different
+ *				from the &acrn_vm_creation.vm_flag.
+ * @regions_mapping_lock:	Lock to protect &acrn_vm.regions_mapping and
+ *				&acrn_vm.regions_mapping_count.
+ * @regions_mapping:		Memory mappings of this VM.
+ * @regions_mapping_count:	Number of memory mapping of this VM.
  */
 struct acrn_vm {
-	struct list_head	list;
-	u16			vmid;
-	int			vcpu_num;
-	unsigned long		flags;
+	struct list_head		list;
+	u16				vmid;
+	int				vcpu_num;
+	unsigned long			flags;
+	struct mutex			regions_mapping_lock;
+	struct vm_memory_mapping	regions_mapping[ACRN_MEM_MAPPING_MAX];
+	int				regions_mapping_count;
 };
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
 			       struct acrn_vm_creation *vm_param);
 int acrn_vm_destroy(struct acrn_vm *vm);
+int acrn_mm_region_add(struct acrn_vm *vm, u64 user_gpa, u64 service_gpa,
+		       u64 size, u32 mem_type, u32 mem_access_right);
+int acrn_mm_region_del(struct acrn_vm *vm, u64 user_gpa, u64 size);
+int acrn_vm_memseg_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap);
+int acrn_vm_memseg_unmap(struct acrn_vm *vm, struct acrn_vm_memmap *memmap);
+int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap);
+void acrn_vm_all_ram_unmap(struct acrn_vm *vm);
 
 #endif /* __ACRN_HSM_DRV_H */
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index ee5cc7413239..2c40d3dc5e94 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -48,6 +48,7 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_vm *vm = filp->private_data;
 	struct acrn_vm_creation *vm_param;
 	struct acrn_vcpu_regs *cpu_regs;
+	struct acrn_vm_memmap memmap;
 	int i, ret = 0;
 
 	if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) {
@@ -132,6 +133,20 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 				vm->vmid);
 		kfree(cpu_regs);
 		break;
+	case ACRN_IOCTL_SET_MEMSEG:
+		if (copy_from_user(&memmap, (void __user *)ioctl_param,
+				   sizeof(memmap)))
+			return -EFAULT;
+
+		ret = acrn_vm_memseg_map(vm, &memmap);
+		break;
+	case ACRN_IOCTL_UNSET_MEMSEG:
+		if (copy_from_user(&memmap, (void __user *)ioctl_param,
+				   sizeof(memmap)))
+			return -EFAULT;
+
+		ret = acrn_vm_memseg_unmap(vm, &memmap);
+		break;
 	default:
 		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
 		ret = -ENOTTY;
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
index f29cfae08862..a1a70a071713 100644
--- a/drivers/virt/acrn/hypercall.h
+++ b/drivers/virt/acrn/hypercall.h
@@ -21,6 +21,9 @@
 #define HC_RESET_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x05)
 #define HC_SET_VCPU_REGS		_HC_ID(HC_ID, HC_ID_VM_BASE + 0x06)
 
+#define HC_ID_MEM_BASE			0x40UL
+#define HC_VM_SET_MEMORY_REGIONS	_HC_ID(HC_ID, HC_ID_MEM_BASE + 0x02)
+
 /**
  * hcall_create_vm() - Create a User VM
  * @vminfo:	Service VM GPA of info of User VM creation
@@ -88,4 +91,15 @@ static inline long hcall_set_vcpu_regs(u64 vmid, u64 regs_state)
 	return acrn_hypercall2(HC_SET_VCPU_REGS, vmid, regs_state);
 }
 
+/**
+ * hcall_set_memory_regions() - Inform the hypervisor to set up EPT mappings
+ * @regions_pa:	Service VM GPA of &struct vm_memory_region_batch
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_set_memory_regions(u64 regions_pa)
+{
+	return acrn_hypercall1(HC_VM_SET_MEMORY_REGIONS, regions_pa);
+}
+
 #endif /* __ACRN_HSM_HYPERCALL_H */
diff --git a/drivers/virt/acrn/mm.c b/drivers/virt/acrn/mm.c
new file mode 100644
index 000000000000..c4f2e15c8a2b
--- /dev/null
+++ b/drivers/virt/acrn/mm.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN: Memory mapping management
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * Authors:
+ *	Fei Li <lei1.li@intel.com>
+ *	Shuo Liu <shuo.a.liu@intel.com>
+ */
+
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "acrn_drv.h"
+
+static int modify_region(struct acrn_vm *vm, struct vm_memory_region_op *region)
+{
+	struct vm_memory_region_batch *regions;
+	int ret;
+
+	regions = kzalloc(sizeof(*regions), GFP_KERNEL);
+	if (!regions)
+		return -ENOMEM;
+
+	regions->vmid = vm->vmid;
+	regions->regions_num = 1;
+	regions->regions_gpa = virt_to_phys(region);
+
+	ret = hcall_set_memory_regions(virt_to_phys(regions));
+	if (ret < 0)
+		dev_dbg(acrn_dev.this_device,
+			"Failed to set memory region for VM[%u]!\n", vm->vmid);
+
+	kfree(regions);
+	return ret;
+}
+
+/**
+ * acrn_mm_region_add() - Set up the EPT mapping of a memory region.
+ * @vm:			User VM.
+ * @user_gpa:		A GPA of User VM.
+ * @service_gpa:	A GPA of Service VM.
+ * @size:		Size of the region.
+ * @mem_type:		Combination of ACRN_MEM_TYPE_*.
+ * @mem_access_right:	Combination of ACRN_MEM_ACCESS_*.
+ *
+ * Return: 0 on success, <0 on error.
+ */
+int acrn_mm_region_add(struct acrn_vm *vm, u64 user_gpa, u64 service_gpa,
+		       u64 size, u32 mem_type, u32 mem_access_right)
+{
+	struct vm_memory_region_op *region;
+	int ret = 0;
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	region->type = ACRN_MEM_REGION_ADD;
+	region->user_vm_pa = user_gpa;
+	region->service_vm_pa = service_gpa;
+	region->size = size;
+	region->attr = ((mem_type & ACRN_MEM_TYPE_MASK) |
+			(mem_access_right & ACRN_MEM_ACCESS_RIGHT_MASK));
+	ret = modify_region(vm, region);
+
+	dev_dbg(acrn_dev.this_device,
+		"%s: user-GPA[%pK] service-GPA[%pK] size[0x%llx].\n",
+		__func__, (void *)user_gpa, (void *)service_gpa, size);
+	kfree(region);
+	return ret;
+}
+
+/**
+ * acrn_mm_region_del() - Del the EPT mapping of a memory region.
+ * @vm:		User VM.
+ * @user_gpa:	A GPA of the User VM.
+ * @size:	Size of the region.
+ *
+ * Return: 0 on success, <0 for error.
+ */
+int acrn_mm_region_del(struct acrn_vm *vm, u64 user_gpa, u64 size)
+{
+	struct vm_memory_region_op *region;
+	int ret = 0;
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	region->type = ACRN_MEM_REGION_DEL;
+	region->user_vm_pa = user_gpa;
+	region->service_vm_pa = 0UL;
+	region->size = size;
+	region->attr = 0U;
+
+	ret = modify_region(vm, region);
+
+	dev_dbg(acrn_dev.this_device, "%s: user-GPA[%pK] size[0x%llx].\n",
+		__func__, (void *)user_gpa, size);
+	kfree(region);
+	return ret;
+}
+
+int acrn_vm_memseg_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
+{
+	int ret;
+
+	if (memmap->type == ACRN_MEMMAP_RAM)
+		return acrn_vm_ram_map(vm, memmap);
+
+	if (memmap->type != ACRN_MEMMAP_MMIO) {
+		dev_dbg(acrn_dev.this_device,
+			"Invalid memmap type: %u\n", memmap->type);
+		return -EINVAL;
+	}
+
+	ret = acrn_mm_region_add(vm, memmap->user_vm_pa,
+				 memmap->service_vm_pa, memmap->len,
+				 ACRN_MEM_TYPE_UC, memmap->attr);
+	if (ret < 0)
+		dev_dbg(acrn_dev.this_device,
+			"Add memory region failed, VM[%u]!\n", vm->vmid);
+
+	return ret;
+}
+
+int acrn_vm_memseg_unmap(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
+{
+	int ret;
+
+	if (memmap->type != ACRN_MEMMAP_MMIO) {
+		dev_dbg(acrn_dev.this_device,
+			"Invalid memmap type: %u\n", memmap->type);
+		return -EINVAL;
+	}
+
+	ret = acrn_mm_region_del(vm, memmap->user_vm_pa, memmap->len);
+	if (ret < 0)
+		dev_dbg(acrn_dev.this_device,
+			"Del memory region failed, VM[%u]!\n", vm->vmid);
+
+	return ret;
+}
+
+/**
+ * acrn_vm_ram_map() - Create a RAM EPT mapping of User VM.
+ * @vm:		The User VM pointer
+ * @memmap:	Info of the EPT mapping
+ *
+ * Return: 0 on success, <0 for error.
+ */
+int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
+{
+	struct vm_memory_region_batch *regions_info;
+	int nr_pages, i = 0, order, nr_regions = 0;
+	struct vm_memory_mapping *region_mapping;
+	struct vm_memory_region_op *vm_region;
+	struct page **pages = NULL, *page;
+	void *remap_vaddr;
+	int ret, pinned;
+	u64 user_vm_pa;
+
+	if (!vm || !memmap)
+		return -EINVAL;
+
+	/* Get the page number of the map region */
+	nr_pages = memmap->len >> PAGE_SHIFT;
+	pages = vzalloc(nr_pages * sizeof(struct page *));
+	if (!pages)
+		return -ENOMEM;
+
+	/* Lock the pages of user memory map region */
+	pinned = pin_user_pages_fast(memmap->vma_base,
+				     nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+				     pages);
+	if (pinned < 0) {
+		ret = pinned;
+		goto free_pages;
+	} else if (pinned != nr_pages) {
+		ret = -EFAULT;
+		goto put_pages;
+	}
+
+	/* Create a kernel map for the map region */
+	remap_vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!remap_vaddr) {
+		ret = -ENOMEM;
+		goto put_pages;
+	}
+
+	/* Record Service VM va <-> User VM pa mapping */
+	mutex_lock(&vm->regions_mapping_lock);
+	region_mapping = &vm->regions_mapping[vm->regions_mapping_count];
+	if (vm->regions_mapping_count < ACRN_MEM_MAPPING_MAX) {
+		region_mapping->pages = pages;
+		region_mapping->npages = nr_pages;
+		region_mapping->size = memmap->len;
+		region_mapping->service_vm_va = remap_vaddr;
+		region_mapping->user_vm_pa = memmap->user_vm_pa;
+		vm->regions_mapping_count++;
+	} else {
+		dev_warn(acrn_dev.this_device,
+			"Run out of memory mapping slots!\n");
+		ret = -ENOMEM;
+		mutex_unlock(&vm->regions_mapping_lock);
+		goto unmap_no_count;
+	}
+	mutex_unlock(&vm->regions_mapping_lock);
+
+	/* Calculate count of vm_memory_region_op */
+	while (i < nr_pages) {
+		page = pages[i];
+		VM_BUG_ON_PAGE(PageTail(page), page);
+		order = compound_order(page);
+		nr_regions++;
+		i += 1 << order;
+	}
+
+	/* Prepare the vm_memory_region_batch */
+	regions_info = kzalloc(sizeof(*regions_info) +
+			       sizeof(*vm_region) * nr_regions,
+			       GFP_KERNEL);
+	if (!regions_info) {
+		ret = -ENOMEM;
+		goto unmap_kernel_map;
+	}
+
+	/* Fill each vm_memory_region_op */
+	vm_region = (struct vm_memory_region_op *)(regions_info + 1);
+	regions_info->vmid = vm->vmid;
+	regions_info->regions_num = nr_regions;
+	regions_info->regions_gpa = virt_to_phys(vm_region);
+	user_vm_pa = memmap->user_vm_pa;
+	i = 0;
+	while (i < nr_pages) {
+		u32 region_size;
+
+		page = pages[i];
+		VM_BUG_ON_PAGE(PageTail(page), page);
+		order = compound_order(page);
+		region_size = PAGE_SIZE << order;
+		vm_region->type = ACRN_MEM_REGION_ADD;
+		vm_region->user_vm_pa = user_vm_pa;
+		vm_region->service_vm_pa = page_to_phys(page);
+		vm_region->size = region_size;
+		vm_region->attr = (ACRN_MEM_TYPE_WB & ACRN_MEM_TYPE_MASK) |
+				  (memmap->attr & ACRN_MEM_ACCESS_RIGHT_MASK);
+
+		vm_region++;
+		user_vm_pa += region_size;
+		i += 1 << order;
+	}
+
+	/* Inform the ACRN Hypervisor to set up EPT mappings */
+	ret = hcall_set_memory_regions(virt_to_phys(regions_info));
+	if (ret < 0) {
+		dev_dbg(acrn_dev.this_device,
+			"Failed to set regions, VM[%u]!\n", vm->vmid);
+		goto unset_region;
+	}
+	kfree(regions_info);
+
+	dev_dbg(acrn_dev.this_device,
+		"%s: VM[%u] service-GVA[%pK] user-GPA[%pK] size[0x%llx]\n",
+		__func__, vm->vmid,
+		remap_vaddr, (void *)memmap->user_vm_pa, memmap->len);
+	return ret;
+
+unset_region:
+	kfree(regions_info);
+unmap_kernel_map:
+	mutex_lock(&vm->regions_mapping_lock);
+	vm->regions_mapping_count--;
+	mutex_unlock(&vm->regions_mapping_lock);
+unmap_no_count:
+	vunmap(remap_vaddr);
+put_pages:
+	for (i = 0; i < pinned; i++)
+		unpin_user_page(pages[i]);
+free_pages:
+	vfree(pages);
+	return ret;
+}
+
+/**
+ * acrn_vm_all_ram_unmap() - Destroy a RAM EPT mapping of User VM.
+ * @vm:	The User VM
+ */
+void acrn_vm_all_ram_unmap(struct acrn_vm *vm)
+{
+	struct vm_memory_mapping *region_mapping;
+	int i, j;
+
+	mutex_lock(&vm->regions_mapping_lock);
+	for (i = 0; i < vm->regions_mapping_count; i++) {
+		region_mapping = &vm->regions_mapping[i];
+		vunmap(region_mapping->service_vm_va);
+		for (j = 0; j < region_mapping->npages; j++)
+			unpin_user_page(region_mapping->pages[j]);
+		vfree(region_mapping->pages);
+	}
+	mutex_unlock(&vm->regions_mapping_lock);
+}
diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c
index 3f667ac8ac1e..ff5df7acb551 100644
--- a/drivers/virt/acrn/vm.c
+++ b/drivers/virt/acrn/vm.c
@@ -31,6 +31,7 @@ struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
 		return NULL;
 	}
 
+	mutex_init(&vm->regions_mapping_lock);
 	vm->vmid = vm_param->vmid;
 	vm->vcpu_num = vm_param->vcpu_num;
 
@@ -62,6 +63,9 @@ int acrn_vm_destroy(struct acrn_vm *vm)
 		clear_bit(ACRN_VM_FLAG_DESTROYED, &vm->flags);
 		return ret;
 	}
+
+	acrn_vm_all_ram_unmap(vm);
+
 	dev_dbg(acrn_dev.this_device, "VM %u destroyed.\n", vm->vmid);
 	vm->vmid = ACRN_INVALID_VMID;
 	return 0;
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index 775c58bad026..ec4c61e7c170 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -155,6 +155,50 @@ struct acrn_vcpu_regs {
 	struct acrn_regs	vcpu_regs;
 };
 
+#define	ACRN_MEM_ACCESS_RIGHT_MASK	0x00000007U
+#define	ACRN_MEM_ACCESS_READ		0x00000001U
+#define	ACRN_MEM_ACCESS_WRITE		0x00000002U
+#define	ACRN_MEM_ACCESS_EXEC		0x00000004U
+#define	ACRN_MEM_ACCESS_RWX		(ACRN_MEM_ACCESS_READ  | \
+					 ACRN_MEM_ACCESS_WRITE | \
+					 ACRN_MEM_ACCESS_EXEC)
+
+#define	ACRN_MEM_TYPE_MASK		0x000007C0U
+#define	ACRN_MEM_TYPE_WB		0x00000040U
+#define	ACRN_MEM_TYPE_WT		0x00000080U
+#define	ACRN_MEM_TYPE_UC		0x00000100U
+#define	ACRN_MEM_TYPE_WC		0x00000200U
+#define	ACRN_MEM_TYPE_WP		0x00000400U
+
+/* Memory mapping types */
+#define	ACRN_MEMMAP_RAM			0
+#define	ACRN_MEMMAP_MMIO		1
+
+/**
+ * struct acrn_vm_memmap - A EPT memory mapping info for a User VM.
+ * @type:		Type of the memory mapping (ACRM_MEMMAP_*).
+ *			Pass to hypervisor directly.
+ * @attr:		Attribute of the memory mapping.
+ *			Pass to hypervisor directly.
+ * @user_vm_pa:		Physical address of User VM.
+ *			Pass to hypervisor directly.
+ * @service_vm_pa:	Physical address of Service VM.
+ *			Pass to hypervisor directly.
+ * @vma_base:		VMA address of Service VM. Pass to hypervisor directly.
+ * @len:		Length of the memory mapping.
+ *			Pass to hypervisor directly.
+ */
+struct acrn_vm_memmap {
+	__u32	type;
+	__u32	attr;
+	__u64	user_vm_pa;
+	union {
+		__u64	service_vm_pa;
+		__u64	vma_base;
+	};
+	__u64	len;
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -174,4 +218,9 @@ struct acrn_vcpu_regs {
 #define ACRN_IOCTL_SET_VCPU_REGS	\
 	_IOW(ACRN_IOCTL_TYPE, 0x16, struct acrn_vcpu_regs)
 
+#define ACRN_IOCTL_SET_MEMSEG		\
+	_IOW(ACRN_IOCTL_TYPE, 0x41, struct acrn_vm_memmap)
+#define ACRN_IOCTL_UNSET_MEMSEG		\
+	_IOW(ACRN_IOCTL_TYPE, 0x42, struct acrn_vm_memmap)
+
 #endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3


From 72f293de3ff40b57db573c1bf623f494f3446f74 Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:31 +0800
Subject: virt: acrn: Introduce I/O request management

An I/O request of a User VM, which is constructed by the hypervisor, is
distributed by the ACRN Hypervisor Service Module to an I/O client
corresponding to the address range of the I/O request.

For each User VM, there is a shared 4-KByte memory region used for I/O
requests communication between the hypervisor and Service VM. An I/O
request is a 256-byte structure buffer, which is 'struct
acrn_io_request', that is filled by an I/O handler of the hypervisor
when a trapped I/O access happens in a User VM. ACRN userspace in the
Service VM first allocates a 4-KByte page and passes the GPA (Guest
Physical Address) of the buffer to the hypervisor. The buffer is used as
an array of 16 I/O request slots with each I/O request slot being 256
bytes. This array is indexed by vCPU ID.

An I/O client, which is 'struct acrn_ioreq_client', is responsible for
handling User VM I/O requests whose accessed GPA falls in a certain
range. Multiple I/O clients can be associated with each User VM. There
is a special client associated with each User VM, called the default
client, that handles all I/O requests that do not fit into the range of
any other I/O clients. The ACRN userspace acts as the default client for
each User VM.

The state transitions of a ACRN I/O request are as follows.

   FREE -> PENDING -> PROCESSING -> COMPLETE -> FREE -> ...

FREE: this I/O request slot is empty
PENDING: a valid I/O request is pending in this slot
PROCESSING: the I/O request is being processed
COMPLETE: the I/O request has been processed

An I/O request in COMPLETE or FREE state is owned by the hypervisor. HSM
and ACRN userspace are in charge of processing the others.

The processing flow of I/O requests are listed as following:

a) The I/O handler of the hypervisor will fill an I/O request with
   PENDING state when a trapped I/O access happens in a User VM.
b) The hypervisor makes an upcall, which is a notification interrupt, to
   the Service VM.
c) The upcall handler schedules a worker to dispatch I/O requests.
d) The worker looks for the PENDING I/O requests, assigns them to
   different registered clients based on the address of the I/O accesses,
   updates their state to PROCESSING, and notifies the corresponding
   client to handle.
e) The notified client handles the assigned I/O requests.
f) The HSM updates I/O requests states to COMPLETE and notifies the
   hypervisor of the completion via hypercalls.

Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-10-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/Makefile    |   2 +-
 drivers/virt/acrn/acrn_drv.h  |  82 +++++++
 drivers/virt/acrn/hsm.c       |  43 +++-
 drivers/virt/acrn/hypercall.h |  28 +++
 drivers/virt/acrn/ioreq.c     | 521 ++++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/vm.c        |  27 ++-
 include/uapi/linux/acrn.h     | 150 ++++++++++++
 7 files changed, 843 insertions(+), 10 deletions(-)
 create mode 100644 drivers/virt/acrn/ioreq.c

(limited to 'include')

diff --git a/drivers/virt/acrn/Makefile b/drivers/virt/acrn/Makefile
index 38bc44b6edcd..21721cbf6a80 100644
--- a/drivers/virt/acrn/Makefile
+++ b/drivers/virt/acrn/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ACRN_HSM)	:= acrn.o
-acrn-y := hsm.o vm.o mm.o
+acrn-y := hsm.o vm.o mm.o ioreq.o
diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index e47a45280eea..68bd8db6c8be 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -12,10 +12,15 @@
 
 extern struct miscdevice acrn_dev;
 
+#define ACRN_NAME_LEN		16
 #define ACRN_MEM_MAPPING_MAX	256
 
 #define ACRN_MEM_REGION_ADD	0
 #define ACRN_MEM_REGION_DEL	2
+
+struct acrn_vm;
+struct acrn_ioreq_client;
+
 /**
  * struct vm_memory_region_op - Hypervisor memory operation
  * @type:		Operation type (ACRN_MEM_REGION_*)
@@ -75,9 +80,63 @@ struct vm_memory_mapping {
 	size_t		size;
 };
 
+/**
+ * struct acrn_ioreq_buffer - Data for setting the ioreq buffer of User VM
+ * @ioreq_buf:	The GPA of the IO request shared buffer of a VM
+ *
+ * The parameter for the HC_SET_IOREQ_BUFFER hypercall used to set up
+ * the shared I/O request buffer between Service VM and ACRN hypervisor.
+ */
+struct acrn_ioreq_buffer {
+	u64	ioreq_buf;
+};
+
+struct acrn_ioreq_range {
+	struct list_head	list;
+	u32			type;
+	u64			start;
+	u64			end;
+};
+
+#define ACRN_IOREQ_CLIENT_DESTROYING	0U
+typedef	int (*ioreq_handler_t)(struct acrn_ioreq_client *client,
+			       struct acrn_io_request *req);
+/**
+ * struct acrn_ioreq_client - Structure of I/O client.
+ * @name:	Client name
+ * @vm:		The VM that the client belongs to
+ * @list:	List node for this acrn_ioreq_client
+ * @is_default:	If this client is the default one
+ * @flags:	Flags (ACRN_IOREQ_CLIENT_*)
+ * @range_list:	I/O ranges
+ * @range_lock:	Lock to protect range_list
+ * @ioreqs_map:	The pending I/O requests bitmap.
+ * @handler:	I/O requests handler of this client
+ * @thread:	The thread which executes the handler
+ * @wq:		The wait queue for the handler thread parking
+ * @priv:	Data for the thread
+ */
+struct acrn_ioreq_client {
+	char			name[ACRN_NAME_LEN];
+	struct acrn_vm		*vm;
+	struct list_head	list;
+	bool			is_default;
+	unsigned long		flags;
+	struct list_head	range_list;
+	rwlock_t		range_lock;
+	DECLARE_BITMAP(ioreqs_map, ACRN_IO_REQUEST_MAX);
+	ioreq_handler_t		handler;
+	struct task_struct	*thread;
+	wait_queue_head_t	wq;
+	void			*priv;
+};
+
 #define ACRN_INVALID_VMID (0xffffU)
 
 #define ACRN_VM_FLAG_DESTROYED		0U
+#define ACRN_VM_FLAG_CLEARING_IOREQ	1U
+extern struct list_head acrn_vm_list;
+extern rwlock_t acrn_vm_list_lock;
 /**
  * struct acrn_vm - Properties of ACRN User VM.
  * @list:			Entry within global list of all VMs.
@@ -90,6 +149,11 @@ struct vm_memory_mapping {
  *				&acrn_vm.regions_mapping_count.
  * @regions_mapping:		Memory mappings of this VM.
  * @regions_mapping_count:	Number of memory mapping of this VM.
+ * @ioreq_clients_lock:		Lock to protect ioreq_clients and default_client
+ * @ioreq_clients:		The I/O request clients list of this VM
+ * @default_client:		The default I/O request client
+ * @ioreq_buf:			I/O request shared buffer
+ * @ioreq_page:			The page of the I/O request shared buffer
  */
 struct acrn_vm {
 	struct list_head		list;
@@ -99,6 +163,11 @@ struct acrn_vm {
 	struct mutex			regions_mapping_lock;
 	struct vm_memory_mapping	regions_mapping[ACRN_MEM_MAPPING_MAX];
 	int				regions_mapping_count;
+	spinlock_t			ioreq_clients_lock;
+	struct list_head		ioreq_clients;
+	struct acrn_ioreq_client	*default_client;
+	struct acrn_io_request_buffer	*ioreq_buf;
+	struct page			*ioreq_page;
 };
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
@@ -112,4 +181,17 @@ int acrn_vm_memseg_unmap(struct acrn_vm *vm, struct acrn_vm_memmap *memmap);
 int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap);
 void acrn_vm_all_ram_unmap(struct acrn_vm *vm);
 
+int acrn_ioreq_init(struct acrn_vm *vm, u64 buf_vma);
+void acrn_ioreq_deinit(struct acrn_vm *vm);
+int acrn_ioreq_intr_setup(void);
+void acrn_ioreq_intr_remove(void);
+void acrn_ioreq_request_clear(struct acrn_vm *vm);
+int acrn_ioreq_client_wait(struct acrn_ioreq_client *client);
+int acrn_ioreq_request_default_complete(struct acrn_vm *vm, u16 vcpu);
+struct acrn_ioreq_client *acrn_ioreq_client_create(struct acrn_vm *vm,
+						   ioreq_handler_t handler,
+						   void *data, bool is_default,
+						   const char *name);
+void acrn_ioreq_client_destroy(struct acrn_ioreq_client *client);
+
 #endif /* __ACRN_HSM_DRV_H */
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 2c40d3dc5e94..1cc0c612dc09 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -48,6 +48,7 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_vm *vm = filp->private_data;
 	struct acrn_vm_creation *vm_param;
 	struct acrn_vcpu_regs *cpu_regs;
+	struct acrn_ioreq_notify notify;
 	struct acrn_vm_memmap memmap;
 	int i, ret = 0;
 
@@ -147,6 +148,35 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 
 		ret = acrn_vm_memseg_unmap(vm, &memmap);
 		break;
+	case ACRN_IOCTL_CREATE_IOREQ_CLIENT:
+		if (vm->default_client)
+			return -EEXIST;
+		if (!acrn_ioreq_client_create(vm, NULL, NULL, true, "acrndm"))
+			ret = -EINVAL;
+		break;
+	case ACRN_IOCTL_DESTROY_IOREQ_CLIENT:
+		if (vm->default_client)
+			acrn_ioreq_client_destroy(vm->default_client);
+		break;
+	case ACRN_IOCTL_ATTACH_IOREQ_CLIENT:
+		if (vm->default_client)
+			ret = acrn_ioreq_client_wait(vm->default_client);
+		else
+			ret = -ENODEV;
+		break;
+	case ACRN_IOCTL_NOTIFY_REQUEST_FINISH:
+		if (copy_from_user(&notify, (void __user *)ioctl_param,
+				   sizeof(struct acrn_ioreq_notify)))
+			return -EFAULT;
+
+		if (notify.reserved != 0)
+			return -EINVAL;
+
+		ret = acrn_ioreq_request_default_complete(vm, notify.vcpu);
+		break;
+	case ACRN_IOCTL_CLEAR_VM_IOREQ:
+		acrn_ioreq_request_clear(vm);
+		break;
 	default:
 		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
 		ret = -ENOTTY;
@@ -188,14 +218,23 @@ static int __init hsm_init(void)
 		return -EPERM;
 
 	ret = misc_register(&acrn_dev);
-	if (ret)
+	if (ret) {
 		pr_err("Create misc dev failed!\n");
+		return ret;
+	}
 
-	return ret;
+	ret = acrn_ioreq_intr_setup();
+	if (ret) {
+		pr_err("Setup I/O request handler failed!\n");
+		misc_deregister(&acrn_dev);
+		return ret;
+	}
+	return 0;
 }
 
 static void __exit hsm_exit(void)
 {
+	acrn_ioreq_intr_remove();
 	misc_deregister(&acrn_dev);
 }
 module_init(hsm_init);
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
index a1a70a071713..5eba29e3ed38 100644
--- a/drivers/virt/acrn/hypercall.h
+++ b/drivers/virt/acrn/hypercall.h
@@ -21,6 +21,10 @@
 #define HC_RESET_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x05)
 #define HC_SET_VCPU_REGS		_HC_ID(HC_ID, HC_ID_VM_BASE + 0x06)
 
+#define HC_ID_IOREQ_BASE		0x30UL
+#define HC_SET_IOREQ_BUFFER		_HC_ID(HC_ID, HC_ID_IOREQ_BASE + 0x00)
+#define HC_NOTIFY_REQUEST_FINISH	_HC_ID(HC_ID, HC_ID_IOREQ_BASE + 0x01)
+
 #define HC_ID_MEM_BASE			0x40UL
 #define HC_VM_SET_MEMORY_REGIONS	_HC_ID(HC_ID, HC_ID_MEM_BASE + 0x02)
 
@@ -91,6 +95,30 @@ static inline long hcall_set_vcpu_regs(u64 vmid, u64 regs_state)
 	return acrn_hypercall2(HC_SET_VCPU_REGS, vmid, regs_state);
 }
 
+/**
+ * hcall_set_ioreq_buffer() - Set up the shared buffer for I/O Requests.
+ * @vmid:	User VM ID
+ * @buffer:	Service VM GPA of the shared buffer
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_set_ioreq_buffer(u64 vmid, u64 buffer)
+{
+	return acrn_hypercall2(HC_SET_IOREQ_BUFFER, vmid, buffer);
+}
+
+/**
+ * hcall_notify_req_finish() - Notify ACRN Hypervisor of I/O request completion.
+ * @vmid:	User VM ID
+ * @vcpu:	The vCPU which initiated the I/O request
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_notify_req_finish(u64 vmid, u64 vcpu)
+{
+	return acrn_hypercall2(HC_NOTIFY_REQUEST_FINISH, vmid, vcpu);
+}
+
 /**
  * hcall_set_memory_regions() - Inform the hypervisor to set up EPT mappings
  * @regions_pa:	Service VM GPA of &struct vm_memory_region_batch
diff --git a/drivers/virt/acrn/ioreq.c b/drivers/virt/acrn/ioreq.c
new file mode 100644
index 000000000000..51cb41ef7c72
--- /dev/null
+++ b/drivers/virt/acrn/ioreq.c
@@ -0,0 +1,521 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN_HSM: Handle I/O requests
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * Authors:
+ *	Jason Chen CJ <jason.cj.chen@intel.com>
+ *	Fengwei Yin <fengwei.yin@intel.com>
+ */
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include <asm/acrn.h>
+
+#include "acrn_drv.h"
+
+static void ioreq_pause(void);
+static void ioreq_resume(void);
+
+static void ioreq_dispatcher(struct work_struct *work);
+static struct workqueue_struct *ioreq_wq;
+static DECLARE_WORK(ioreq_work, ioreq_dispatcher);
+
+static inline bool has_pending_request(struct acrn_ioreq_client *client)
+{
+	return !bitmap_empty(client->ioreqs_map, ACRN_IO_REQUEST_MAX);
+}
+
+static inline bool is_destroying(struct acrn_ioreq_client *client)
+{
+	return test_bit(ACRN_IOREQ_CLIENT_DESTROYING, &client->flags);
+}
+
+static int ioreq_complete_request(struct acrn_vm *vm, u16 vcpu,
+				  struct acrn_io_request *acrn_req)
+{
+	bool polling_mode;
+	int ret = 0;
+
+	polling_mode = acrn_req->completion_polling;
+	/* Add barrier() to make sure the writes are done before completion */
+	smp_store_release(&acrn_req->processed, ACRN_IOREQ_STATE_COMPLETE);
+
+	/*
+	 * To fulfill the requirement of real-time in several industry
+	 * scenarios, like automotive, ACRN can run under the partition mode,
+	 * in which User VMs and Service VM are bound to dedicated CPU cores.
+	 * Polling mode of handling the I/O request is introduced to achieve a
+	 * faster I/O request handling. In polling mode, the hypervisor polls
+	 * I/O request's completion. Once an I/O request is marked as
+	 * ACRN_IOREQ_STATE_COMPLETE, hypervisor resumes from the polling point
+	 * to continue the I/O request flow. Thus, the completion notification
+	 * from HSM of I/O request is not needed.  Please note,
+	 * completion_polling needs to be read before the I/O request being
+	 * marked as ACRN_IOREQ_STATE_COMPLETE to avoid racing with the
+	 * hypervisor.
+	 */
+	if (!polling_mode) {
+		ret = hcall_notify_req_finish(vm->vmid, vcpu);
+		if (ret < 0)
+			dev_err(acrn_dev.this_device,
+				"Notify I/O request finished failed!\n");
+	}
+
+	return ret;
+}
+
+static int acrn_ioreq_complete_request(struct acrn_ioreq_client *client,
+				       u16 vcpu,
+				       struct acrn_io_request *acrn_req)
+{
+	int ret;
+
+	if (vcpu >= client->vm->vcpu_num)
+		return -EINVAL;
+
+	clear_bit(vcpu, client->ioreqs_map);
+	if (!acrn_req) {
+		acrn_req = (struct acrn_io_request *)client->vm->ioreq_buf;
+		acrn_req += vcpu;
+	}
+
+	ret = ioreq_complete_request(client->vm, vcpu, acrn_req);
+
+	return ret;
+}
+
+int acrn_ioreq_request_default_complete(struct acrn_vm *vm, u16 vcpu)
+{
+	int ret = 0;
+
+	spin_lock_bh(&vm->ioreq_clients_lock);
+	if (vm->default_client)
+		ret = acrn_ioreq_complete_request(vm->default_client,
+						  vcpu, NULL);
+	spin_unlock_bh(&vm->ioreq_clients_lock);
+
+	return ret;
+}
+
+/*
+ * ioreq_task() is the execution entity of handler thread of an I/O client.
+ * The handler callback of the I/O client is called within the handler thread.
+ */
+static int ioreq_task(void *data)
+{
+	struct acrn_ioreq_client *client = data;
+	struct acrn_io_request *req;
+	unsigned long *ioreqs_map;
+	int vcpu, ret;
+
+	/*
+	 * Lockless access to ioreqs_map is safe, because
+	 * 1) set_bit() and clear_bit() are atomic operations.
+	 * 2) I/O requests arrives serialized. The access flow of ioreqs_map is:
+	 *	set_bit() - in ioreq_work handler
+	 *	Handler callback handles corresponding I/O request
+	 *	clear_bit() - in handler thread (include ACRN userspace)
+	 *	Mark corresponding I/O request completed
+	 *	Loop again if a new I/O request occurs
+	 */
+	ioreqs_map = client->ioreqs_map;
+	while (!kthread_should_stop()) {
+		acrn_ioreq_client_wait(client);
+		while (has_pending_request(client)) {
+			vcpu = find_first_bit(ioreqs_map, client->vm->vcpu_num);
+			req = client->vm->ioreq_buf->req_slot + vcpu;
+			ret = client->handler(client, req);
+			if (ret < 0) {
+				dev_err(acrn_dev.this_device,
+					"IO handle failure: %d\n", ret);
+				break;
+			}
+			acrn_ioreq_complete_request(client, vcpu, req);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * For the non-default I/O clients, give them chance to complete the current
+ * I/O requests if there are any. For the default I/O client, it is safe to
+ * clear all pending I/O requests because the clearing request is from ACRN
+ * userspace.
+ */
+void acrn_ioreq_request_clear(struct acrn_vm *vm)
+{
+	struct acrn_ioreq_client *client;
+	bool has_pending = false;
+	unsigned long vcpu;
+	int retry = 10;
+
+	/*
+	 * IO requests of this VM will be completed directly in
+	 * acrn_ioreq_dispatch if ACRN_VM_FLAG_CLEARING_IOREQ flag is set.
+	 */
+	set_bit(ACRN_VM_FLAG_CLEARING_IOREQ, &vm->flags);
+
+	/*
+	 * acrn_ioreq_request_clear is only called in VM reset case. Simply
+	 * wait 100ms in total for the IO requests' completion.
+	 */
+	do {
+		spin_lock_bh(&vm->ioreq_clients_lock);
+		list_for_each_entry(client, &vm->ioreq_clients, list) {
+			has_pending = has_pending_request(client);
+			if (has_pending)
+				break;
+		}
+		spin_unlock_bh(&vm->ioreq_clients_lock);
+
+		if (has_pending)
+			schedule_timeout_interruptible(HZ / 100);
+	} while (has_pending && --retry > 0);
+	if (retry == 0)
+		dev_warn(acrn_dev.this_device,
+			 "%s cannot flush pending request!\n", client->name);
+
+	/* Clear all ioreqs belonging to the default client */
+	spin_lock_bh(&vm->ioreq_clients_lock);
+	client = vm->default_client;
+	if (client) {
+		vcpu = find_next_bit(client->ioreqs_map,
+				     ACRN_IO_REQUEST_MAX, 0);
+		while (vcpu < ACRN_IO_REQUEST_MAX) {
+			acrn_ioreq_complete_request(client, vcpu, NULL);
+			vcpu = find_next_bit(client->ioreqs_map,
+					     ACRN_IO_REQUEST_MAX, vcpu + 1);
+		}
+	}
+	spin_unlock_bh(&vm->ioreq_clients_lock);
+
+	/* Clear ACRN_VM_FLAG_CLEARING_IOREQ flag after the clearing */
+	clear_bit(ACRN_VM_FLAG_CLEARING_IOREQ, &vm->flags);
+}
+
+int acrn_ioreq_client_wait(struct acrn_ioreq_client *client)
+{
+	if (client->is_default) {
+		/*
+		 * In the default client, a user space thread waits on the
+		 * waitqueue. The is_destroying() check is used to notify user
+		 * space the client is going to be destroyed.
+		 */
+		wait_event_interruptible(client->wq,
+					 has_pending_request(client) ||
+					 is_destroying(client));
+		if (is_destroying(client))
+			return -ENODEV;
+	} else {
+		wait_event_interruptible(client->wq,
+					 has_pending_request(client) ||
+					 kthread_should_stop());
+	}
+
+	return 0;
+}
+
+static bool in_range(struct acrn_ioreq_range *range,
+		     struct acrn_io_request *req)
+{
+	bool ret = false;
+
+	if (range->type == req->type) {
+		switch (req->type) {
+		case ACRN_IOREQ_TYPE_MMIO:
+			if (req->reqs.mmio_request.address >= range->start &&
+			    (req->reqs.mmio_request.address +
+			     req->reqs.mmio_request.size - 1) <= range->end)
+				ret = true;
+			break;
+		case ACRN_IOREQ_TYPE_PORTIO:
+			if (req->reqs.pio_request.address >= range->start &&
+			    (req->reqs.pio_request.address +
+			     req->reqs.pio_request.size - 1) <= range->end)
+				ret = true;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static struct acrn_ioreq_client *find_ioreq_client(struct acrn_vm *vm,
+						   struct acrn_io_request *req)
+{
+	struct acrn_ioreq_client *client, *found = NULL;
+	struct acrn_ioreq_range *range;
+
+	lockdep_assert_held(&vm->ioreq_clients_lock);
+
+	list_for_each_entry(client, &vm->ioreq_clients, list) {
+		read_lock_bh(&client->range_lock);
+		list_for_each_entry(range, &client->range_list, list) {
+			if (in_range(range, req)) {
+				found = client;
+				break;
+			}
+		}
+		read_unlock_bh(&client->range_lock);
+		if (found)
+			break;
+	}
+	return found ? found : vm->default_client;
+}
+
+/**
+ * acrn_ioreq_client_create() - Create an ioreq client
+ * @vm:		The VM that this client belongs to
+ * @handler:	The ioreq_handler of ioreq client acrn_hsm will create a kernel
+ *		thread and call the handler to handle I/O requests.
+ * @priv:	Private data for the handler
+ * @is_default:	If it is the default client
+ * @name:	The name of ioreq client
+ *
+ * Return: acrn_ioreq_client pointer on success, NULL on error
+ */
+struct acrn_ioreq_client *acrn_ioreq_client_create(struct acrn_vm *vm,
+						   ioreq_handler_t handler,
+						   void *priv, bool is_default,
+						   const char *name)
+{
+	struct acrn_ioreq_client *client;
+
+	if (!handler && !is_default) {
+		dev_dbg(acrn_dev.this_device,
+			"Cannot create non-default client w/o handler!\n");
+		return NULL;
+	}
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (!client)
+		return NULL;
+
+	client->handler = handler;
+	client->vm = vm;
+	client->priv = priv;
+	client->is_default = is_default;
+	if (name)
+		strncpy(client->name, name, sizeof(client->name) - 1);
+	rwlock_init(&client->range_lock);
+	INIT_LIST_HEAD(&client->range_list);
+	init_waitqueue_head(&client->wq);
+
+	if (client->handler) {
+		client->thread = kthread_run(ioreq_task, client, "VM%u-%s",
+					     client->vm->vmid, client->name);
+		if (IS_ERR(client->thread)) {
+			kfree(client);
+			return NULL;
+		}
+	}
+
+	spin_lock_bh(&vm->ioreq_clients_lock);
+	if (is_default)
+		vm->default_client = client;
+	else
+		list_add(&client->list, &vm->ioreq_clients);
+	spin_unlock_bh(&vm->ioreq_clients_lock);
+
+	dev_dbg(acrn_dev.this_device, "Created ioreq client %s.\n", name);
+	return client;
+}
+
+/**
+ * acrn_ioreq_client_destroy() - Destroy an ioreq client
+ * @client:	The ioreq client
+ */
+void acrn_ioreq_client_destroy(struct acrn_ioreq_client *client)
+{
+	struct acrn_ioreq_range *range, *next;
+	struct acrn_vm *vm = client->vm;
+
+	dev_dbg(acrn_dev.this_device,
+		"Destroy ioreq client %s.\n", client->name);
+	ioreq_pause();
+	set_bit(ACRN_IOREQ_CLIENT_DESTROYING, &client->flags);
+	if (client->is_default)
+		wake_up_interruptible(&client->wq);
+	else
+		kthread_stop(client->thread);
+
+	spin_lock_bh(&vm->ioreq_clients_lock);
+	if (client->is_default)
+		vm->default_client = NULL;
+	else
+		list_del(&client->list);
+	spin_unlock_bh(&vm->ioreq_clients_lock);
+
+	write_lock_bh(&client->range_lock);
+	list_for_each_entry_safe(range, next, &client->range_list, list) {
+		list_del(&range->list);
+		kfree(range);
+	}
+	write_unlock_bh(&client->range_lock);
+	kfree(client);
+
+	ioreq_resume();
+}
+
+static int acrn_ioreq_dispatch(struct acrn_vm *vm)
+{
+	struct acrn_ioreq_client *client;
+	struct acrn_io_request *req;
+	int i;
+
+	for (i = 0; i < vm->vcpu_num; i++) {
+		req = vm->ioreq_buf->req_slot + i;
+
+		/* barrier the read of processed of acrn_io_request */
+		if (smp_load_acquire(&req->processed) ==
+				     ACRN_IOREQ_STATE_PENDING) {
+			/* Complete the IO request directly in clearing stage */
+			if (test_bit(ACRN_VM_FLAG_CLEARING_IOREQ, &vm->flags)) {
+				ioreq_complete_request(vm, i, req);
+				continue;
+			}
+
+			spin_lock_bh(&vm->ioreq_clients_lock);
+			client = find_ioreq_client(vm, req);
+			if (!client) {
+				dev_err(acrn_dev.this_device,
+					"Failed to find ioreq client!\n");
+				spin_unlock_bh(&vm->ioreq_clients_lock);
+				return -EINVAL;
+			}
+			if (!client->is_default)
+				req->kernel_handled = 1;
+			else
+				req->kernel_handled = 0;
+			/*
+			 * Add barrier() to make sure the writes are done
+			 * before setting ACRN_IOREQ_STATE_PROCESSING
+			 */
+			smp_store_release(&req->processed,
+					  ACRN_IOREQ_STATE_PROCESSING);
+			set_bit(i, client->ioreqs_map);
+			wake_up_interruptible(&client->wq);
+			spin_unlock_bh(&vm->ioreq_clients_lock);
+		}
+	}
+
+	return 0;
+}
+
+static void ioreq_dispatcher(struct work_struct *work)
+{
+	struct acrn_vm *vm;
+
+	read_lock(&acrn_vm_list_lock);
+	list_for_each_entry(vm, &acrn_vm_list, list) {
+		if (!vm->ioreq_buf)
+			break;
+		acrn_ioreq_dispatch(vm);
+	}
+	read_unlock(&acrn_vm_list_lock);
+}
+
+static void ioreq_intr_handler(void)
+{
+	queue_work(ioreq_wq, &ioreq_work);
+}
+
+static void ioreq_pause(void)
+{
+	/* Flush and unarm the handler to ensure no I/O requests pending */
+	acrn_remove_intr_handler();
+	drain_workqueue(ioreq_wq);
+}
+
+static void ioreq_resume(void)
+{
+	/* Schedule after enabling in case other clients miss interrupt */
+	acrn_setup_intr_handler(ioreq_intr_handler);
+	queue_work(ioreq_wq, &ioreq_work);
+}
+
+int acrn_ioreq_intr_setup(void)
+{
+	acrn_setup_intr_handler(ioreq_intr_handler);
+	ioreq_wq = alloc_workqueue("ioreq_wq",
+				   WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!ioreq_wq) {
+		dev_err(acrn_dev.this_device, "Failed to alloc workqueue!\n");
+		acrn_remove_intr_handler();
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void acrn_ioreq_intr_remove(void)
+{
+	if (ioreq_wq)
+		destroy_workqueue(ioreq_wq);
+	acrn_remove_intr_handler();
+}
+
+int acrn_ioreq_init(struct acrn_vm *vm, u64 buf_vma)
+{
+	struct acrn_ioreq_buffer *set_buffer;
+	struct page *page;
+	int ret;
+
+	if (vm->ioreq_buf)
+		return -EEXIST;
+
+	set_buffer = kzalloc(sizeof(*set_buffer), GFP_KERNEL);
+	if (!set_buffer)
+		return -ENOMEM;
+
+	ret = pin_user_pages_fast(buf_vma, 1,
+				  FOLL_WRITE | FOLL_LONGTERM, &page);
+	if (unlikely(ret != 1) || !page) {
+		dev_err(acrn_dev.this_device, "Failed to pin ioreq page!\n");
+		ret = -EFAULT;
+		goto free_buf;
+	}
+
+	vm->ioreq_buf = page_address(page);
+	vm->ioreq_page = page;
+	set_buffer->ioreq_buf = page_to_phys(page);
+	ret = hcall_set_ioreq_buffer(vm->vmid, virt_to_phys(set_buffer));
+	if (ret < 0) {
+		dev_err(acrn_dev.this_device, "Failed to init ioreq buffer!\n");
+		unpin_user_page(page);
+		vm->ioreq_buf = NULL;
+		goto free_buf;
+	}
+
+	dev_dbg(acrn_dev.this_device,
+		"Init ioreq buffer %pK!\n", vm->ioreq_buf);
+	ret = 0;
+free_buf:
+	kfree(set_buffer);
+	return ret;
+}
+
+void acrn_ioreq_deinit(struct acrn_vm *vm)
+{
+	struct acrn_ioreq_client *client, *next;
+
+	dev_dbg(acrn_dev.this_device,
+		"Deinit ioreq buffer %pK!\n", vm->ioreq_buf);
+	/* Destroy all clients belonging to this VM */
+	list_for_each_entry_safe(client, next, &vm->ioreq_clients, list)
+		acrn_ioreq_client_destroy(client);
+	if (vm->default_client)
+		acrn_ioreq_client_destroy(vm->default_client);
+
+	if (vm->ioreq_buf && vm->ioreq_page) {
+		unpin_user_page(vm->ioreq_page);
+		vm->ioreq_buf = NULL;
+	}
+}
diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c
index ff5df7acb551..2bc649c7c1d3 100644
--- a/drivers/virt/acrn/vm.c
+++ b/drivers/virt/acrn/vm.c
@@ -15,9 +15,12 @@
 #include "acrn_drv.h"
 
 /* List of VMs */
-static LIST_HEAD(acrn_vm_list);
-/* To protect acrn_vm_list */
-static DEFINE_MUTEX(acrn_vm_list_lock);
+LIST_HEAD(acrn_vm_list);
+/*
+ * acrn_vm_list is read in a worker thread which dispatch I/O requests and
+ * is wrote in VM creation ioctl. Use the rwlock mechanism to protect it.
+ */
+DEFINE_RWLOCK(acrn_vm_list_lock);
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
 			       struct acrn_vm_creation *vm_param)
@@ -32,12 +35,20 @@ struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
 	}
 
 	mutex_init(&vm->regions_mapping_lock);
+	INIT_LIST_HEAD(&vm->ioreq_clients);
+	spin_lock_init(&vm->ioreq_clients_lock);
 	vm->vmid = vm_param->vmid;
 	vm->vcpu_num = vm_param->vcpu_num;
 
-	mutex_lock(&acrn_vm_list_lock);
+	if (acrn_ioreq_init(vm, vm_param->ioreq_buf) < 0) {
+		hcall_destroy_vm(vm_param->vmid);
+		vm->vmid = ACRN_INVALID_VMID;
+		return NULL;
+	}
+
+	write_lock_bh(&acrn_vm_list_lock);
 	list_add(&vm->list, &acrn_vm_list);
-	mutex_unlock(&acrn_vm_list_lock);
+	write_unlock_bh(&acrn_vm_list_lock);
 
 	dev_dbg(acrn_dev.this_device, "VM %u created.\n", vm->vmid);
 	return vm;
@@ -52,9 +63,11 @@ int acrn_vm_destroy(struct acrn_vm *vm)
 		return 0;
 
 	/* Remove from global VM list */
-	mutex_lock(&acrn_vm_list_lock);
+	write_lock_bh(&acrn_vm_list_lock);
 	list_del_init(&vm->list);
-	mutex_unlock(&acrn_vm_list_lock);
+	write_unlock_bh(&acrn_vm_list_lock);
+
+	acrn_ioreq_deinit(vm);
 
 	ret = hcall_destroy_vm(vm->vmid);
 	if (ret < 0) {
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index ec4c61e7c170..b0f73ab5e4e8 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -14,6 +14,145 @@
 #include <linux/types.h>
 #include <linux/uuid.h>
 
+#define ACRN_IO_REQUEST_MAX		16
+
+#define ACRN_IOREQ_STATE_PENDING	0
+#define ACRN_IOREQ_STATE_COMPLETE	1
+#define ACRN_IOREQ_STATE_PROCESSING	2
+#define ACRN_IOREQ_STATE_FREE		3
+
+#define ACRN_IOREQ_TYPE_PORTIO		0
+#define ACRN_IOREQ_TYPE_MMIO		1
+
+#define ACRN_IOREQ_DIR_READ		0
+#define ACRN_IOREQ_DIR_WRITE		1
+
+/**
+ * struct acrn_mmio_request - Info of a MMIO I/O request
+ * @direction:	Access direction of this request (ACRN_IOREQ_DIR_*)
+ * @reserved:	Reserved for alignment and should be 0
+ * @address:	Access address of this MMIO I/O request
+ * @size:	Access size of this MMIO I/O request
+ * @value:	Read/write value of this MMIO I/O request
+ */
+struct acrn_mmio_request {
+	__u32	direction;
+	__u32	reserved;
+	__u64	address;
+	__u64	size;
+	__u64	value;
+};
+
+/**
+ * struct acrn_pio_request - Info of a PIO I/O request
+ * @direction:	Access direction of this request (ACRN_IOREQ_DIR_*)
+ * @reserved:	Reserved for alignment and should be 0
+ * @address:	Access address of this PIO I/O request
+ * @size:	Access size of this PIO I/O request
+ * @value:	Read/write value of this PIO I/O request
+ */
+struct acrn_pio_request {
+	__u32	direction;
+	__u32	reserved;
+	__u64	address;
+	__u64	size;
+	__u32	value;
+};
+
+/**
+ * struct acrn_io_request - 256-byte ACRN I/O request
+ * @type:		Type of this request (ACRN_IOREQ_TYPE_*).
+ * @completion_polling:	Polling flag. Hypervisor will poll completion of the
+ *			I/O request if this flag set.
+ * @reserved0:		Reserved fields.
+ * @reqs:		Union of different types of request. Byte offset: 64.
+ * @reqs.pio_request:	PIO request data of the I/O request.
+ * @reqs.mmio_request:	MMIO request data of the I/O request.
+ * @reqs.data:		Raw data of the I/O request.
+ * @reserved1:		Reserved fields.
+ * @kernel_handled:	Flag indicates this request need be handled in kernel.
+ * @processed:		The status of this request (ACRN_IOREQ_STATE_*).
+ *
+ * The state transitions of ACRN I/O request:
+ *
+ *    FREE -> PENDING -> PROCESSING -> COMPLETE -> FREE -> ...
+ *
+ * An I/O request in COMPLETE or FREE state is owned by the hypervisor. HSM and
+ * ACRN userspace are in charge of processing the others.
+ *
+ * On basis of the states illustrated above, a typical lifecycle of ACRN IO
+ * request would look like:
+ *
+ * Flow                 (assume the initial state is FREE)
+ * |
+ * |   Service VM vCPU 0     Service VM vCPU x      User vCPU y
+ * |
+ * |                                             hypervisor:
+ * |                                               fills in type, addr, etc.
+ * |                                               pauses the User VM vCPU y
+ * |                                               sets the state to PENDING (a)
+ * |                                               fires an upcall to Service VM
+ * |
+ * | HSM:
+ * |  scans for PENDING requests
+ * |  sets the states to PROCESSING (b)
+ * |  assigns the requests to clients (c)
+ * V
+ * |                     client:
+ * |                       scans for the assigned requests
+ * |                       handles the requests (d)
+ * |                     HSM:
+ * |                       sets states to COMPLETE
+ * |                       notifies the hypervisor
+ * |
+ * |                     hypervisor:
+ * |                       resumes User VM vCPU y (e)
+ * |
+ * |                                             hypervisor:
+ * |                                               post handling (f)
+ * V                                               sets states to FREE
+ *
+ * Note that the procedures (a) to (f) in the illustration above require to be
+ * strictly processed in the order.  One vCPU cannot trigger another request of
+ * I/O emulation before completing the previous one.
+ *
+ * Atomic and barriers are required when HSM and hypervisor accessing the state
+ * of &struct acrn_io_request.
+ *
+ */
+struct acrn_io_request {
+	__u32	type;
+	__u32	completion_polling;
+	__u32	reserved0[14];
+	union {
+		struct acrn_pio_request		pio_request;
+		struct acrn_mmio_request	mmio_request;
+		__u64				data[8];
+	} reqs;
+	__u32	reserved1;
+	__u32	kernel_handled;
+	__u32	processed;
+} __attribute__((aligned(256)));
+
+struct acrn_io_request_buffer {
+	union {
+		struct acrn_io_request	req_slot[ACRN_IO_REQUEST_MAX];
+		__u8			reserved[4096];
+	};
+};
+
+/**
+ * struct acrn_ioreq_notify - The structure of ioreq completion notification
+ * @vmid:	User VM ID
+ * @reserved:	Reserved and should be 0
+ * @vcpu:	vCPU ID
+ */
+struct acrn_ioreq_notify {
+	__u16	vmid;
+	__u16	reserved;
+	__u32	vcpu;
+};
+
 /**
  * struct acrn_vm_creation - Info to create a User VM
  * @vmid:		User VM ID returned from the hypervisor
@@ -218,6 +357,17 @@ struct acrn_vm_memmap {
 #define ACRN_IOCTL_SET_VCPU_REGS	\
 	_IOW(ACRN_IOCTL_TYPE, 0x16, struct acrn_vcpu_regs)
 
+#define ACRN_IOCTL_NOTIFY_REQUEST_FINISH \
+	_IOW(ACRN_IOCTL_TYPE, 0x31, struct acrn_ioreq_notify)
+#define ACRN_IOCTL_CREATE_IOREQ_CLIENT	\
+	_IO(ACRN_IOCTL_TYPE, 0x32)
+#define ACRN_IOCTL_ATTACH_IOREQ_CLIENT	\
+	_IO(ACRN_IOCTL_TYPE, 0x33)
+#define ACRN_IOCTL_DESTROY_IOREQ_CLIENT	\
+	_IO(ACRN_IOCTL_TYPE, 0x34)
+#define ACRN_IOCTL_CLEAR_VM_IOREQ	\
+	_IO(ACRN_IOCTL_TYPE, 0x35)
+
 #define ACRN_IOCTL_SET_MEMSEG		\
 	_IOW(ACRN_IOCTL_TYPE, 0x41, struct acrn_vm_memmap)
 #define ACRN_IOCTL_UNSET_MEMSEG		\
-- 
cgit v1.2.3


From 3c4c331667d4d9f1b5f3fdff9c4db36776da30ae Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:32 +0800
Subject: virt: acrn: Introduce PCI configuration space PIO accesses combiner

A User VM can access its virtual PCI configuration spaces via port IO
approach, which has two following steps:
 1) writes address into port 0xCF8
 2) put/get data in/from port 0xCFC

To distribute a complete PCI configuration space access one time, HSM
need to combine such two accesses together.

Combine two paired PIO I/O requests into one PCI I/O request and
continue the I/O request distribution.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-11-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/acrn_drv.h |  2 ++
 drivers/virt/acrn/ioreq.c    | 76 ++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/acrn.h    | 27 ++++++++++++++++
 3 files changed, 105 insertions(+)

(limited to 'include')

diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index 68bd8db6c8be..6c92a505fa20 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -154,6 +154,7 @@ extern rwlock_t acrn_vm_list_lock;
  * @default_client:		The default I/O request client
  * @ioreq_buf:			I/O request shared buffer
  * @ioreq_page:			The page of the I/O request shared buffer
+ * @pci_conf_addr:		Address of a PCI configuration access emulation
  */
 struct acrn_vm {
 	struct list_head		list;
@@ -168,6 +169,7 @@ struct acrn_vm {
 	struct acrn_ioreq_client	*default_client;
 	struct acrn_io_request_buffer	*ioreq_buf;
 	struct page			*ioreq_page;
+	u32				pci_conf_addr;
 };
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
diff --git a/drivers/virt/acrn/ioreq.c b/drivers/virt/acrn/ioreq.c
index 51cb41ef7c72..d19c05582d38 100644
--- a/drivers/virt/acrn/ioreq.c
+++ b/drivers/virt/acrn/ioreq.c
@@ -222,6 +222,80 @@ int acrn_ioreq_client_wait(struct acrn_ioreq_client *client)
 	return 0;
 }
 
+static bool is_cfg_addr(struct acrn_io_request *req)
+{
+	return ((req->type == ACRN_IOREQ_TYPE_PORTIO) &&
+		(req->reqs.pio_request.address == 0xcf8));
+}
+
+static bool is_cfg_data(struct acrn_io_request *req)
+{
+	return ((req->type == ACRN_IOREQ_TYPE_PORTIO) &&
+		((req->reqs.pio_request.address >= 0xcfc) &&
+		 (req->reqs.pio_request.address < (0xcfc + 4))));
+}
+
+/* The low 8-bit of supported pci_reg addr.*/
+#define PCI_LOWREG_MASK  0xFC
+/* The high 4-bit of supported pci_reg addr */
+#define PCI_HIGHREG_MASK 0xF00
+/* Max number of supported functions */
+#define PCI_FUNCMAX	7
+/* Max number of supported slots */
+#define PCI_SLOTMAX	31
+/* Max number of supported buses */
+#define PCI_BUSMAX	255
+#define CONF1_ENABLE	0x80000000UL
+/*
+ * A PCI configuration space access via PIO 0xCF8 and 0xCFC normally has two
+ * following steps:
+ *   1) writes address into 0xCF8 port
+ *   2) accesses data in/from 0xCFC
+ * This function combines such paired PCI configuration space I/O requests into
+ * one ACRN_IOREQ_TYPE_PCICFG type I/O request and continues the processing.
+ */
+static bool handle_cf8cfc(struct acrn_vm *vm,
+			  struct acrn_io_request *req, u16 vcpu)
+{
+	int offset, pci_cfg_addr, pci_reg;
+	bool is_handled = false;
+
+	if (is_cfg_addr(req)) {
+		WARN_ON(req->reqs.pio_request.size != 4);
+		if (req->reqs.pio_request.direction == ACRN_IOREQ_DIR_WRITE)
+			vm->pci_conf_addr = req->reqs.pio_request.value;
+		else
+			req->reqs.pio_request.value = vm->pci_conf_addr;
+		is_handled = true;
+	} else if (is_cfg_data(req)) {
+		if (!(vm->pci_conf_addr & CONF1_ENABLE)) {
+			if (req->reqs.pio_request.direction ==
+					ACRN_IOREQ_DIR_READ)
+				req->reqs.pio_request.value = 0xffffffff;
+			is_handled = true;
+		} else {
+			offset = req->reqs.pio_request.address - 0xcfc;
+
+			req->type = ACRN_IOREQ_TYPE_PCICFG;
+			pci_cfg_addr = vm->pci_conf_addr;
+			req->reqs.pci_request.bus =
+					(pci_cfg_addr >> 16) & PCI_BUSMAX;
+			req->reqs.pci_request.dev =
+					(pci_cfg_addr >> 11) & PCI_SLOTMAX;
+			req->reqs.pci_request.func =
+					(pci_cfg_addr >> 8) & PCI_FUNCMAX;
+			pci_reg = (pci_cfg_addr & PCI_LOWREG_MASK) +
+				   ((pci_cfg_addr >> 16) & PCI_HIGHREG_MASK);
+			req->reqs.pci_request.reg = pci_reg + offset;
+		}
+	}
+
+	if (is_handled)
+		ioreq_complete_request(vm, vcpu, req);
+
+	return is_handled;
+}
+
 static bool in_range(struct acrn_ioreq_range *range,
 		     struct acrn_io_request *req)
 {
@@ -382,6 +456,8 @@ static int acrn_ioreq_dispatch(struct acrn_vm *vm)
 				ioreq_complete_request(vm, i, req);
 				continue;
 			}
+			if (handle_cf8cfc(vm, req, i))
+				continue;
 
 			spin_lock_bh(&vm->ioreq_clients_lock);
 			client = find_ioreq_client(vm, req);
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index b0f73ab5e4e8..da40f7ad13d9 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -23,6 +23,7 @@
 
 #define ACRN_IOREQ_TYPE_PORTIO		0
 #define ACRN_IOREQ_TYPE_MMIO		1
+#define ACRN_IOREQ_TYPE_PCICFG		2
 
 #define ACRN_IOREQ_DIR_READ		0
 #define ACRN_IOREQ_DIR_WRITE		1
@@ -59,6 +60,30 @@ struct acrn_pio_request {
 	__u32	value;
 };
 
+/**
+ * struct acrn_pci_request - Info of a PCI I/O request
+ * @direction:	Access direction of this request (ACRN_IOREQ_DIR_*)
+ * @reserved:	Reserved for alignment and should be 0
+ * @size:	Access size of this PCI I/O request
+ * @value:	Read/write value of this PIO I/O request
+ * @bus:	PCI bus value of this PCI I/O request
+ * @dev:	PCI device value of this PCI I/O request
+ * @func:	PCI function value of this PCI I/O request
+ * @reg:	PCI config space offset of this PCI I/O request
+ *
+ * Need keep same header layout with &struct acrn_pio_request.
+ */
+struct acrn_pci_request {
+	__u32	direction;
+	__u32	reserved[3];
+	__u64	size;
+	__u32	value;
+	__u32	bus;
+	__u32	dev;
+	__u32	func;
+	__u32	reg;
+};
+
 /**
  * struct acrn_io_request - 256-byte ACRN I/O request
  * @type:		Type of this request (ACRN_IOREQ_TYPE_*).
@@ -67,6 +92,7 @@ struct acrn_pio_request {
  * @reserved0:		Reserved fields.
  * @reqs:		Union of different types of request. Byte offset: 64.
  * @reqs.pio_request:	PIO request data of the I/O request.
+ * @reqs.pci_request:	PCI configuration space request data of the I/O request.
  * @reqs.mmio_request:	MMIO request data of the I/O request.
  * @reqs.data:		Raw data of the I/O request.
  * @reserved1:		Reserved fields.
@@ -126,6 +152,7 @@ struct acrn_io_request {
 	__u32	reserved0[14];
 	union {
 		struct acrn_pio_request		pio_request;
+		struct acrn_pci_request		pci_request;
 		struct acrn_mmio_request	mmio_request;
 		__u64				data[8];
 	} reqs;
-- 
cgit v1.2.3


From ce011e1363a1fe43de0ca05abc394022ee4fefeb Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:33 +0800
Subject: virt: acrn: Introduce interfaces for PCI device passthrough

PCI device passthrough enables an OS in a virtual machine to directly
access a PCI device in the host. It promises almost the native
performance, which is required in performance-critical scenarios of
ACRN.

HSM provides the following ioctls:
 - Assign - ACRN_IOCTL_ASSIGN_PCIDEV
   Pass data struct acrn_pcidev from userspace to the hypervisor, and
   inform the hypervisor to assign a PCI device to a User VM.

 - De-assign - ACRN_IOCTL_DEASSIGN_PCIDEV
   Pass data struct acrn_pcidev from userspace to the hypervisor, and
   inform the hypervisor to de-assign a PCI device from a User VM.

 - Set a interrupt of a passthrough device - ACRN_IOCTL_SET_PTDEV_INTR
   Pass data struct acrn_ptdev_irq from userspace to the hypervisor,
   and inform the hypervisor to map a INTx interrupt of passthrough
   device of User VM.

 - Reset passthrough device interrupt - ACRN_IOCTL_RESET_PTDEV_INTR
   Pass data struct acrn_ptdev_irq from userspace to the hypervisor,
   and inform the hypervisor to unmap a INTx interrupt of passthrough
   device of User VM.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-12-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/hsm.c       | 50 +++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/hypercall.h | 54 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/acrn.h     | 61 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 165 insertions(+)

(limited to 'include')

diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 1cc0c612dc09..94d70b1c1e5c 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -49,7 +49,9 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_vm_creation *vm_param;
 	struct acrn_vcpu_regs *cpu_regs;
 	struct acrn_ioreq_notify notify;
+	struct acrn_ptdev_irq *irq_info;
 	struct acrn_vm_memmap memmap;
+	struct acrn_pcidev *pcidev;
 	int i, ret = 0;
 
 	if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) {
@@ -148,6 +150,54 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 
 		ret = acrn_vm_memseg_unmap(vm, &memmap);
 		break;
+	case ACRN_IOCTL_ASSIGN_PCIDEV:
+		pcidev = memdup_user((void __user *)ioctl_param,
+				     sizeof(struct acrn_pcidev));
+		if (IS_ERR(pcidev))
+			return PTR_ERR(pcidev);
+
+		ret = hcall_assign_pcidev(vm->vmid, virt_to_phys(pcidev));
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to assign pci device!\n");
+		kfree(pcidev);
+		break;
+	case ACRN_IOCTL_DEASSIGN_PCIDEV:
+		pcidev = memdup_user((void __user *)ioctl_param,
+				     sizeof(struct acrn_pcidev));
+		if (IS_ERR(pcidev))
+			return PTR_ERR(pcidev);
+
+		ret = hcall_deassign_pcidev(vm->vmid, virt_to_phys(pcidev));
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to deassign pci device!\n");
+		kfree(pcidev);
+		break;
+	case ACRN_IOCTL_SET_PTDEV_INTR:
+		irq_info = memdup_user((void __user *)ioctl_param,
+				       sizeof(struct acrn_ptdev_irq));
+		if (IS_ERR(irq_info))
+			return PTR_ERR(irq_info);
+
+		ret = hcall_set_ptdev_intr(vm->vmid, virt_to_phys(irq_info));
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to configure intr for ptdev!\n");
+		kfree(irq_info);
+		break;
+	case ACRN_IOCTL_RESET_PTDEV_INTR:
+		irq_info = memdup_user((void __user *)ioctl_param,
+				       sizeof(struct acrn_ptdev_irq));
+		if (IS_ERR(irq_info))
+			return PTR_ERR(irq_info);
+
+		ret = hcall_reset_ptdev_intr(vm->vmid, virt_to_phys(irq_info));
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to reset intr for ptdev!\n");
+		kfree(irq_info);
+		break;
 	case ACRN_IOCTL_CREATE_IOREQ_CLIENT:
 		if (vm->default_client)
 			return -EEXIST;
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
index 5eba29e3ed38..f448301832cf 100644
--- a/drivers/virt/acrn/hypercall.h
+++ b/drivers/virt/acrn/hypercall.h
@@ -28,6 +28,12 @@
 #define HC_ID_MEM_BASE			0x40UL
 #define HC_VM_SET_MEMORY_REGIONS	_HC_ID(HC_ID, HC_ID_MEM_BASE + 0x02)
 
+#define HC_ID_PCI_BASE			0x50UL
+#define HC_SET_PTDEV_INTR		_HC_ID(HC_ID, HC_ID_PCI_BASE + 0x03)
+#define HC_RESET_PTDEV_INTR		_HC_ID(HC_ID, HC_ID_PCI_BASE + 0x04)
+#define HC_ASSIGN_PCIDEV		_HC_ID(HC_ID, HC_ID_PCI_BASE + 0x05)
+#define HC_DEASSIGN_PCIDEV		_HC_ID(HC_ID, HC_ID_PCI_BASE + 0x06)
+
 /**
  * hcall_create_vm() - Create a User VM
  * @vminfo:	Service VM GPA of info of User VM creation
@@ -130,4 +136,52 @@ static inline long hcall_set_memory_regions(u64 regions_pa)
 	return acrn_hypercall1(HC_VM_SET_MEMORY_REGIONS, regions_pa);
 }
 
+/**
+ * hcall_assign_pcidev() - Assign a PCI device to a User VM
+ * @vmid:	User VM ID
+ * @addr:	Service VM GPA of the &struct acrn_pcidev
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_assign_pcidev(u64 vmid, u64 addr)
+{
+	return acrn_hypercall2(HC_ASSIGN_PCIDEV, vmid, addr);
+}
+
+/**
+ * hcall_deassign_pcidev() - De-assign a PCI device from a User VM
+ * @vmid:	User VM ID
+ * @addr:	Service VM GPA of the &struct acrn_pcidev
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_deassign_pcidev(u64 vmid, u64 addr)
+{
+	return acrn_hypercall2(HC_DEASSIGN_PCIDEV, vmid, addr);
+}
+
+/**
+ * hcall_set_ptdev_intr() - Configure an interrupt for an assigned PCI device.
+ * @vmid:	User VM ID
+ * @irq:	Service VM GPA of the &struct acrn_ptdev_irq
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_set_ptdev_intr(u64 vmid, u64 irq)
+{
+	return acrn_hypercall2(HC_SET_PTDEV_INTR, vmid, irq);
+}
+
+/**
+ * hcall_reset_ptdev_intr() - Reset an interrupt for an assigned PCI device.
+ * @vmid:	User VM ID
+ * @irq:	Service VM GPA of the &struct acrn_ptdev_irq
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_reset_ptdev_intr(u64 vmid, u64 irq)
+{
+	return acrn_hypercall2(HC_RESET_PTDEV_INTR, vmid, irq);
+}
+
 #endif /* __ACRN_HSM_HYPERCALL_H */
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index da40f7ad13d9..b25ca8c33b92 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -365,6 +365,58 @@ struct acrn_vm_memmap {
 	__u64	len;
 };
 
+/* Type of interrupt of a passthrough device */
+#define ACRN_PTDEV_IRQ_INTX	0
+#define ACRN_PTDEV_IRQ_MSI	1
+#define ACRN_PTDEV_IRQ_MSIX	2
+/**
+ * struct acrn_ptdev_irq - Interrupt data of a passthrough device.
+ * @type:		Type (ACRN_PTDEV_IRQ_*)
+ * @virt_bdf:		Virtual Bus/Device/Function
+ * @phys_bdf:		Physical Bus/Device/Function
+ * @intx:		Info of interrupt
+ * @intx.virt_pin:	Virtual IOAPIC pin
+ * @intx.phys_pin:	Physical IOAPIC pin
+ * @intx.is_pic_pin:	Is PIC pin or not
+ *
+ * This structure will be passed to hypervisor directly.
+ */
+struct acrn_ptdev_irq {
+	__u32	type;
+	__u16	virt_bdf;
+	__u16	phys_bdf;
+
+	struct {
+		__u32	virt_pin;
+		__u32	phys_pin;
+		__u32	is_pic_pin;
+	} intx;
+};
+
+/* Type of PCI device assignment */
+#define ACRN_PTDEV_QUIRK_ASSIGN	(1U << 0)
+
+#define ACRN_PCI_NUM_BARS	6
+/**
+ * struct acrn_pcidev - Info for assigning or de-assigning a PCI device
+ * @type:	Type of the assignment
+ * @virt_bdf:	Virtual Bus/Device/Function
+ * @phys_bdf:	Physical Bus/Device/Function
+ * @intr_line:	PCI interrupt line
+ * @intr_pin:	PCI interrupt pin
+ * @bar:	PCI BARs.
+ *
+ * This structure will be passed to hypervisor directly.
+ */
+struct acrn_pcidev {
+	__u32	type;
+	__u16	virt_bdf;
+	__u16	phys_bdf;
+	__u8	intr_line;
+	__u8	intr_pin;
+	__u32	bar[ACRN_PCI_NUM_BARS];
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -400,4 +452,13 @@ struct acrn_vm_memmap {
 #define ACRN_IOCTL_UNSET_MEMSEG		\
 	_IOW(ACRN_IOCTL_TYPE, 0x42, struct acrn_vm_memmap)
 
+#define ACRN_IOCTL_SET_PTDEV_INTR	\
+	_IOW(ACRN_IOCTL_TYPE, 0x53, struct acrn_ptdev_irq)
+#define ACRN_IOCTL_RESET_PTDEV_INTR	\
+	_IOW(ACRN_IOCTL_TYPE, 0x54, struct acrn_ptdev_irq)
+#define ACRN_IOCTL_ASSIGN_PCIDEV	\
+	_IOW(ACRN_IOCTL_TYPE, 0x55, struct acrn_pcidev)
+#define ACRN_IOCTL_DEASSIGN_PCIDEV	\
+	_IOW(ACRN_IOCTL_TYPE, 0x56, struct acrn_pcidev)
+
 #endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3


From c7cf8d27244f2ccdde30c79eb6314c943bbeac28 Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:34 +0800
Subject: virt: acrn: Introduce interrupt injection interfaces

ACRN userspace need to inject virtual interrupts into a User VM in
devices emulation.

HSM needs provide interfaces to do so.

Introduce following interrupt injection interfaces:

ioctl ACRN_IOCTL_SET_IRQLINE:
  Pass data from userspace to the hypervisor, and inform the hypervisor
  to inject a virtual IOAPIC GSI interrupt to a User VM.

ioctl ACRN_IOCTL_INJECT_MSI:
  Pass data struct acrn_msi_entry from userspace to the hypervisor, and
  inform the hypervisor to inject a virtual MSI to a User VM.

ioctl ACRN_IOCTL_VM_INTR_MONITOR:
  Set a 4-Kbyte aligned shared page for statistics information of
  interrupts of a User VM.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-13-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/acrn_drv.h  |  4 ++++
 drivers/virt/acrn/hsm.c       | 40 ++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/hypercall.h | 41 +++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/vm.c        | 36 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/acrn.h     | 17 +++++++++++++++++
 5 files changed, 138 insertions(+)

(limited to 'include')

diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index 6c92a505fa20..068f0be769f6 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -155,6 +155,7 @@ extern rwlock_t acrn_vm_list_lock;
  * @ioreq_buf:			I/O request shared buffer
  * @ioreq_page:			The page of the I/O request shared buffer
  * @pci_conf_addr:		Address of a PCI configuration access emulation
+ * @monitor_page:		Page of interrupt statistics of User VM
  */
 struct acrn_vm {
 	struct list_head		list;
@@ -170,6 +171,7 @@ struct acrn_vm {
 	struct acrn_io_request_buffer	*ioreq_buf;
 	struct page			*ioreq_page;
 	u32				pci_conf_addr;
+	struct page			*monitor_page;
 };
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
@@ -196,4 +198,6 @@ struct acrn_ioreq_client *acrn_ioreq_client_create(struct acrn_vm *vm,
 						   const char *name);
 void acrn_ioreq_client_destroy(struct acrn_ioreq_client *client);
 
+int acrn_msi_inject(struct acrn_vm *vm, u64 msi_addr, u64 msi_data);
+
 #endif /* __ACRN_HSM_DRV_H */
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 94d70b1c1e5c..419271f32be8 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -51,7 +51,9 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_ioreq_notify notify;
 	struct acrn_ptdev_irq *irq_info;
 	struct acrn_vm_memmap memmap;
+	struct acrn_msi_entry *msi;
 	struct acrn_pcidev *pcidev;
+	struct page *page;
 	int i, ret = 0;
 
 	if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) {
@@ -198,6 +200,44 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 				"Failed to reset intr for ptdev!\n");
 		kfree(irq_info);
 		break;
+	case ACRN_IOCTL_SET_IRQLINE:
+		ret = hcall_set_irqline(vm->vmid, ioctl_param);
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to set interrupt line!\n");
+		break;
+	case ACRN_IOCTL_INJECT_MSI:
+		msi = memdup_user((void __user *)ioctl_param,
+				  sizeof(struct acrn_msi_entry));
+		if (IS_ERR(msi))
+			return PTR_ERR(msi);
+
+		ret = hcall_inject_msi(vm->vmid, virt_to_phys(msi));
+		if (ret < 0)
+			dev_dbg(acrn_dev.this_device,
+				"Failed to inject MSI!\n");
+		kfree(msi);
+		break;
+	case ACRN_IOCTL_VM_INTR_MONITOR:
+		ret = pin_user_pages_fast(ioctl_param, 1,
+					  FOLL_WRITE | FOLL_LONGTERM, &page);
+		if (unlikely(ret != 1)) {
+			dev_dbg(acrn_dev.this_device,
+				"Failed to pin intr hdr buffer!\n");
+			return -EFAULT;
+		}
+
+		ret = hcall_vm_intr_monitor(vm->vmid, page_to_phys(page));
+		if (ret < 0) {
+			unpin_user_page(page);
+			dev_dbg(acrn_dev.this_device,
+				"Failed to monitor intr data!\n");
+			return ret;
+		}
+		if (vm->monitor_page)
+			unpin_user_page(vm->monitor_page);
+		vm->monitor_page = page;
+		break;
 	case ACRN_IOCTL_CREATE_IOREQ_CLIENT:
 		if (vm->default_client)
 			return -EEXIST;
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
index f448301832cf..a8813397a3fe 100644
--- a/drivers/virt/acrn/hypercall.h
+++ b/drivers/virt/acrn/hypercall.h
@@ -21,6 +21,11 @@
 #define HC_RESET_VM			_HC_ID(HC_ID, HC_ID_VM_BASE + 0x05)
 #define HC_SET_VCPU_REGS		_HC_ID(HC_ID, HC_ID_VM_BASE + 0x06)
 
+#define HC_ID_IRQ_BASE			0x20UL
+#define HC_INJECT_MSI			_HC_ID(HC_ID, HC_ID_IRQ_BASE + 0x03)
+#define HC_VM_INTR_MONITOR		_HC_ID(HC_ID, HC_ID_IRQ_BASE + 0x04)
+#define HC_SET_IRQLINE			_HC_ID(HC_ID, HC_ID_IRQ_BASE + 0x05)
+
 #define HC_ID_IOREQ_BASE		0x30UL
 #define HC_SET_IOREQ_BUFFER		_HC_ID(HC_ID, HC_ID_IOREQ_BASE + 0x00)
 #define HC_NOTIFY_REQUEST_FINISH	_HC_ID(HC_ID, HC_ID_IOREQ_BASE + 0x01)
@@ -101,6 +106,42 @@ static inline long hcall_set_vcpu_regs(u64 vmid, u64 regs_state)
 	return acrn_hypercall2(HC_SET_VCPU_REGS, vmid, regs_state);
 }
 
+/**
+ * hcall_inject_msi() - Deliver a MSI interrupt to a User VM
+ * @vmid:	User VM ID
+ * @msi:	Service VM GPA of MSI message
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_inject_msi(u64 vmid, u64 msi)
+{
+	return acrn_hypercall2(HC_INJECT_MSI, vmid, msi);
+}
+
+/**
+ * hcall_vm_intr_monitor() - Set a shared page for User VM interrupt statistics
+ * @vmid:	User VM ID
+ * @addr:	Service VM GPA of the shared page
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_vm_intr_monitor(u64 vmid, u64 addr)
+{
+	return acrn_hypercall2(HC_VM_INTR_MONITOR, vmid, addr);
+}
+
+/**
+ * hcall_set_irqline() - Set or clear an interrupt line
+ * @vmid:	User VM ID
+ * @op:		Service VM GPA of interrupt line operations
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static inline long hcall_set_irqline(u64 vmid, u64 op)
+{
+	return acrn_hypercall2(HC_SET_IRQLINE, vmid, op);
+}
+
 /**
  * hcall_set_ioreq_buffer() - Set up the shared buffer for I/O Requests.
  * @vmid:	User VM ID
diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c
index 2bc649c7c1d3..db597f27690a 100644
--- a/drivers/virt/acrn/vm.c
+++ b/drivers/virt/acrn/vm.c
@@ -68,6 +68,10 @@ int acrn_vm_destroy(struct acrn_vm *vm)
 	write_unlock_bh(&acrn_vm_list_lock);
 
 	acrn_ioreq_deinit(vm);
+	if (vm->monitor_page) {
+		put_page(vm->monitor_page);
+		vm->monitor_page = NULL;
+	}
 
 	ret = hcall_destroy_vm(vm->vmid);
 	if (ret < 0) {
@@ -83,3 +87,35 @@ int acrn_vm_destroy(struct acrn_vm *vm)
 	vm->vmid = ACRN_INVALID_VMID;
 	return 0;
 }
+
+/**
+ * acrn_inject_msi() - Inject a MSI interrupt into a User VM
+ * @vm:		User VM
+ * @msi_addr:	The MSI address
+ * @msi_data:	The MSI data
+ *
+ * Return: 0 on success, <0 on error
+ */
+int acrn_msi_inject(struct acrn_vm *vm, u64 msi_addr, u64 msi_data)
+{
+	struct acrn_msi_entry *msi;
+	int ret;
+
+	/* might be used in interrupt context, so use GFP_ATOMIC */
+	msi = kzalloc(sizeof(*msi), GFP_ATOMIC);
+	if (!msi)
+		return -ENOMEM;
+
+	/*
+	 * msi_addr: addr[19:12] with dest vcpu id
+	 * msi_data: data[7:0] with vector
+	 */
+	msi->msi_addr = msi_addr;
+	msi->msi_data = msi_data;
+	ret = hcall_inject_msi(vm->vmid, virt_to_phys(msi));
+	if (ret < 0)
+		dev_err(acrn_dev.this_device,
+			"Failed to inject MSI to VM %u!\n", vm->vmid);
+	kfree(msi);
+	return ret;
+}
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index b25ca8c33b92..b1c06b28ebdc 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -417,6 +417,16 @@ struct acrn_pcidev {
 	__u32	bar[ACRN_PCI_NUM_BARS];
 };
 
+/**
+ * struct acrn_msi_entry - Info for injecting a MSI interrupt to a VM
+ * @msi_addr:	MSI addr[19:12] with dest vCPU ID
+ * @msi_data:	MSI data[7:0] with vector
+ */
+struct acrn_msi_entry {
+	__u64	msi_addr;
+	__u64	msi_data;
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -436,6 +446,13 @@ struct acrn_pcidev {
 #define ACRN_IOCTL_SET_VCPU_REGS	\
 	_IOW(ACRN_IOCTL_TYPE, 0x16, struct acrn_vcpu_regs)
 
+#define ACRN_IOCTL_INJECT_MSI		\
+	_IOW(ACRN_IOCTL_TYPE, 0x23, struct acrn_msi_entry)
+#define ACRN_IOCTL_VM_INTR_MONITOR	\
+	_IOW(ACRN_IOCTL_TYPE, 0x24, unsigned long)
+#define ACRN_IOCTL_SET_IRQLINE		\
+	_IOW(ACRN_IOCTL_TYPE, 0x25, __u64)
+
 #define ACRN_IOCTL_NOTIFY_REQUEST_FINISH \
 	_IOW(ACRN_IOCTL_TYPE, 0x31, struct acrn_ioreq_notify)
 #define ACRN_IOCTL_CREATE_IOREQ_CLIENT	\
-- 
cgit v1.2.3


From 3d679d5aec648f50e645702929890b9611998a0b Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:35 +0800
Subject: virt: acrn: Introduce interfaces to query C-states and P-states
 allowed by hypervisor

The C-states and P-states data are used to support CPU power management.
The hypervisor controls C-states and P-states for a User VM.

ACRN userspace need to query the data from the hypervisor to build ACPI
tables for a User VM.

HSM provides ioctls for ACRN userspace to query C-states and P-states
data obtained from the hypervisor.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-14-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/hsm.c       | 69 +++++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/hypercall.h | 12 ++++++++
 include/uapi/linux/acrn.h     | 55 ++++++++++++++++++++++++++++++++++
 3 files changed, 136 insertions(+)

(limited to 'include')

diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 419271f32be8..b7f2deddc3e7 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -38,6 +38,67 @@ static int acrn_dev_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int pmcmd_ioctl(u64 cmd, void __user *uptr)
+{
+	struct acrn_pstate_data *px_data;
+	struct acrn_cstate_data *cx_data;
+	u64 *pm_info;
+	int ret = 0;
+
+	switch (cmd & PMCMD_TYPE_MASK) {
+	case ACRN_PMCMD_GET_PX_CNT:
+	case ACRN_PMCMD_GET_CX_CNT:
+		pm_info = kmalloc(sizeof(u64), GFP_KERNEL);
+		if (!pm_info)
+			return -ENOMEM;
+
+		ret = hcall_get_cpu_state(cmd, virt_to_phys(pm_info));
+		if (ret < 0) {
+			kfree(pm_info);
+			break;
+		}
+
+		if (copy_to_user(uptr, pm_info, sizeof(u64)))
+			ret = -EFAULT;
+		kfree(pm_info);
+		break;
+	case ACRN_PMCMD_GET_PX_DATA:
+		px_data = kmalloc(sizeof(*px_data), GFP_KERNEL);
+		if (!px_data)
+			return -ENOMEM;
+
+		ret = hcall_get_cpu_state(cmd, virt_to_phys(px_data));
+		if (ret < 0) {
+			kfree(px_data);
+			break;
+		}
+
+		if (copy_to_user(uptr, px_data, sizeof(*px_data)))
+			ret = -EFAULT;
+		kfree(px_data);
+		break;
+	case ACRN_PMCMD_GET_CX_DATA:
+		cx_data = kmalloc(sizeof(*cx_data), GFP_KERNEL);
+		if (!cx_data)
+			return -ENOMEM;
+
+		ret = hcall_get_cpu_state(cmd, virt_to_phys(cx_data));
+		if (ret < 0) {
+			kfree(cx_data);
+			break;
+		}
+
+		if (copy_to_user(uptr, cx_data, sizeof(*cx_data)))
+			ret = -EFAULT;
+		kfree(cx_data);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
 /*
  * HSM relies on hypercall layer of the ACRN hypervisor to do the
  * sanity check against the input parameters.
@@ -54,6 +115,7 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_msi_entry *msi;
 	struct acrn_pcidev *pcidev;
 	struct page *page;
+	u64 cstate_cmd;
 	int i, ret = 0;
 
 	if (vm->vmid == ACRN_INVALID_VMID && cmd != ACRN_IOCTL_CREATE_VM) {
@@ -267,6 +329,13 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	case ACRN_IOCTL_CLEAR_VM_IOREQ:
 		acrn_ioreq_request_clear(vm);
 		break;
+	case ACRN_IOCTL_PM_GET_CPU_STATE:
+		if (copy_from_user(&cstate_cmd, (void *)ioctl_param,
+				   sizeof(cstate_cmd)))
+			return -EFAULT;
+
+		ret = pmcmd_ioctl(cstate_cmd, (void __user *)ioctl_param);
+		break;
 	default:
 		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
 		ret = -ENOTTY;
diff --git a/drivers/virt/acrn/hypercall.h b/drivers/virt/acrn/hypercall.h
index a8813397a3fe..e640632366f0 100644
--- a/drivers/virt/acrn/hypercall.h
+++ b/drivers/virt/acrn/hypercall.h
@@ -39,6 +39,9 @@
 #define HC_ASSIGN_PCIDEV		_HC_ID(HC_ID, HC_ID_PCI_BASE + 0x05)
 #define HC_DEASSIGN_PCIDEV		_HC_ID(HC_ID, HC_ID_PCI_BASE + 0x06)
 
+#define HC_ID_PM_BASE			0x80UL
+#define HC_PM_GET_CPU_STATE		_HC_ID(HC_ID, HC_ID_PM_BASE + 0x00)
+
 /**
  * hcall_create_vm() - Create a User VM
  * @vminfo:	Service VM GPA of info of User VM creation
@@ -225,4 +228,13 @@ static inline long hcall_reset_ptdev_intr(u64 vmid, u64 irq)
 	return acrn_hypercall2(HC_RESET_PTDEV_INTR, vmid, irq);
 }
 
+/*
+ * hcall_get_cpu_state() - Get P-states and C-states info from the hypervisor
+ * @state:	Service VM GPA of buffer of P-states and C-states
+ */
+static inline long hcall_get_cpu_state(u64 cmd, u64 state)
+{
+	return acrn_hypercall2(HC_PM_GET_CPU_STATE, cmd, state);
+}
+
 #endif /* __ACRN_HSM_HYPERCALL_H */
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index b1c06b28ebdc..e997d80a0cc7 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -427,6 +427,58 @@ struct acrn_msi_entry {
 	__u64	msi_data;
 };
 
+struct acrn_acpi_generic_address {
+	__u8	space_id;
+	__u8	bit_width;
+	__u8	bit_offset;
+	__u8	access_size;
+	__u64	address;
+} __attribute__ ((__packed__));
+
+/**
+ * struct acrn_cstate_data - A C state package defined in ACPI
+ * @cx_reg:	Register of the C state object
+ * @type:	Type of the C state object
+ * @latency:	The worst-case latency to enter and exit this C state
+ * @power:	The average power consumption when in this C state
+ */
+struct acrn_cstate_data {
+	struct acrn_acpi_generic_address	cx_reg;
+	__u8					type;
+	__u32					latency;
+	__u64					power;
+};
+
+/**
+ * struct acrn_pstate_data - A P state package defined in ACPI
+ * @core_frequency:	CPU frequency (in MHz).
+ * @power:		Power dissipation (in milliwatts).
+ * @transition_latency:	The worst-case latency in microseconds that CPU is
+ * 			unavailable during a transition from any P state to
+ * 			this P state.
+ * @bus_master_latency:	The worst-case latency in microseconds that Bus Masters
+ * 			are prevented from accessing memory during a transition
+ * 			from any P state to this P state.
+ * @control:		The value to be written to Performance Control Register
+ * @status:		Transition status.
+ */
+struct acrn_pstate_data {
+	__u64	core_frequency;
+	__u64	power;
+	__u64	transition_latency;
+	__u64	bus_master_latency;
+	__u64	control;
+	__u64	status;
+};
+
+#define PMCMD_TYPE_MASK		0x000000ff
+enum acrn_pm_cmd_type {
+	ACRN_PMCMD_GET_PX_CNT,
+	ACRN_PMCMD_GET_PX_DATA,
+	ACRN_PMCMD_GET_CX_CNT,
+	ACRN_PMCMD_GET_CX_DATA,
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -478,4 +530,7 @@ struct acrn_msi_entry {
 #define ACRN_IOCTL_DEASSIGN_PCIDEV	\
 	_IOW(ACRN_IOCTL_TYPE, 0x56, struct acrn_pcidev)
 
+#define ACRN_IOCTL_PM_GET_CPU_STATE	\
+	_IOWR(ACRN_IOCTL_TYPE, 0x60, __u64)
+
 #endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3


From d8ad515156b66e7e79a6e4c814f997ee54eb47c7 Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:37 +0800
Subject: virt: acrn: Introduce ioeventfd

ioeventfd is a mechanism to register PIO/MMIO regions to trigger an
eventfd signal when written to by a User VM. ACRN userspace can register
any arbitrary I/O address with a corresponding eventfd and then pass the
eventfd to a specific end-point of interest for handling.

Vhost is a kernel-level virtio server which uses eventfd for signalling.
To support vhost on ACRN, ioeventfd is introduced in HSM.

A new I/O client dedicated to ioeventfd is associated with a User VM
during VM creation. HSM provides ioctls to associate an I/O region with
a eventfd. The I/O client signals a eventfd once its corresponding I/O
region is matched with an I/O request.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-16-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/Kconfig     |   1 +
 drivers/virt/acrn/Makefile    |   2 +-
 drivers/virt/acrn/acrn_drv.h  |  10 ++
 drivers/virt/acrn/hsm.c       |  11 ++
 drivers/virt/acrn/ioeventfd.c | 273 ++++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/vm.c        |   2 +
 include/uapi/linux/acrn.h     |  29 +++++
 7 files changed, 327 insertions(+), 1 deletion(-)
 create mode 100644 drivers/virt/acrn/ioeventfd.c

(limited to 'include')

diff --git a/drivers/virt/acrn/Kconfig b/drivers/virt/acrn/Kconfig
index 36c80378c30c..3e1a61c9d8d8 100644
--- a/drivers/virt/acrn/Kconfig
+++ b/drivers/virt/acrn/Kconfig
@@ -2,6 +2,7 @@
 config ACRN_HSM
 	tristate "ACRN Hypervisor Service Module"
 	depends on ACRN_GUEST
+	select EVENTFD
 	help
 	  ACRN Hypervisor Service Module (HSM) is a kernel module which
 	  communicates with ACRN userspace through ioctls and talks to
diff --git a/drivers/virt/acrn/Makefile b/drivers/virt/acrn/Makefile
index 21721cbf6a80..755b583b32ca 100644
--- a/drivers/virt/acrn/Makefile
+++ b/drivers/virt/acrn/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ACRN_HSM)	:= acrn.o
-acrn-y := hsm.o vm.o mm.o ioreq.o
+acrn-y := hsm.o vm.o mm.o ioreq.o ioeventfd.o
diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index 8a7d7721f505..e3f8190bd972 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -156,6 +156,9 @@ extern rwlock_t acrn_vm_list_lock;
  * @ioreq_page:			The page of the I/O request shared buffer
  * @pci_conf_addr:		Address of a PCI configuration access emulation
  * @monitor_page:		Page of interrupt statistics of User VM
+ * @ioeventfds_lock:		Lock to protect ioeventfds list
+ * @ioeventfds:			List to link all hsm_ioeventfd
+ * @ioeventfd_client:		I/O client for ioeventfds of the VM
  */
 struct acrn_vm {
 	struct list_head		list;
@@ -172,6 +175,9 @@ struct acrn_vm {
 	struct page			*ioreq_page;
 	u32				pci_conf_addr;
 	struct page			*monitor_page;
+	struct mutex			ioeventfds_lock;
+	struct list_head		ioeventfds;
+	struct acrn_ioreq_client	*ioeventfd_client;
 };
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
@@ -204,4 +210,8 @@ void acrn_ioreq_range_del(struct acrn_ioreq_client *client,
 
 int acrn_msi_inject(struct acrn_vm *vm, u64 msi_addr, u64 msi_data);
 
+int acrn_ioeventfd_init(struct acrn_vm *vm);
+int acrn_ioeventfd_config(struct acrn_vm *vm, struct acrn_ioeventfd *args);
+void acrn_ioeventfd_deinit(struct acrn_vm *vm);
+
 #endif /* __ACRN_HSM_DRV_H */
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index b7f2deddc3e7..0b4e88b0b6bc 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -111,6 +111,7 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_vcpu_regs *cpu_regs;
 	struct acrn_ioreq_notify notify;
 	struct acrn_ptdev_irq *irq_info;
+	struct acrn_ioeventfd ioeventfd;
 	struct acrn_vm_memmap memmap;
 	struct acrn_msi_entry *msi;
 	struct acrn_pcidev *pcidev;
@@ -336,6 +337,16 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 
 		ret = pmcmd_ioctl(cstate_cmd, (void __user *)ioctl_param);
 		break;
+	case ACRN_IOCTL_IOEVENTFD:
+		if (copy_from_user(&ioeventfd, (void __user *)ioctl_param,
+				   sizeof(ioeventfd)))
+			return -EFAULT;
+
+		if (ioeventfd.reserved != 0)
+			return -EINVAL;
+
+		ret = acrn_ioeventfd_config(vm, &ioeventfd);
+		break;
 	default:
 		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
 		ret = -ENOTTY;
diff --git a/drivers/virt/acrn/ioeventfd.c b/drivers/virt/acrn/ioeventfd.c
new file mode 100644
index 000000000000..ac4037e9f947
--- /dev/null
+++ b/drivers/virt/acrn/ioeventfd.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN HSM eventfd - use eventfd objects to signal expected I/O requests
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * Authors:
+ *	Shuo Liu <shuo.a.liu@intel.com>
+ *	Yakui Zhao <yakui.zhao@intel.com>
+ */
+
+#include <linux/eventfd.h>
+#include <linux/slab.h>
+
+#include "acrn_drv.h"
+
+/**
+ * struct hsm_ioeventfd - Properties of HSM ioeventfd
+ * @list:	Entry within &acrn_vm.ioeventfds of ioeventfds of a VM
+ * @eventfd:	Eventfd of the HSM ioeventfd
+ * @addr:	Address of I/O range
+ * @data:	Data for matching
+ * @length:	Length of I/O range
+ * @type:	Type of I/O range (ACRN_IOREQ_TYPE_MMIO/ACRN_IOREQ_TYPE_PORTIO)
+ * @wildcard:	Data matching or not
+ */
+struct hsm_ioeventfd {
+	struct list_head	list;
+	struct eventfd_ctx	*eventfd;
+	u64			addr;
+	u64			data;
+	int			length;
+	int			type;
+	bool			wildcard;
+};
+
+static inline int ioreq_type_from_flags(int flags)
+{
+	return flags & ACRN_IOEVENTFD_FLAG_PIO ?
+		       ACRN_IOREQ_TYPE_PORTIO : ACRN_IOREQ_TYPE_MMIO;
+}
+
+static void acrn_ioeventfd_shutdown(struct acrn_vm *vm, struct hsm_ioeventfd *p)
+{
+	lockdep_assert_held(&vm->ioeventfds_lock);
+
+	eventfd_ctx_put(p->eventfd);
+	list_del(&p->list);
+	kfree(p);
+}
+
+static bool hsm_ioeventfd_is_conflict(struct acrn_vm *vm,
+				      struct hsm_ioeventfd *ioeventfd)
+{
+	struct hsm_ioeventfd *p;
+
+	lockdep_assert_held(&vm->ioeventfds_lock);
+
+	/* Either one is wildcard, the data matching will be skipped. */
+	list_for_each_entry(p, &vm->ioeventfds, list)
+		if (p->eventfd == ioeventfd->eventfd &&
+		    p->addr == ioeventfd->addr &&
+		    p->type == ioeventfd->type &&
+		    (p->wildcard || ioeventfd->wildcard ||
+			p->data == ioeventfd->data))
+			return true;
+
+	return false;
+}
+
+/*
+ * Assign an eventfd to a VM and create a HSM ioeventfd associated with the
+ * eventfd. The properties of the HSM ioeventfd are built from a &struct
+ * acrn_ioeventfd.
+ */
+static int acrn_ioeventfd_assign(struct acrn_vm *vm,
+				 struct acrn_ioeventfd *args)
+{
+	struct eventfd_ctx *eventfd;
+	struct hsm_ioeventfd *p;
+	int ret;
+
+	/* Check for range overflow */
+	if (args->addr + args->len < args->addr)
+		return -EINVAL;
+
+	/*
+	 * Currently, acrn_ioeventfd is used to support vhost. 1,2,4,8 width
+	 * accesses can cover vhost's requirements.
+	 */
+	if (!(args->len == 1 || args->len == 2 ||
+	      args->len == 4 || args->len == 8))
+		return -EINVAL;
+
+	eventfd = eventfd_ctx_fdget(args->fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	INIT_LIST_HEAD(&p->list);
+	p->addr = args->addr;
+	p->length = args->len;
+	p->eventfd = eventfd;
+	p->type = ioreq_type_from_flags(args->flags);
+
+	/*
+	 * ACRN_IOEVENTFD_FLAG_DATAMATCH flag is set in virtio 1.0 support, the
+	 * writing of notification register of each virtqueue may trigger the
+	 * notification. There is no data matching requirement.
+	 */
+	if (args->flags & ACRN_IOEVENTFD_FLAG_DATAMATCH)
+		p->data = args->data;
+	else
+		p->wildcard = true;
+
+	mutex_lock(&vm->ioeventfds_lock);
+
+	if (hsm_ioeventfd_is_conflict(vm, p)) {
+		ret = -EEXIST;
+		goto unlock_fail;
+	}
+
+	/* register the I/O range into ioreq client */
+	ret = acrn_ioreq_range_add(vm->ioeventfd_client, p->type,
+				   p->addr, p->addr + p->length - 1);
+	if (ret < 0)
+		goto unlock_fail;
+
+	list_add_tail(&p->list, &vm->ioeventfds);
+	mutex_unlock(&vm->ioeventfds_lock);
+
+	return 0;
+
+unlock_fail:
+	mutex_unlock(&vm->ioeventfds_lock);
+	kfree(p);
+fail:
+	eventfd_ctx_put(eventfd);
+	return ret;
+}
+
+static int acrn_ioeventfd_deassign(struct acrn_vm *vm,
+				   struct acrn_ioeventfd *args)
+{
+	struct hsm_ioeventfd *p;
+	struct eventfd_ctx *eventfd;
+
+	eventfd = eventfd_ctx_fdget(args->fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	mutex_lock(&vm->ioeventfds_lock);
+	list_for_each_entry(p, &vm->ioeventfds, list) {
+		if (p->eventfd != eventfd)
+			continue;
+
+		acrn_ioreq_range_del(vm->ioeventfd_client, p->type,
+				     p->addr, p->addr + p->length - 1);
+		acrn_ioeventfd_shutdown(vm, p);
+		break;
+	}
+	mutex_unlock(&vm->ioeventfds_lock);
+
+	eventfd_ctx_put(eventfd);
+	return 0;
+}
+
+static struct hsm_ioeventfd *hsm_ioeventfd_match(struct acrn_vm *vm, u64 addr,
+						 u64 data, int len, int type)
+{
+	struct hsm_ioeventfd *p = NULL;
+
+	lockdep_assert_held(&vm->ioeventfds_lock);
+
+	list_for_each_entry(p, &vm->ioeventfds, list) {
+		if (p->type == type && p->addr == addr && p->length >= len &&
+		    (p->wildcard || p->data == data))
+			return p;
+	}
+
+	return NULL;
+}
+
+static int acrn_ioeventfd_handler(struct acrn_ioreq_client *client,
+				  struct acrn_io_request *req)
+{
+	struct hsm_ioeventfd *p;
+	u64 addr, val;
+	int size;
+
+	if (req->type == ACRN_IOREQ_TYPE_MMIO) {
+		/*
+		 * I/O requests are dispatched by range check only, so a
+		 * acrn_ioreq_client need process both READ and WRITE accesses
+		 * of same range. READ accesses are safe to be ignored here
+		 * because virtio PCI devices write the notify registers for
+		 * notification.
+		 */
+		if (req->reqs.mmio_request.direction == ACRN_IOREQ_DIR_READ) {
+			/* reading does nothing and return 0 */
+			req->reqs.mmio_request.value = 0;
+			return 0;
+		}
+		addr = req->reqs.mmio_request.address;
+		size = req->reqs.mmio_request.size;
+		val = req->reqs.mmio_request.value;
+	} else {
+		if (req->reqs.pio_request.direction == ACRN_IOREQ_DIR_READ) {
+			/* reading does nothing and return 0 */
+			req->reqs.pio_request.value = 0;
+			return 0;
+		}
+		addr = req->reqs.pio_request.address;
+		size = req->reqs.pio_request.size;
+		val = req->reqs.pio_request.value;
+	}
+
+	mutex_lock(&client->vm->ioeventfds_lock);
+	p = hsm_ioeventfd_match(client->vm, addr, val, size, req->type);
+	if (p)
+		eventfd_signal(p->eventfd, 1);
+	mutex_unlock(&client->vm->ioeventfds_lock);
+
+	return 0;
+}
+
+int acrn_ioeventfd_config(struct acrn_vm *vm, struct acrn_ioeventfd *args)
+{
+	int ret;
+
+	if (args->flags & ACRN_IOEVENTFD_FLAG_DEASSIGN)
+		ret = acrn_ioeventfd_deassign(vm, args);
+	else
+		ret = acrn_ioeventfd_assign(vm, args);
+
+	return ret;
+}
+
+int acrn_ioeventfd_init(struct acrn_vm *vm)
+{
+	char name[ACRN_NAME_LEN];
+
+	mutex_init(&vm->ioeventfds_lock);
+	INIT_LIST_HEAD(&vm->ioeventfds);
+	snprintf(name, sizeof(name), "ioeventfd-%u", vm->vmid);
+	vm->ioeventfd_client = acrn_ioreq_client_create(vm,
+							acrn_ioeventfd_handler,
+							NULL, false, name);
+	if (!vm->ioeventfd_client) {
+		dev_err(acrn_dev.this_device, "Failed to create ioeventfd ioreq client!\n");
+		return -EINVAL;
+	}
+
+	dev_dbg(acrn_dev.this_device, "VM %u ioeventfd init.\n", vm->vmid);
+	return 0;
+}
+
+void acrn_ioeventfd_deinit(struct acrn_vm *vm)
+{
+	struct hsm_ioeventfd *p, *next;
+
+	dev_dbg(acrn_dev.this_device, "VM %u ioeventfd deinit.\n", vm->vmid);
+	acrn_ioreq_client_destroy(vm->ioeventfd_client);
+	mutex_lock(&vm->ioeventfds_lock);
+	list_for_each_entry_safe(p, next, &vm->ioeventfds, list)
+		acrn_ioeventfd_shutdown(vm, p);
+	mutex_unlock(&vm->ioeventfds_lock);
+}
diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c
index db597f27690a..2b322d956a8c 100644
--- a/drivers/virt/acrn/vm.c
+++ b/drivers/virt/acrn/vm.c
@@ -50,6 +50,7 @@ struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
 	list_add(&vm->list, &acrn_vm_list);
 	write_unlock_bh(&acrn_vm_list_lock);
 
+	acrn_ioeventfd_init(vm);
 	dev_dbg(acrn_dev.this_device, "VM %u created.\n", vm->vmid);
 	return vm;
 }
@@ -67,6 +68,7 @@ int acrn_vm_destroy(struct acrn_vm *vm)
 	list_del_init(&vm->list);
 	write_unlock_bh(&acrn_vm_list_lock);
 
+	acrn_ioeventfd_deinit(vm);
 	acrn_ioreq_deinit(vm);
 	if (vm->monitor_page) {
 		put_page(vm->monitor_page);
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index e997d80a0cc7..132c5ff95967 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -479,6 +479,32 @@ enum acrn_pm_cmd_type {
 	ACRN_PMCMD_GET_CX_DATA,
 };
 
+#define ACRN_IOEVENTFD_FLAG_PIO		0x01
+#define ACRN_IOEVENTFD_FLAG_DATAMATCH	0x02
+#define ACRN_IOEVENTFD_FLAG_DEASSIGN	0x04
+/**
+ * struct acrn_ioeventfd - Data to operate a &struct hsm_ioeventfd
+ * @fd:		The fd of eventfd associated with a hsm_ioeventfd
+ * @flags:	Logical-OR of ACRN_IOEVENTFD_FLAG_*
+ * @addr:	The start address of IO range of ioeventfd
+ * @len:	The length of IO range of ioeventfd
+ * @reserved:	Reserved and should be 0
+ * @data:	Data for data matching
+ *
+ * Without flag ACRN_IOEVENTFD_FLAG_DEASSIGN, ioctl ACRN_IOCTL_IOEVENTFD
+ * creates a &struct hsm_ioeventfd with properties originated from &struct
+ * acrn_ioeventfd. With flag ACRN_IOEVENTFD_FLAG_DEASSIGN, ioctl
+ * ACRN_IOCTL_IOEVENTFD destroys the &struct hsm_ioeventfd matching the fd.
+ */
+struct acrn_ioeventfd {
+	__u32	fd;
+	__u32	flags;
+	__u64	addr;
+	__u32	len;
+	__u32	reserved;
+	__u64	data;
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -533,4 +559,7 @@ enum acrn_pm_cmd_type {
 #define ACRN_IOCTL_PM_GET_CPU_STATE	\
 	_IOWR(ACRN_IOCTL_TYPE, 0x60, __u64)
 
+#define ACRN_IOCTL_IOEVENTFD		\
+	_IOW(ACRN_IOCTL_TYPE, 0x70, struct acrn_ioeventfd)
+
 #endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3


From aa3b483ff1d71c50b33db154048dff9a8f08ac71 Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 7 Feb 2021 11:10:38 +0800
Subject: virt: acrn: Introduce irqfd

irqfd is a mechanism to inject a specific interrupt to a User VM using a
decoupled eventfd mechanism.

Vhost is a kernel-level virtio server which uses eventfd for interrupt
injection. To support vhost on ACRN, irqfd is introduced in HSM.

HSM provides ioctls to associate a virtual Message Signaled Interrupt
(MSI) with an eventfd. The corresponding virtual MSI will be injected
into a User VM once the eventfd got signal.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Yu Wang <yu1.wang@intel.com>
Cc: Reinette Chatre <reinette.chatre@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210207031040.49576-17-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/virt/acrn/Makefile   |   2 +-
 drivers/virt/acrn/acrn_drv.h |  10 ++
 drivers/virt/acrn/hsm.c      |   7 ++
 drivers/virt/acrn/irqfd.c    | 235 +++++++++++++++++++++++++++++++++++++++++++
 drivers/virt/acrn/vm.c       |   3 +
 include/uapi/linux/acrn.h    |  15 +++
 6 files changed, 271 insertions(+), 1 deletion(-)
 create mode 100644 drivers/virt/acrn/irqfd.c

(limited to 'include')

diff --git a/drivers/virt/acrn/Makefile b/drivers/virt/acrn/Makefile
index 755b583b32ca..08ce641dcfa1 100644
--- a/drivers/virt/acrn/Makefile
+++ b/drivers/virt/acrn/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ACRN_HSM)	:= acrn.o
-acrn-y := hsm.o vm.o mm.o ioreq.o ioeventfd.o
+acrn-y := hsm.o vm.o mm.o ioreq.o ioeventfd.o irqfd.o
diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index e3f8190bd972..1be54efa666c 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -159,6 +159,9 @@ extern rwlock_t acrn_vm_list_lock;
  * @ioeventfds_lock:		Lock to protect ioeventfds list
  * @ioeventfds:			List to link all hsm_ioeventfd
  * @ioeventfd_client:		I/O client for ioeventfds of the VM
+ * @irqfds_lock:		Lock to protect irqfds list
+ * @irqfds:			List to link all hsm_irqfd
+ * @irqfd_wq:			Workqueue for irqfd async shutdown
  */
 struct acrn_vm {
 	struct list_head		list;
@@ -178,6 +181,9 @@ struct acrn_vm {
 	struct mutex			ioeventfds_lock;
 	struct list_head		ioeventfds;
 	struct acrn_ioreq_client	*ioeventfd_client;
+	struct mutex			irqfds_lock;
+	struct list_head		irqfds;
+	struct workqueue_struct		*irqfd_wq;
 };
 
 struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
@@ -214,4 +220,8 @@ int acrn_ioeventfd_init(struct acrn_vm *vm);
 int acrn_ioeventfd_config(struct acrn_vm *vm, struct acrn_ioeventfd *args);
 void acrn_ioeventfd_deinit(struct acrn_vm *vm);
 
+int acrn_irqfd_init(struct acrn_vm *vm);
+int acrn_irqfd_config(struct acrn_vm *vm, struct acrn_irqfd *args);
+void acrn_irqfd_deinit(struct acrn_vm *vm);
+
 #endif /* __ACRN_HSM_DRV_H */
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 0b4e88b0b6bc..93f6abe10fa6 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -115,6 +115,7 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 	struct acrn_vm_memmap memmap;
 	struct acrn_msi_entry *msi;
 	struct acrn_pcidev *pcidev;
+	struct acrn_irqfd irqfd;
 	struct page *page;
 	u64 cstate_cmd;
 	int i, ret = 0;
@@ -347,6 +348,12 @@ static long acrn_dev_ioctl(struct file *filp, unsigned int cmd,
 
 		ret = acrn_ioeventfd_config(vm, &ioeventfd);
 		break;
+	case ACRN_IOCTL_IRQFD:
+		if (copy_from_user(&irqfd, (void __user *)ioctl_param,
+				   sizeof(irqfd)))
+			return -EFAULT;
+		ret = acrn_irqfd_config(vm, &irqfd);
+		break;
 	default:
 		dev_dbg(acrn_dev.this_device, "Unknown IOCTL 0x%x!\n", cmd);
 		ret = -ENOTTY;
diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c
new file mode 100644
index 000000000000..a8766d528e29
--- /dev/null
+++ b/drivers/virt/acrn/irqfd.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN HSM irqfd: use eventfd objects to inject virtual interrupts
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * Authors:
+ *	Shuo Liu <shuo.a.liu@intel.com>
+ *	Yakui Zhao <yakui.zhao@intel.com>
+ */
+
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+
+#include "acrn_drv.h"
+
+static LIST_HEAD(acrn_irqfd_clients);
+static DEFINE_MUTEX(acrn_irqfds_mutex);
+
+/**
+ * struct hsm_irqfd - Properties of HSM irqfd
+ * @vm:		Associated VM pointer
+ * @wait:	Entry of wait-queue
+ * @shutdown:	Async shutdown work
+ * @eventfd:	Associated eventfd
+ * @list:	Entry within &acrn_vm.irqfds of irqfds of a VM
+ * @pt:		Structure for select/poll on the associated eventfd
+ * @msi:	MSI data
+ */
+struct hsm_irqfd {
+	struct acrn_vm		*vm;
+	wait_queue_entry_t	wait;
+	struct work_struct	shutdown;
+	struct eventfd_ctx	*eventfd;
+	struct list_head	list;
+	poll_table		pt;
+	struct acrn_msi_entry	msi;
+};
+
+static void acrn_irqfd_inject(struct hsm_irqfd *irqfd)
+{
+	struct acrn_vm *vm = irqfd->vm;
+
+	acrn_msi_inject(vm, irqfd->msi.msi_addr,
+			irqfd->msi.msi_data);
+}
+
+static void hsm_irqfd_shutdown(struct hsm_irqfd *irqfd)
+{
+	u64 cnt;
+
+	lockdep_assert_held(&irqfd->vm->irqfds_lock);
+
+	/* remove from wait queue */
+	list_del_init(&irqfd->list);
+	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
+	eventfd_ctx_put(irqfd->eventfd);
+	kfree(irqfd);
+}
+
+static void hsm_irqfd_shutdown_work(struct work_struct *work)
+{
+	struct hsm_irqfd *irqfd;
+	struct acrn_vm *vm;
+
+	irqfd = container_of(work, struct hsm_irqfd, shutdown);
+	vm = irqfd->vm;
+	mutex_lock(&vm->irqfds_lock);
+	if (!list_empty(&irqfd->list))
+		hsm_irqfd_shutdown(irqfd);
+	mutex_unlock(&vm->irqfds_lock);
+}
+
+/* Called with wqh->lock held and interrupts disabled */
+static int hsm_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
+			    int sync, void *key)
+{
+	unsigned long poll_bits = (unsigned long)key;
+	struct hsm_irqfd *irqfd;
+	struct acrn_vm *vm;
+
+	irqfd = container_of(wait, struct hsm_irqfd, wait);
+	vm = irqfd->vm;
+	if (poll_bits & POLLIN)
+		/* An event has been signaled, inject an interrupt */
+		acrn_irqfd_inject(irqfd);
+
+	if (poll_bits & POLLHUP)
+		/* Do shutdown work in thread to hold wqh->lock */
+		queue_work(vm->irqfd_wq, &irqfd->shutdown);
+
+	return 0;
+}
+
+static void hsm_irqfd_poll_func(struct file *file, wait_queue_head_t *wqh,
+				poll_table *pt)
+{
+	struct hsm_irqfd *irqfd;
+
+	irqfd = container_of(pt, struct hsm_irqfd, pt);
+	add_wait_queue(wqh, &irqfd->wait);
+}
+
+/*
+ * Assign an eventfd to a VM and create a HSM irqfd associated with the
+ * eventfd. The properties of the HSM irqfd are built from a &struct
+ * acrn_irqfd.
+ */
+static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
+{
+	struct eventfd_ctx *eventfd = NULL;
+	struct hsm_irqfd *irqfd, *tmp;
+	unsigned int events;
+	struct fd f;
+	int ret = 0;
+
+	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+	if (!irqfd)
+		return -ENOMEM;
+
+	irqfd->vm = vm;
+	memcpy(&irqfd->msi, &args->msi, sizeof(args->msi));
+	INIT_LIST_HEAD(&irqfd->list);
+	INIT_WORK(&irqfd->shutdown, hsm_irqfd_shutdown_work);
+
+	f = fdget(args->fd);
+	if (!f.file) {
+		ret = -EBADF;
+		goto out;
+	}
+
+	eventfd = eventfd_ctx_fileget(f.file);
+	if (IS_ERR(eventfd)) {
+		ret = PTR_ERR(eventfd);
+		goto fail;
+	}
+
+	irqfd->eventfd = eventfd;
+
+	/*
+	 * Install custom wake-up handling to be notified whenever underlying
+	 * eventfd is signaled.
+	 */
+	init_waitqueue_func_entry(&irqfd->wait, hsm_irqfd_wakeup);
+	init_poll_funcptr(&irqfd->pt, hsm_irqfd_poll_func);
+
+	mutex_lock(&vm->irqfds_lock);
+	list_for_each_entry(tmp, &vm->irqfds, list) {
+		if (irqfd->eventfd != tmp->eventfd)
+			continue;
+		ret = -EBUSY;
+		mutex_unlock(&vm->irqfds_lock);
+		goto fail;
+	}
+	list_add_tail(&irqfd->list, &vm->irqfds);
+	mutex_unlock(&vm->irqfds_lock);
+
+	/* Check the pending event in this stage */
+	events = f.file->f_op->poll(f.file, &irqfd->pt);
+
+	if (events & POLLIN)
+		acrn_irqfd_inject(irqfd);
+
+	fdput(f);
+	return 0;
+fail:
+	if (eventfd && !IS_ERR(eventfd))
+		eventfd_ctx_put(eventfd);
+
+	fdput(f);
+out:
+	kfree(irqfd);
+	return ret;
+}
+
+static int acrn_irqfd_deassign(struct acrn_vm *vm,
+			       struct acrn_irqfd *args)
+{
+	struct hsm_irqfd *irqfd, *tmp;
+	struct eventfd_ctx *eventfd;
+
+	eventfd = eventfd_ctx_fdget(args->fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	mutex_lock(&vm->irqfds_lock);
+	list_for_each_entry_safe(irqfd, tmp, &vm->irqfds, list) {
+		if (irqfd->eventfd == eventfd) {
+			hsm_irqfd_shutdown(irqfd);
+			break;
+		}
+	}
+	mutex_unlock(&vm->irqfds_lock);
+	eventfd_ctx_put(eventfd);
+
+	return 0;
+}
+
+int acrn_irqfd_config(struct acrn_vm *vm, struct acrn_irqfd *args)
+{
+	int ret;
+
+	if (args->flags & ACRN_IRQFD_FLAG_DEASSIGN)
+		ret = acrn_irqfd_deassign(vm, args);
+	else
+		ret = acrn_irqfd_assign(vm, args);
+
+	return ret;
+}
+
+int acrn_irqfd_init(struct acrn_vm *vm)
+{
+	INIT_LIST_HEAD(&vm->irqfds);
+	mutex_init(&vm->irqfds_lock);
+	vm->irqfd_wq = alloc_workqueue("acrn_irqfd-%u", 0, 0, vm->vmid);
+	if (!vm->irqfd_wq)
+		return -ENOMEM;
+
+	dev_dbg(acrn_dev.this_device, "VM %u irqfd init.\n", vm->vmid);
+	return 0;
+}
+
+void acrn_irqfd_deinit(struct acrn_vm *vm)
+{
+	struct hsm_irqfd *irqfd, *next;
+
+	dev_dbg(acrn_dev.this_device, "VM %u irqfd deinit.\n", vm->vmid);
+	destroy_workqueue(vm->irqfd_wq);
+	mutex_lock(&vm->irqfds_lock);
+	list_for_each_entry_safe(irqfd, next, &vm->irqfds, list)
+		hsm_irqfd_shutdown(irqfd);
+	mutex_unlock(&vm->irqfds_lock);
+}
diff --git a/drivers/virt/acrn/vm.c b/drivers/virt/acrn/vm.c
index 2b322d956a8c..7804a2492ad7 100644
--- a/drivers/virt/acrn/vm.c
+++ b/drivers/virt/acrn/vm.c
@@ -51,6 +51,7 @@ struct acrn_vm *acrn_vm_create(struct acrn_vm *vm,
 	write_unlock_bh(&acrn_vm_list_lock);
 
 	acrn_ioeventfd_init(vm);
+	acrn_irqfd_init(vm);
 	dev_dbg(acrn_dev.this_device, "VM %u created.\n", vm->vmid);
 	return vm;
 }
@@ -69,7 +70,9 @@ int acrn_vm_destroy(struct acrn_vm *vm)
 	write_unlock_bh(&acrn_vm_list_lock);
 
 	acrn_ioeventfd_deinit(vm);
+	acrn_irqfd_deinit(vm);
 	acrn_ioreq_deinit(vm);
+
 	if (vm->monitor_page) {
 		put_page(vm->monitor_page);
 		vm->monitor_page = NULL;
diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h
index 132c5ff95967..353b2a2e4536 100644
--- a/include/uapi/linux/acrn.h
+++ b/include/uapi/linux/acrn.h
@@ -505,6 +505,19 @@ struct acrn_ioeventfd {
 	__u64	data;
 };
 
+#define ACRN_IRQFD_FLAG_DEASSIGN	0x01
+/**
+ * struct acrn_irqfd - Data to operate a &struct hsm_irqfd
+ * @fd:		The fd of eventfd associated with a hsm_irqfd
+ * @flags:	Logical-OR of ACRN_IRQFD_FLAG_*
+ * @msi:	Info of MSI associated with the irqfd
+ */
+struct acrn_irqfd {
+	__s32			fd;
+	__u32			flags;
+	struct acrn_msi_entry	msi;
+};
+
 /* The ioctl type, documented in ioctl-number.rst */
 #define ACRN_IOCTL_TYPE			0xA2
 
@@ -561,5 +574,7 @@ struct acrn_ioeventfd {
 
 #define ACRN_IOCTL_IOEVENTFD		\
 	_IOW(ACRN_IOCTL_TYPE, 0x70, struct acrn_ioeventfd)
+#define ACRN_IOCTL_IRQFD		\
+	_IOW(ACRN_IOCTL_TYPE, 0x71, struct acrn_irqfd)
 
 #endif /* _UAPI_ACRN_H */
-- 
cgit v1.2.3


From 1077d4367ab3b97f6db2f66c87289af863652215 Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@xilinx.com>
Date: Wed, 3 Feb 2021 15:13:50 +0100
Subject: firmware: xilinx: Use explicit values for all enum values

Based on discussion at
https://lore.kernel.org/r/20200318125003.GA2727094@kroah.com we got
recommendation to use explicit values for all enum values.
The patch is following this recommendation.

Signed-off-by: Michal Simek <michal.simek@xilinx.com>
Link: https://lore.kernel.org/r/daeb67ded45d8a8f6a96717d1fb9c84439dd2ae8.1612361627.git.michal.simek@xilinx.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 338 +++++++++++++++++------------------
 1 file changed, 169 insertions(+), 169 deletions(-)

(limited to 'include')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 0a483249ff30..71177b17eee5 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -64,25 +64,25 @@ enum pm_api_id {
 	PM_GET_API_VERSION = 1,
 	PM_SYSTEM_SHUTDOWN = 12,
 	PM_REQUEST_NODE = 13,
-	PM_RELEASE_NODE,
-	PM_SET_REQUIREMENT,
+	PM_RELEASE_NODE = 14,
+	PM_SET_REQUIREMENT = 15,
 	PM_RESET_ASSERT = 17,
-	PM_RESET_GET_STATUS,
+	PM_RESET_GET_STATUS = 18,
 	PM_PM_INIT_FINALIZE = 21,
-	PM_FPGA_LOAD,
-	PM_FPGA_GET_STATUS,
+	PM_FPGA_LOAD = 22,
+	PM_FPGA_GET_STATUS = 23,
 	PM_GET_CHIPID = 24,
 	PM_IOCTL = 34,
-	PM_QUERY_DATA,
-	PM_CLOCK_ENABLE,
-	PM_CLOCK_DISABLE,
-	PM_CLOCK_GETSTATE,
-	PM_CLOCK_SETDIVIDER,
-	PM_CLOCK_GETDIVIDER,
-	PM_CLOCK_SETRATE,
-	PM_CLOCK_GETRATE,
-	PM_CLOCK_SETPARENT,
-	PM_CLOCK_GETPARENT,
+	PM_QUERY_DATA = 35,
+	PM_CLOCK_ENABLE = 36,
+	PM_CLOCK_DISABLE = 37,
+	PM_CLOCK_GETSTATE = 38,
+	PM_CLOCK_SETDIVIDER = 39,
+	PM_CLOCK_GETDIVIDER = 40,
+	PM_CLOCK_SETRATE = 41,
+	PM_CLOCK_GETRATE = 42,
+	PM_CLOCK_SETPARENT = 43,
+	PM_CLOCK_GETPARENT = 44,
 	PM_SECURE_AES = 47,
 	PM_FEATURE_CHECK = 63,
 };
@@ -92,21 +92,21 @@ enum pm_ret_status {
 	XST_PM_SUCCESS = 0,
 	XST_PM_NO_FEATURE = 19,
 	XST_PM_INTERNAL = 2000,
-	XST_PM_CONFLICT,
-	XST_PM_NO_ACCESS,
-	XST_PM_INVALID_NODE,
-	XST_PM_DOUBLE_REQ,
-	XST_PM_ABORT_SUSPEND,
+	XST_PM_CONFLICT = 2001,
+	XST_PM_NO_ACCESS = 2002,
+	XST_PM_INVALID_NODE = 2003,
+	XST_PM_DOUBLE_REQ = 2004,
+	XST_PM_ABORT_SUSPEND = 2005,
 	XST_PM_MULT_USER = 2008,
 };
 
 enum pm_ioctl_id {
 	IOCTL_SD_DLL_RESET = 6,
-	IOCTL_SET_SD_TAPDELAY,
-	IOCTL_SET_PLL_FRAC_MODE,
-	IOCTL_GET_PLL_FRAC_MODE,
-	IOCTL_SET_PLL_FRAC_DATA,
-	IOCTL_GET_PLL_FRAC_DATA,
+	IOCTL_SET_SD_TAPDELAY = 7,
+	IOCTL_SET_PLL_FRAC_MODE = 8,
+	IOCTL_GET_PLL_FRAC_MODE = 9,
+	IOCTL_SET_PLL_FRAC_DATA = 10,
+	IOCTL_GET_PLL_FRAC_DATA = 11,
 	IOCTL_WRITE_GGS = 12,
 	IOCTL_READ_GGS = 13,
 	IOCTL_WRITE_PGGS = 14,
@@ -116,185 +116,185 @@ enum pm_ioctl_id {
 };
 
 enum pm_query_id {
-	PM_QID_INVALID,
-	PM_QID_CLOCK_GET_NAME,
-	PM_QID_CLOCK_GET_TOPOLOGY,
-	PM_QID_CLOCK_GET_FIXEDFACTOR_PARAMS,
-	PM_QID_CLOCK_GET_PARENTS,
-	PM_QID_CLOCK_GET_ATTRIBUTES,
+	PM_QID_INVALID = 0,
+	PM_QID_CLOCK_GET_NAME = 1,
+	PM_QID_CLOCK_GET_TOPOLOGY = 2,
+	PM_QID_CLOCK_GET_FIXEDFACTOR_PARAMS = 3,
+	PM_QID_CLOCK_GET_PARENTS = 4,
+	PM_QID_CLOCK_GET_ATTRIBUTES = 5,
 	PM_QID_CLOCK_GET_NUM_CLOCKS = 12,
-	PM_QID_CLOCK_GET_MAX_DIVISOR,
+	PM_QID_CLOCK_GET_MAX_DIVISOR = 13,
 };
 
 enum zynqmp_pm_reset_action {
-	PM_RESET_ACTION_RELEASE,
-	PM_RESET_ACTION_ASSERT,
-	PM_RESET_ACTION_PULSE,
+	PM_RESET_ACTION_RELEASE = 0,
+	PM_RESET_ACTION_ASSERT = 1,
+	PM_RESET_ACTION_PULSE = 2,
 };
 
 enum zynqmp_pm_reset {
 	ZYNQMP_PM_RESET_START = 1000,
 	ZYNQMP_PM_RESET_PCIE_CFG = ZYNQMP_PM_RESET_START,
-	ZYNQMP_PM_RESET_PCIE_BRIDGE,
-	ZYNQMP_PM_RESET_PCIE_CTRL,
-	ZYNQMP_PM_RESET_DP,
-	ZYNQMP_PM_RESET_SWDT_CRF,
-	ZYNQMP_PM_RESET_AFI_FM5,
-	ZYNQMP_PM_RESET_AFI_FM4,
-	ZYNQMP_PM_RESET_AFI_FM3,
-	ZYNQMP_PM_RESET_AFI_FM2,
-	ZYNQMP_PM_RESET_AFI_FM1,
-	ZYNQMP_PM_RESET_AFI_FM0,
-	ZYNQMP_PM_RESET_GDMA,
-	ZYNQMP_PM_RESET_GPU_PP1,
-	ZYNQMP_PM_RESET_GPU_PP0,
-	ZYNQMP_PM_RESET_GPU,
-	ZYNQMP_PM_RESET_GT,
-	ZYNQMP_PM_RESET_SATA,
-	ZYNQMP_PM_RESET_ACPU3_PWRON,
-	ZYNQMP_PM_RESET_ACPU2_PWRON,
-	ZYNQMP_PM_RESET_ACPU1_PWRON,
-	ZYNQMP_PM_RESET_ACPU0_PWRON,
-	ZYNQMP_PM_RESET_APU_L2,
-	ZYNQMP_PM_RESET_ACPU3,
-	ZYNQMP_PM_RESET_ACPU2,
-	ZYNQMP_PM_RESET_ACPU1,
-	ZYNQMP_PM_RESET_ACPU0,
-	ZYNQMP_PM_RESET_DDR,
-	ZYNQMP_PM_RESET_APM_FPD,
-	ZYNQMP_PM_RESET_SOFT,
-	ZYNQMP_PM_RESET_GEM0,
-	ZYNQMP_PM_RESET_GEM1,
-	ZYNQMP_PM_RESET_GEM2,
-	ZYNQMP_PM_RESET_GEM3,
-	ZYNQMP_PM_RESET_QSPI,
-	ZYNQMP_PM_RESET_UART0,
-	ZYNQMP_PM_RESET_UART1,
-	ZYNQMP_PM_RESET_SPI0,
-	ZYNQMP_PM_RESET_SPI1,
-	ZYNQMP_PM_RESET_SDIO0,
-	ZYNQMP_PM_RESET_SDIO1,
-	ZYNQMP_PM_RESET_CAN0,
-	ZYNQMP_PM_RESET_CAN1,
-	ZYNQMP_PM_RESET_I2C0,
-	ZYNQMP_PM_RESET_I2C1,
-	ZYNQMP_PM_RESET_TTC0,
-	ZYNQMP_PM_RESET_TTC1,
-	ZYNQMP_PM_RESET_TTC2,
-	ZYNQMP_PM_RESET_TTC3,
-	ZYNQMP_PM_RESET_SWDT_CRL,
-	ZYNQMP_PM_RESET_NAND,
-	ZYNQMP_PM_RESET_ADMA,
-	ZYNQMP_PM_RESET_GPIO,
-	ZYNQMP_PM_RESET_IOU_CC,
-	ZYNQMP_PM_RESET_TIMESTAMP,
-	ZYNQMP_PM_RESET_RPU_R50,
-	ZYNQMP_PM_RESET_RPU_R51,
-	ZYNQMP_PM_RESET_RPU_AMBA,
-	ZYNQMP_PM_RESET_OCM,
-	ZYNQMP_PM_RESET_RPU_PGE,
-	ZYNQMP_PM_RESET_USB0_CORERESET,
-	ZYNQMP_PM_RESET_USB1_CORERESET,
-	ZYNQMP_PM_RESET_USB0_HIBERRESET,
-	ZYNQMP_PM_RESET_USB1_HIBERRESET,
-	ZYNQMP_PM_RESET_USB0_APB,
-	ZYNQMP_PM_RESET_USB1_APB,
-	ZYNQMP_PM_RESET_IPI,
-	ZYNQMP_PM_RESET_APM_LPD,
-	ZYNQMP_PM_RESET_RTC,
-	ZYNQMP_PM_RESET_SYSMON,
-	ZYNQMP_PM_RESET_AFI_FM6,
-	ZYNQMP_PM_RESET_LPD_SWDT,
-	ZYNQMP_PM_RESET_FPD,
-	ZYNQMP_PM_RESET_RPU_DBG1,
-	ZYNQMP_PM_RESET_RPU_DBG0,
-	ZYNQMP_PM_RESET_DBG_LPD,
-	ZYNQMP_PM_RESET_DBG_FPD,
-	ZYNQMP_PM_RESET_APLL,
-	ZYNQMP_PM_RESET_DPLL,
-	ZYNQMP_PM_RESET_VPLL,
-	ZYNQMP_PM_RESET_IOPLL,
-	ZYNQMP_PM_RESET_RPLL,
-	ZYNQMP_PM_RESET_GPO3_PL_0,
-	ZYNQMP_PM_RESET_GPO3_PL_1,
-	ZYNQMP_PM_RESET_GPO3_PL_2,
-	ZYNQMP_PM_RESET_GPO3_PL_3,
-	ZYNQMP_PM_RESET_GPO3_PL_4,
-	ZYNQMP_PM_RESET_GPO3_PL_5,
-	ZYNQMP_PM_RESET_GPO3_PL_6,
-	ZYNQMP_PM_RESET_GPO3_PL_7,
-	ZYNQMP_PM_RESET_GPO3_PL_8,
-	ZYNQMP_PM_RESET_GPO3_PL_9,
-	ZYNQMP_PM_RESET_GPO3_PL_10,
-	ZYNQMP_PM_RESET_GPO3_PL_11,
-	ZYNQMP_PM_RESET_GPO3_PL_12,
-	ZYNQMP_PM_RESET_GPO3_PL_13,
-	ZYNQMP_PM_RESET_GPO3_PL_14,
-	ZYNQMP_PM_RESET_GPO3_PL_15,
-	ZYNQMP_PM_RESET_GPO3_PL_16,
-	ZYNQMP_PM_RESET_GPO3_PL_17,
-	ZYNQMP_PM_RESET_GPO3_PL_18,
-	ZYNQMP_PM_RESET_GPO3_PL_19,
-	ZYNQMP_PM_RESET_GPO3_PL_20,
-	ZYNQMP_PM_RESET_GPO3_PL_21,
-	ZYNQMP_PM_RESET_GPO3_PL_22,
-	ZYNQMP_PM_RESET_GPO3_PL_23,
-	ZYNQMP_PM_RESET_GPO3_PL_24,
-	ZYNQMP_PM_RESET_GPO3_PL_25,
-	ZYNQMP_PM_RESET_GPO3_PL_26,
-	ZYNQMP_PM_RESET_GPO3_PL_27,
-	ZYNQMP_PM_RESET_GPO3_PL_28,
-	ZYNQMP_PM_RESET_GPO3_PL_29,
-	ZYNQMP_PM_RESET_GPO3_PL_30,
-	ZYNQMP_PM_RESET_GPO3_PL_31,
-	ZYNQMP_PM_RESET_RPU_LS,
-	ZYNQMP_PM_RESET_PS_ONLY,
-	ZYNQMP_PM_RESET_PL,
-	ZYNQMP_PM_RESET_PS_PL0,
-	ZYNQMP_PM_RESET_PS_PL1,
-	ZYNQMP_PM_RESET_PS_PL2,
-	ZYNQMP_PM_RESET_PS_PL3,
+	ZYNQMP_PM_RESET_PCIE_BRIDGE = 1001,
+	ZYNQMP_PM_RESET_PCIE_CTRL = 1002,
+	ZYNQMP_PM_RESET_DP = 1003,
+	ZYNQMP_PM_RESET_SWDT_CRF = 1004,
+	ZYNQMP_PM_RESET_AFI_FM5 = 1005,
+	ZYNQMP_PM_RESET_AFI_FM4 = 1006,
+	ZYNQMP_PM_RESET_AFI_FM3 = 1007,
+	ZYNQMP_PM_RESET_AFI_FM2 = 1008,
+	ZYNQMP_PM_RESET_AFI_FM1 = 1009,
+	ZYNQMP_PM_RESET_AFI_FM0 = 1010,
+	ZYNQMP_PM_RESET_GDMA = 1011,
+	ZYNQMP_PM_RESET_GPU_PP1 = 1012,
+	ZYNQMP_PM_RESET_GPU_PP0 = 1013,
+	ZYNQMP_PM_RESET_GPU = 1014,
+	ZYNQMP_PM_RESET_GT = 1015,
+	ZYNQMP_PM_RESET_SATA = 1016,
+	ZYNQMP_PM_RESET_ACPU3_PWRON = 1017,
+	ZYNQMP_PM_RESET_ACPU2_PWRON = 1018,
+	ZYNQMP_PM_RESET_ACPU1_PWRON = 1019,
+	ZYNQMP_PM_RESET_ACPU0_PWRON = 1020,
+	ZYNQMP_PM_RESET_APU_L2 = 1021,
+	ZYNQMP_PM_RESET_ACPU3 = 1022,
+	ZYNQMP_PM_RESET_ACPU2 = 1023,
+	ZYNQMP_PM_RESET_ACPU1 = 1024,
+	ZYNQMP_PM_RESET_ACPU0 = 1025,
+	ZYNQMP_PM_RESET_DDR = 1026,
+	ZYNQMP_PM_RESET_APM_FPD = 1027,
+	ZYNQMP_PM_RESET_SOFT = 1028,
+	ZYNQMP_PM_RESET_GEM0 = 1029,
+	ZYNQMP_PM_RESET_GEM1 = 1030,
+	ZYNQMP_PM_RESET_GEM2 = 1031,
+	ZYNQMP_PM_RESET_GEM3 = 1032,
+	ZYNQMP_PM_RESET_QSPI = 1033,
+	ZYNQMP_PM_RESET_UART0 = 1034,
+	ZYNQMP_PM_RESET_UART1 = 1035,
+	ZYNQMP_PM_RESET_SPI0 = 1036,
+	ZYNQMP_PM_RESET_SPI1 = 1037,
+	ZYNQMP_PM_RESET_SDIO0 = 1038,
+	ZYNQMP_PM_RESET_SDIO1 = 1039,
+	ZYNQMP_PM_RESET_CAN0 = 1040,
+	ZYNQMP_PM_RESET_CAN1 = 1041,
+	ZYNQMP_PM_RESET_I2C0 = 1042,
+	ZYNQMP_PM_RESET_I2C1 = 1043,
+	ZYNQMP_PM_RESET_TTC0 = 1044,
+	ZYNQMP_PM_RESET_TTC1 = 1045,
+	ZYNQMP_PM_RESET_TTC2 = 1046,
+	ZYNQMP_PM_RESET_TTC3 = 1047,
+	ZYNQMP_PM_RESET_SWDT_CRL = 1048,
+	ZYNQMP_PM_RESET_NAND = 1049,
+	ZYNQMP_PM_RESET_ADMA = 1050,
+	ZYNQMP_PM_RESET_GPIO = 1051,
+	ZYNQMP_PM_RESET_IOU_CC = 1052,
+	ZYNQMP_PM_RESET_TIMESTAMP = 1053,
+	ZYNQMP_PM_RESET_RPU_R50 = 1054,
+	ZYNQMP_PM_RESET_RPU_R51 = 1055,
+	ZYNQMP_PM_RESET_RPU_AMBA = 1056,
+	ZYNQMP_PM_RESET_OCM = 1057,
+	ZYNQMP_PM_RESET_RPU_PGE = 1058,
+	ZYNQMP_PM_RESET_USB0_CORERESET = 1059,
+	ZYNQMP_PM_RESET_USB1_CORERESET = 1060,
+	ZYNQMP_PM_RESET_USB0_HIBERRESET = 1061,
+	ZYNQMP_PM_RESET_USB1_HIBERRESET = 1062,
+	ZYNQMP_PM_RESET_USB0_APB = 1063,
+	ZYNQMP_PM_RESET_USB1_APB = 1064,
+	ZYNQMP_PM_RESET_IPI = 1065,
+	ZYNQMP_PM_RESET_APM_LPD = 1066,
+	ZYNQMP_PM_RESET_RTC = 1067,
+	ZYNQMP_PM_RESET_SYSMON = 1068,
+	ZYNQMP_PM_RESET_AFI_FM6 = 1069,
+	ZYNQMP_PM_RESET_LPD_SWDT = 1070,
+	ZYNQMP_PM_RESET_FPD = 1071,
+	ZYNQMP_PM_RESET_RPU_DBG1 = 1072,
+	ZYNQMP_PM_RESET_RPU_DBG0 = 1073,
+	ZYNQMP_PM_RESET_DBG_LPD = 1074,
+	ZYNQMP_PM_RESET_DBG_FPD = 1075,
+	ZYNQMP_PM_RESET_APLL = 1076,
+	ZYNQMP_PM_RESET_DPLL = 1077,
+	ZYNQMP_PM_RESET_VPLL = 1078,
+	ZYNQMP_PM_RESET_IOPLL = 1079,
+	ZYNQMP_PM_RESET_RPLL = 1080,
+	ZYNQMP_PM_RESET_GPO3_PL_0 = 1081,
+	ZYNQMP_PM_RESET_GPO3_PL_1 = 1082,
+	ZYNQMP_PM_RESET_GPO3_PL_2 = 1083,
+	ZYNQMP_PM_RESET_GPO3_PL_3 = 1084,
+	ZYNQMP_PM_RESET_GPO3_PL_4 = 1085,
+	ZYNQMP_PM_RESET_GPO3_PL_5 = 1086,
+	ZYNQMP_PM_RESET_GPO3_PL_6 = 1087,
+	ZYNQMP_PM_RESET_GPO3_PL_7 = 1088,
+	ZYNQMP_PM_RESET_GPO3_PL_8 = 1089,
+	ZYNQMP_PM_RESET_GPO3_PL_9 = 1090,
+	ZYNQMP_PM_RESET_GPO3_PL_10 = 1091,
+	ZYNQMP_PM_RESET_GPO3_PL_11 = 1092,
+	ZYNQMP_PM_RESET_GPO3_PL_12 = 1093,
+	ZYNQMP_PM_RESET_GPO3_PL_13 = 1094,
+	ZYNQMP_PM_RESET_GPO3_PL_14 = 1095,
+	ZYNQMP_PM_RESET_GPO3_PL_15 = 1096,
+	ZYNQMP_PM_RESET_GPO3_PL_16 = 1097,
+	ZYNQMP_PM_RESET_GPO3_PL_17 = 1098,
+	ZYNQMP_PM_RESET_GPO3_PL_18 = 1099,
+	ZYNQMP_PM_RESET_GPO3_PL_19 = 1100,
+	ZYNQMP_PM_RESET_GPO3_PL_20 = 1101,
+	ZYNQMP_PM_RESET_GPO3_PL_21 = 1102,
+	ZYNQMP_PM_RESET_GPO3_PL_22 = 1103,
+	ZYNQMP_PM_RESET_GPO3_PL_23 = 1104,
+	ZYNQMP_PM_RESET_GPO3_PL_24 = 1105,
+	ZYNQMP_PM_RESET_GPO3_PL_25 = 1106,
+	ZYNQMP_PM_RESET_GPO3_PL_26 = 1107,
+	ZYNQMP_PM_RESET_GPO3_PL_27 = 1108,
+	ZYNQMP_PM_RESET_GPO3_PL_28 = 1109,
+	ZYNQMP_PM_RESET_GPO3_PL_29 = 1110,
+	ZYNQMP_PM_RESET_GPO3_PL_30 = 1111,
+	ZYNQMP_PM_RESET_GPO3_PL_31 = 1112,
+	ZYNQMP_PM_RESET_RPU_LS = 1113,
+	ZYNQMP_PM_RESET_PS_ONLY = 1114,
+	ZYNQMP_PM_RESET_PL = 1115,
+	ZYNQMP_PM_RESET_PS_PL0 = 1116,
+	ZYNQMP_PM_RESET_PS_PL1 = 1117,
+	ZYNQMP_PM_RESET_PS_PL2 = 1118,
+	ZYNQMP_PM_RESET_PS_PL3 = 1119,
 	ZYNQMP_PM_RESET_END = ZYNQMP_PM_RESET_PS_PL3
 };
 
 enum zynqmp_pm_suspend_reason {
 	SUSPEND_POWER_REQUEST = 201,
-	SUSPEND_ALERT,
-	SUSPEND_SYSTEM_SHUTDOWN,
+	SUSPEND_ALERT = 202,
+	SUSPEND_SYSTEM_SHUTDOWN = 203,
 };
 
 enum zynqmp_pm_request_ack {
 	ZYNQMP_PM_REQUEST_ACK_NO = 1,
-	ZYNQMP_PM_REQUEST_ACK_BLOCKING,
-	ZYNQMP_PM_REQUEST_ACK_NON_BLOCKING,
+	ZYNQMP_PM_REQUEST_ACK_BLOCKING = 2,
+	ZYNQMP_PM_REQUEST_ACK_NON_BLOCKING = 3,
 };
 
 enum pm_node_id {
 	NODE_SD_0 = 39,
-	NODE_SD_1,
+	NODE_SD_1 = 40,
 };
 
 enum tap_delay_type {
 	PM_TAPDELAY_INPUT = 0,
-	PM_TAPDELAY_OUTPUT,
+	PM_TAPDELAY_OUTPUT = 1,
 };
 
 enum dll_reset_type {
-	PM_DLL_RESET_ASSERT,
-	PM_DLL_RESET_RELEASE,
-	PM_DLL_RESET_PULSE,
+	PM_DLL_RESET_ASSERT = 0,
+	PM_DLL_RESET_RELEASE = 1,
+	PM_DLL_RESET_PULSE = 2,
 };
 
 enum zynqmp_pm_shutdown_type {
-	ZYNQMP_PM_SHUTDOWN_TYPE_SHUTDOWN,
-	ZYNQMP_PM_SHUTDOWN_TYPE_RESET,
-	ZYNQMP_PM_SHUTDOWN_TYPE_SETSCOPE_ONLY,
+	ZYNQMP_PM_SHUTDOWN_TYPE_SHUTDOWN = 0,
+	ZYNQMP_PM_SHUTDOWN_TYPE_RESET = 1,
+	ZYNQMP_PM_SHUTDOWN_TYPE_SETSCOPE_ONLY = 2,
 };
 
 enum zynqmp_pm_shutdown_subtype {
-	ZYNQMP_PM_SHUTDOWN_SUBTYPE_SUBSYSTEM,
-	ZYNQMP_PM_SHUTDOWN_SUBTYPE_PS_ONLY,
-	ZYNQMP_PM_SHUTDOWN_SUBTYPE_SYSTEM,
+	ZYNQMP_PM_SHUTDOWN_SUBTYPE_SUBSYSTEM = 0,
+	ZYNQMP_PM_SHUTDOWN_SUBTYPE_PS_ONLY = 1,
+	ZYNQMP_PM_SHUTDOWN_SUBTYPE_SYSTEM = 2,
 };
 
 /**
-- 
cgit v1.2.3


From 2adc75fba3289455b9c4349dd6b95cfb7167b7ea Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <uwe@kleine-koenig.org>
Date: Wed, 27 Jan 2021 22:23:29 +0100
Subject: vme: make remove callback return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver core ignores the return value of struct bus_type::remove()
because there is only little that can be done. To simplify the quest to
make this function return void, let struct vme_driver::remove return void,
too. There is only a single vme driver and it already returns 0
unconditionally in .remove().

Also fix the bus remove function to always return 0.

Signed-off-by: Uwe Kleine-König <uwe@kleine-koenig.org>
Link: https://lore.kernel.org/r/20210127212329.98517-1-uwe@kleine-koenig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/vme/devices/vme_user.c | 4 +---
 drivers/vme/vme.c                      | 4 ++--
 include/linux/vme.h                    | 2 +-
 3 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/staging/vme/devices/vme_user.c b/drivers/staging/vme/devices/vme_user.c
index fd0ea4dbcb91..35d7260e2271 100644
--- a/drivers/staging/vme/devices/vme_user.c
+++ b/drivers/staging/vme/devices/vme_user.c
@@ -689,7 +689,7 @@ err_dev:
 	return err;
 }
 
-static int vme_user_remove(struct vme_dev *dev)
+static void vme_user_remove(struct vme_dev *dev)
 {
 	int i;
 
@@ -717,8 +717,6 @@ static int vme_user_remove(struct vme_dev *dev)
 
 	/* Unregister the major and minor device numbers */
 	unregister_chrdev_region(MKDEV(VME_MAJOR, 0), VME_DEVS);
-
-	return 0;
 }
 
 static struct vme_driver vme_user_driver = {
diff --git a/drivers/vme/vme.c b/drivers/vme/vme.c
index 54d7963c1078..1b15afea28ee 100644
--- a/drivers/vme/vme.c
+++ b/drivers/vme/vme.c
@@ -1997,9 +1997,9 @@ static int vme_bus_remove(struct device *dev)
 
 	driver = dev->platform_data;
 	if (driver->remove)
-		return driver->remove(vdev);
+		driver->remove(vdev);
 
-	return -ENODEV;
+	return 0;
 }
 
 struct bus_type vme_bus_type = {
diff --git a/include/linux/vme.h b/include/linux/vme.h
index 7e82bf500f01..b204a9b4be1b 100644
--- a/include/linux/vme.h
+++ b/include/linux/vme.h
@@ -122,7 +122,7 @@ struct vme_driver {
 	const char *name;
 	int (*match)(struct vme_dev *);
 	int (*probe)(struct vme_dev *);
-	int (*remove)(struct vme_dev *);
+	void (*remove)(struct vme_dev *);
 	struct device_driver driver;
 	struct list_head devices;
 };
-- 
cgit v1.2.3


From 5c3db63abdb08d8f0ec2c609f7789a199d5c476f Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Fri, 5 Feb 2021 18:06:08 +0100
Subject: device.h: Remove bogus "the" in kerneldoc

Remove the bogus word "the" from "...once the it is..." in the
documentation describing the "dev_groups" member of the device_driver
structure.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20210205170608.1956223-1-geert@linux-m68k.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index ee7ba5b5417e..a498ebcf4993 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -75,7 +75,7 @@ enum probe_type {
  * @resume:	Called to bring a device from sleep mode.
  * @groups:	Default attributes that get created by the driver core
  *		automatically.
- * @dev_groups:	Additional attributes attached to device instance once the
+ * @dev_groups:	Additional attributes attached to device instance once
  *		it is bound to the driver.
  * @pm:		Power management operations of the device which matched
  *		this driver.
-- 
cgit v1.2.3


From 0566752c3e8681ec47fee37374cb38081d801e95 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sun, 7 Feb 2021 14:05:43 +0100
Subject: uapi: map_to_7segment: Update example in documentation

The device_attribute .show() and .store() methods gained an extra
parameter in v2.6.13, but the example in the documentation for the
7-segment header file was never updated.  Add the missing parameters.

While at it, get rid of the (misspelled) deprecated symbolic
permissions, and switch to DEVICE_ATTR_RW(), which was introduced in
v3.11

Fixes: 54b6f35c99974e99 ("[PATCH] Driver core: change device_attribute callbacks")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20210207130543.2128980-1-geert@linux-m68k.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/map_to_7segment.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/map_to_7segment.h b/include/uapi/linux/map_to_7segment.h
index 13a06e5e966e..8b02088f96e3 100644
--- a/include/uapi/linux/map_to_7segment.h
+++ b/include/uapi/linux/map_to_7segment.h
@@ -45,17 +45,22 @@
  * In device drivers it is recommended, if required, to make the char map
  * accessible via the sysfs interface using the following scheme:
  *
- * static ssize_t show_map(struct device *dev, char *buf) {
+ * static ssize_t map_seg7_show(struct device *dev,
+ *				struct device_attribute *attr, char *buf)
+ * {
  *	memcpy(buf, &map_seg7, sizeof(map_seg7));
  *	return sizeof(map_seg7);
  * }
- * static ssize_t store_map(struct device *dev, const char *buf, size_t cnt) {
+ * static ssize_t map_seg7_store(struct device *dev,
+ *				 struct device_attribute *attr, const char *buf,
+ *				 size_t cnt)
+ * {
  *	if(cnt != sizeof(map_seg7))
  *		return -EINVAL;
  *	memcpy(&map_seg7, buf, cnt);
  *	return cnt;
  * }
- * static DEVICE_ATTR(map_seg7, PERMS_RW, show_map, store_map);
+ * static DEVICE_ATTR_RW(map_seg7);
  *
  * History:
  * 2005-05-31	RFC linux-kernel@vger.kernel.org
-- 
cgit v1.2.3


From 9528e0d9c10027ae80e2aab36e30a1f730b1bbf9 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Fri, 5 Feb 2021 14:26:37 -0800
Subject: driver core: fw_devlink: Detect supplier devices that will never be
 added

During the initial parsing of firmware by fw_devlink, fw_devlink might
infer that some supplier firmware nodes would get populated as devices.
But the inference is not always correct. This patch tries to logically
detect and fix such mistakes as boot progresses or more devices probe.

fw_devlink makes a fundamental assumption that once a device binds to a
driver, it will populate (i.e: add as struct devices) all the child
firmware nodes that could be populated as devices (if they aren't
populated already).

So, whenever a device probes, we check all its child firmware nodes. If
a child firmware node has a corresponding device populated, we don't
modify the child node or its descendants. However, if a child firmware
node has not been populated as a device, we delete all the fwnode links
where the child node or its descendants are suppliers. This ensures that
no other device is blocked on a firmware node that will never be
populated as a device. We also mark such fwnodes as NOT_DEVICE, so that
no new fwnode links are created with these nodes as suppliers.

Fixes: e590474768f1 ("driver core: Set fw_devlink=on by default")
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20210205222644.2357303-2-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 31 ++++++++++++++++++++++++++++---
 include/linux/fwnode.h |  2 ++
 2 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 484a942884ba..c95b1daabac7 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -148,6 +148,21 @@ void fwnode_links_purge(struct fwnode_handle *fwnode)
 	fwnode_links_purge_consumers(fwnode);
 }
 
+static void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode)
+{
+	struct fwnode_handle *child;
+
+	/* Don't purge consumer links of an added child */
+	if (fwnode->dev)
+		return;
+
+	fwnode->flags |= FWNODE_FLAG_NOT_DEVICE;
+	fwnode_links_purge_consumers(fwnode);
+
+	fwnode_for_each_available_child_node(fwnode, child)
+		fw_devlink_purge_absent_suppliers(child);
+}
+
 #ifdef CONFIG_SRCU
 static DEFINE_MUTEX(device_links_lock);
 DEFINE_STATIC_SRCU(device_links_srcu);
@@ -1154,12 +1169,22 @@ void device_links_driver_bound(struct device *dev)
 	LIST_HEAD(sync_list);
 
 	/*
-	 * If a device probes successfully, it's expected to have created all
+	 * If a device binds successfully, it's expected to have created all
 	 * the device links it needs to or make new device links as it needs
-	 * them. So, it no longer needs to wait on any suppliers.
+	 * them. So, fw_devlink no longer needs to create device links to any
+	 * of the device's suppliers.
+	 *
+	 * Also, if a child firmware node of this bound device is not added as
+	 * a device by now, assume it is never going to be added and make sure
+	 * other devices don't defer probe indefinitely by waiting for such a
+	 * child device.
 	 */
-	if (dev->fwnode && dev->fwnode->dev == dev)
+	if (dev->fwnode && dev->fwnode->dev == dev) {
+		struct fwnode_handle *child;
 		fwnode_links_purge_suppliers(dev->fwnode);
+		fwnode_for_each_available_child_node(dev->fwnode, child)
+			fw_devlink_purge_absent_suppliers(child);
+	}
 	device_remove_file(dev, &dev_attr_waiting_for_supplier);
 
 	device_links_write_lock();
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index fde4ad97564c..21082f11473f 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -19,8 +19,10 @@ struct device;
  * fwnode link flags
  *
  * LINKS_ADDED: The fwnode has already be parsed to add fwnode links.
+ * NOT_DEVICE: The fwnode will never be populated as a struct device.
  */
 #define FWNODE_FLAG_LINKS_ADDED		BIT(0)
+#define FWNODE_FLAG_NOT_DEVICE		BIT(1)
 
 struct fwnode_handle {
 	struct fwnode_handle *secondary;
-- 
cgit v1.2.3


From 19d0f5f6bff878277783fd98fef4ae2441d6a1d8 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Fri, 5 Feb 2021 14:26:39 -0800
Subject: driver core: Add fw_devlink.strict kernel param

This param allows forcing all dependencies to be treated as mandatory.
This will be useful for boards in which all optional dependencies like
IOMMUs and DMAs need to be treated as mandatory dependencies.

Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20210205222644.2357303-4-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  5 +++++
 drivers/base/core.c                             | 12 ++++++++++++
 include/linux/fwnode.h                          |  1 +
 3 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a10b545c2070..692b63644133 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1433,6 +1433,11 @@
 				to enforce probe and suspend/resume ordering.
 			rpm --	Like "on", but also use to order runtime PM.
 
+	fw_devlink.strict=<bool>
+			[KNL] Treat all inferred dependencies as mandatory
+			dependencies. This only applies for fw_devlink=on|rpm.
+			Format: <bool>
+
 	gamecon.map[2|3]=
 			[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
 			support via parallel port (up to 5 devices per port)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index c95b1daabac7..f466ab4f1c35 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1521,6 +1521,13 @@ static int __init fw_devlink_setup(char *arg)
 }
 early_param("fw_devlink", fw_devlink_setup);
 
+static bool fw_devlink_strict;
+static int __init fw_devlink_strict_setup(char *arg)
+{
+	return strtobool(arg, &fw_devlink_strict);
+}
+early_param("fw_devlink.strict", fw_devlink_strict_setup);
+
 u32 fw_devlink_get_flags(void)
 {
 	return fw_devlink_flags;
@@ -1531,6 +1538,11 @@ static bool fw_devlink_is_permissive(void)
 	return fw_devlink_flags == FW_DEVLINK_FLAGS_PERMISSIVE;
 }
 
+bool fw_devlink_is_strict(void)
+{
+	return fw_devlink_strict && !fw_devlink_is_permissive();
+}
+
 static void fw_devlink_parse_fwnode(struct fwnode_handle *fwnode)
 {
 	if (fwnode->flags & FWNODE_FLAG_LINKS_ADDED)
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index 21082f11473f..d5caefe39d93 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -162,6 +162,7 @@ static inline void fwnode_init(struct fwnode_handle *fwnode,
 }
 
 extern u32 fw_devlink_get_flags(void);
+extern bool fw_devlink_is_strict(void);
 int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup);
 void fwnode_links_purge(struct fwnode_handle *fwnode);
 
-- 
cgit v1.2.3


From 74c782cff77b3533290148df1fa6f8c7db5e60d5 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Fri, 5 Feb 2021 14:26:41 -0800
Subject: driver core: fw_devlink: Handle suppliers that don't use driver core

Device links only work between devices that use the driver core to match
and bind a driver to a device. So, add an API for frameworks to let the
driver core know that a fwnode has been initialized by a driver without
using the driver core.

Then use this information to make sure that fw_devlink doesn't make the
consumers wait indefinitely on suppliers that'll never bind to a driver.

Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20210205222644.2357303-6-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 15 +++++++++++++++
 include/linux/fwnode.h | 19 +++++++++++++++++--
 2 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index f466ab4f1c35..ea710b33bda6 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1636,6 +1636,17 @@ static int fw_devlink_create_devlink(struct device *con,
 
 	sup_dev = get_dev_from_fwnode(sup_handle);
 	if (sup_dev) {
+		/*
+		 * If it's one of those drivers that don't actually bind to
+		 * their device using driver core, then don't wait on this
+		 * supplier device indefinitely.
+		 */
+		if (sup_dev->links.status == DL_DEV_NO_DRIVER &&
+		    sup_handle->flags & FWNODE_FLAG_INITIALIZED) {
+			ret = -EINVAL;
+			goto out;
+		}
+
 		/*
 		 * If this fails, it is due to cycles in device links.  Just
 		 * give up on this link and treat it as invalid.
@@ -1655,6 +1666,10 @@ static int fw_devlink_create_devlink(struct device *con,
 		goto out;
 	}
 
+	/* Supplier that's already initialized without a struct device. */
+	if (sup_handle->flags & FWNODE_FLAG_INITIALIZED)
+		return -EINVAL;
+
 	/*
 	 * DL_FLAG_SYNC_STATE_ONLY doesn't block probing and supports
 	 * cycles. So cycle detection isn't necessary and shouldn't be
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index d5caefe39d93..dfefd43a737c 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -11,6 +11,7 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/err.h>
 
 struct fwnode_operations;
 struct device;
@@ -18,11 +19,13 @@ struct device;
 /*
  * fwnode link flags
  *
- * LINKS_ADDED: The fwnode has already be parsed to add fwnode links.
- * NOT_DEVICE: The fwnode will never be populated as a struct device.
+ * LINKS_ADDED:	The fwnode has already be parsed to add fwnode links.
+ * NOT_DEVICE:	The fwnode will never be populated as a struct device.
+ * INITIALIZED: The hardware corresponding to fwnode has been initialized.
  */
 #define FWNODE_FLAG_LINKS_ADDED		BIT(0)
 #define FWNODE_FLAG_NOT_DEVICE		BIT(1)
+#define FWNODE_FLAG_INITIALIZED		BIT(2)
 
 struct fwnode_handle {
 	struct fwnode_handle *secondary;
@@ -161,6 +164,18 @@ static inline void fwnode_init(struct fwnode_handle *fwnode,
 	INIT_LIST_HEAD(&fwnode->suppliers);
 }
 
+static inline void fwnode_dev_initialized(struct fwnode_handle *fwnode,
+					  bool initialized)
+{
+	if (IS_ERR_OR_NULL(fwnode))
+		return;
+
+	if (initialized)
+		fwnode->flags |= FWNODE_FLAG_INITIALIZED;
+	else
+		fwnode->flags &= ~FWNODE_FLAG_INITIALIZED;
+}
+
 extern u32 fw_devlink_get_flags(void);
 extern bool fw_devlink_is_strict(void);
 int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup);
-- 
cgit v1.2.3


From ef3c67b6454b8f542f50387ad481633ae30874ac Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 25 Jan 2021 21:39:48 +0200
Subject: mfd: intel_msic: Remove driver for deprecated platform

Intel Moorestown and Medfield are quite old Intel Atom based
32-bit platforms, which were in limited use in some Android phones,
tablets and consumer electronics more than eight years ago.

There are no bugs or problems ever reported outside from Intel
for breaking any of that platforms for years. It seems no real
users exists who run more or less fresh kernel on it. Commit
05f4434bc130 ("ASoC: Intel: remove mfld_machine") is also in align
with this theory.

Due to above and to reduce a burden of supporting outdated drivers,
remove the support for outdated platforms completely.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 MAINTAINERS                                 |   2 -
 arch/x86/include/asm/intel_scu_ipc_legacy.h |  12 -
 drivers/mfd/Kconfig                         |   9 -
 drivers/mfd/Makefile                        |   1 -
 drivers/mfd/intel_msic.c                    | 425 --------------------------
 include/linux/mfd/intel_msic.h              | 453 ----------------------------
 6 files changed, 902 deletions(-)
 delete mode 100644 drivers/mfd/intel_msic.c
 delete mode 100644 include/linux/mfd/intel_msic.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 904eaa52f5f8..8fdb7fe27537 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9098,9 +9098,7 @@ F:	drivers/gpio/gpio-*cove.c
 INTEL PMIC MULTIFUNCTION DEVICE DRIVERS
 M:	Andy Shevchenko <andy@kernel.org>
 S:	Maintained
-F:	drivers/mfd/intel_msic.c
 F:	drivers/mfd/intel_soc_pmic*
-F:	include/linux/mfd/intel_msic.h
 F:	include/linux/mfd/intel_soc_pmic*
 
 INTEL PMT DRIVER
diff --git a/arch/x86/include/asm/intel_scu_ipc_legacy.h b/arch/x86/include/asm/intel_scu_ipc_legacy.h
index 2232197c24f8..f8b87d8face7 100644
--- a/arch/x86/include/asm/intel_scu_ipc_legacy.h
+++ b/arch/x86/include/asm/intel_scu_ipc_legacy.h
@@ -16,24 +16,12 @@
 
 /* Don't call these in new code - they will be removed eventually */
 
-/* Read single register */
-static inline int intel_scu_ipc_ioread8(u16 addr, u8 *data)
-{
-	return intel_scu_ipc_dev_ioread8(NULL, addr, data);
-}
-
 /* Read a vector */
 static inline int intel_scu_ipc_readv(u16 *addr, u8 *data, int len)
 {
 	return intel_scu_ipc_dev_readv(NULL, addr, data, len);
 }
 
-/* Write single register */
-static inline int intel_scu_ipc_iowrite8(u16 addr, u8 data)
-{
-	return intel_scu_ipc_dev_iowrite8(NULL, addr, data);
-}
-
 /* Write a vector */
 static inline int intel_scu_ipc_writev(u16 *addr, u8 *data, int len)
 {
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index bdfce7b15621..2dc58a92251b 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -659,15 +659,6 @@ config MFD_INTEL_LPSS_PCI
 	  I2C, SPI and HS-UART starting from Intel Sunrisepoint (Intel Skylake
 	  PCH) in PCI mode.
 
-config MFD_INTEL_MSIC
-	bool "Intel MSIC"
-	depends on INTEL_SCU
-	select MFD_CORE
-	help
-	  Select this option to enable access to Intel MSIC (Avatele
-	  Passage) chip. This chip embeds audio, battery, GPIO, etc.
-	  devices used in Intel Medfield platforms.
-
 config MFD_INTEL_PMC_BXT
 	tristate "Intel PMC Driver for Broxton"
 	depends on X86
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 14fdb188af02..be19bc799073 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -214,7 +214,6 @@ obj-$(CONFIG_MFD_ATMEL_SMC)	+= atmel-smc.o
 obj-$(CONFIG_MFD_INTEL_LPSS)	+= intel-lpss.o
 obj-$(CONFIG_MFD_INTEL_LPSS_PCI)	+= intel-lpss-pci.o
 obj-$(CONFIG_MFD_INTEL_LPSS_ACPI)	+= intel-lpss-acpi.o
-obj-$(CONFIG_MFD_INTEL_MSIC)	+= intel_msic.o
 obj-$(CONFIG_MFD_INTEL_PMC_BXT)	+= intel_pmc_bxt.o
 obj-$(CONFIG_MFD_INTEL_PMT)	+= intel_pmt.o
 obj-$(CONFIG_MFD_PALMAS)	+= palmas.o
diff --git a/drivers/mfd/intel_msic.c b/drivers/mfd/intel_msic.c
deleted file mode 100644
index daa772f8146b..000000000000
--- a/drivers/mfd/intel_msic.c
+++ /dev/null
@@ -1,425 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Driver for Intel MSIC
- *
- * Copyright (C) 2011, Intel Corporation
- * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- */
-
-#include <linux/err.h>
-#include <linux/gpio.h>
-#include <linux/io.h>
-#include <linux/init.h>
-#include <linux/mfd/core.h>
-#include <linux/mfd/intel_msic.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-
-#include <asm/intel_scu_ipc.h>
-
-#define MSIC_VENDOR(id)		((id >> 6) & 3)
-#define MSIC_VERSION(id)	(id & 0x3f)
-#define MSIC_MAJOR(id)		('A' + ((id >> 3) & 7))
-#define MSIC_MINOR(id)		(id & 7)
-
-/*
- * MSIC interrupt tree is readable from SRAM at INTEL_MSIC_IRQ_PHYS_BASE.
- * Since IRQ block starts from address 0x002 we need to subtract that from
- * the actual IRQ status register address.
- */
-#define MSIC_IRQ_STATUS(x)	(INTEL_MSIC_IRQ_PHYS_BASE + ((x) - 2))
-#define MSIC_IRQ_STATUS_ACCDET	MSIC_IRQ_STATUS(INTEL_MSIC_ACCDET)
-
-/*
- * The SCU hardware has limitation of 16 bytes per read/write buffer on
- * Medfield.
- */
-#define SCU_IPC_RWBUF_LIMIT	16
-
-/**
- * struct intel_msic - an MSIC MFD instance
- * @pdev: pointer to the platform device
- * @vendor: vendor ID
- * @version: chip version
- * @irq_base: base address of the mapped MSIC SRAM interrupt tree
- */
-struct intel_msic {
-	struct platform_device		*pdev;
-	unsigned			vendor;
-	unsigned			version;
-	void __iomem			*irq_base;
-};
-
-static const struct resource msic_touch_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_adc_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_battery_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_gpio_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_audio_resources[] = {
-	DEFINE_RES_IRQ_NAMED(0, "IRQ"),
-	/*
-	 * We will pass IRQ_BASE to the driver now but this can be removed
-	 * when/if the driver starts to use intel_msic_irq_read().
-	 */
-	DEFINE_RES_MEM_NAMED(MSIC_IRQ_STATUS_ACCDET, 1, "IRQ_BASE"),
-};
-
-static const struct resource msic_hdmi_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_thermal_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_power_btn_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_ocd_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-/*
- * Devices that are part of the MSIC and are available via firmware
- * populated SFI DEVS table.
- */
-static struct mfd_cell msic_devs[] = {
-	[INTEL_MSIC_BLOCK_TOUCH]	= {
-		.name			= "msic_touch",
-		.num_resources		= ARRAY_SIZE(msic_touch_resources),
-		.resources		= msic_touch_resources,
-	},
-	[INTEL_MSIC_BLOCK_ADC]		= {
-		.name			= "msic_adc",
-		.num_resources		= ARRAY_SIZE(msic_adc_resources),
-		.resources		= msic_adc_resources,
-	},
-	[INTEL_MSIC_BLOCK_BATTERY]	= {
-		.name			= "msic_battery",
-		.num_resources		= ARRAY_SIZE(msic_battery_resources),
-		.resources		= msic_battery_resources,
-	},
-	[INTEL_MSIC_BLOCK_GPIO]		= {
-		.name			= "msic_gpio",
-		.num_resources		= ARRAY_SIZE(msic_gpio_resources),
-		.resources		= msic_gpio_resources,
-	},
-	[INTEL_MSIC_BLOCK_AUDIO]	= {
-		.name			= "msic_audio",
-		.num_resources		= ARRAY_SIZE(msic_audio_resources),
-		.resources		= msic_audio_resources,
-	},
-	[INTEL_MSIC_BLOCK_HDMI]		= {
-		.name			= "msic_hdmi",
-		.num_resources		= ARRAY_SIZE(msic_hdmi_resources),
-		.resources		= msic_hdmi_resources,
-	},
-	[INTEL_MSIC_BLOCK_THERMAL]	= {
-		.name			= "msic_thermal",
-		.num_resources		= ARRAY_SIZE(msic_thermal_resources),
-		.resources		= msic_thermal_resources,
-	},
-	[INTEL_MSIC_BLOCK_POWER_BTN]	= {
-		.name			= "msic_power_btn",
-		.num_resources		= ARRAY_SIZE(msic_power_btn_resources),
-		.resources		= msic_power_btn_resources,
-	},
-	[INTEL_MSIC_BLOCK_OCD]		= {
-		.name			= "msic_ocd",
-		.num_resources		= ARRAY_SIZE(msic_ocd_resources),
-		.resources		= msic_ocd_resources,
-	},
-};
-
-/*
- * Other MSIC related devices which are not directly available via SFI DEVS
- * table. These can be pseudo devices, regulators etc. which are needed for
- * different purposes.
- *
- * These devices appear only after the MSIC driver itself is initialized so
- * we can guarantee that the SCU IPC interface is ready.
- */
-static const struct mfd_cell msic_other_devs[] = {
-	/* Audio codec in the MSIC */
-	{
-		.id			= -1,
-		.name			= "sn95031",
-	},
-};
-
-/**
- * intel_msic_reg_read - read a single MSIC register
- * @reg: register to read
- * @val: register value is placed here
- *
- * Read a single register from MSIC. Returns %0 on success and negative
- * errno in case of failure.
- *
- * Function may sleep.
- */
-int intel_msic_reg_read(unsigned short reg, u8 *val)
-{
-	return intel_scu_ipc_ioread8(reg, val);
-}
-EXPORT_SYMBOL_GPL(intel_msic_reg_read);
-
-/**
- * intel_msic_reg_write - write a single MSIC register
- * @reg: register to write
- * @val: value to write to that register
- *
- * Write a single MSIC register. Returns 0 on success and negative
- * errno in case of failure.
- *
- * Function may sleep.
- */
-int intel_msic_reg_write(unsigned short reg, u8 val)
-{
-	return intel_scu_ipc_iowrite8(reg, val);
-}
-EXPORT_SYMBOL_GPL(intel_msic_reg_write);
-
-/**
- * intel_msic_reg_update - update a single MSIC register
- * @reg: register to update
- * @val: value to write to the register
- * @mask: specifies which of the bits are updated (%0 = don't update,
- *        %1 = update)
- *
- * Perform an update to a register @reg. @mask is used to specify which
- * bits are updated. Returns %0 in case of success and negative errno in
- * case of failure.
- *
- * Function may sleep.
- */
-int intel_msic_reg_update(unsigned short reg, u8 val, u8 mask)
-{
-	return intel_scu_ipc_update_register(reg, val, mask);
-}
-EXPORT_SYMBOL_GPL(intel_msic_reg_update);
-
-/**
- * intel_msic_bulk_read - read an array of registers
- * @reg: array of register addresses to read
- * @buf: array where the read values are placed
- * @count: number of registers to read
- *
- * Function reads @count registers from the MSIC using addresses passed in
- * @reg. Read values are placed in @buf. Reads are performed atomically
- * wrt. MSIC.
- *
- * Returns %0 in case of success and negative errno in case of failure.
- *
- * Function may sleep.
- */
-int intel_msic_bulk_read(unsigned short *reg, u8 *buf, size_t count)
-{
-	if (WARN_ON(count > SCU_IPC_RWBUF_LIMIT))
-		return -EINVAL;
-
-	return intel_scu_ipc_readv(reg, buf, count);
-}
-EXPORT_SYMBOL_GPL(intel_msic_bulk_read);
-
-/**
- * intel_msic_bulk_write - write an array of values to the MSIC registers
- * @reg: array of registers to write
- * @buf: values to write to each register
- * @count: number of registers to write
- *
- * Function writes @count registers in @buf to MSIC. Writes are performed
- * atomically wrt MSIC. Returns %0 in case of success and negative errno in
- * case of failure.
- *
- * Function may sleep.
- */
-int intel_msic_bulk_write(unsigned short *reg, u8 *buf, size_t count)
-{
-	if (WARN_ON(count > SCU_IPC_RWBUF_LIMIT))
-		return -EINVAL;
-
-	return intel_scu_ipc_writev(reg, buf, count);
-}
-EXPORT_SYMBOL_GPL(intel_msic_bulk_write);
-
-/**
- * intel_msic_irq_read - read a register from an MSIC interrupt tree
- * @msic: MSIC instance
- * @reg: interrupt register (between %INTEL_MSIC_IRQLVL1 and
- *	 %INTEL_MSIC_RESETIRQ2)
- * @val: value of the register is placed here
- *
- * This function can be used by an MSIC subdevice interrupt handler to read
- * a register value from the MSIC interrupt tree. In this way subdevice
- * drivers don't have to map in the interrupt tree themselves but can just
- * call this function instead.
- *
- * Function doesn't sleep and is callable from interrupt context.
- *
- * Returns %-EINVAL if @reg is outside of the allowed register region.
- */
-int intel_msic_irq_read(struct intel_msic *msic, unsigned short reg, u8 *val)
-{
-	if (WARN_ON(reg < INTEL_MSIC_IRQLVL1 || reg > INTEL_MSIC_RESETIRQ2))
-		return -EINVAL;
-
-	*val = readb(msic->irq_base + (reg - INTEL_MSIC_IRQLVL1));
-	return 0;
-}
-EXPORT_SYMBOL_GPL(intel_msic_irq_read);
-
-static int intel_msic_init_devices(struct intel_msic *msic)
-{
-	struct platform_device *pdev = msic->pdev;
-	struct intel_msic_platform_data *pdata = dev_get_platdata(&pdev->dev);
-	int ret, i;
-
-	if (pdata->gpio) {
-		struct mfd_cell *cell = &msic_devs[INTEL_MSIC_BLOCK_GPIO];
-
-		cell->platform_data = pdata->gpio;
-		cell->pdata_size = sizeof(*pdata->gpio);
-	}
-
-	if (pdata->ocd) {
-		unsigned gpio = pdata->ocd->gpio;
-
-		ret = devm_gpio_request_one(&pdev->dev, gpio,
-					GPIOF_IN, "ocd_gpio");
-		if (ret) {
-			dev_err(&pdev->dev, "failed to register OCD GPIO\n");
-			return ret;
-		}
-
-		ret = gpio_to_irq(gpio);
-		if (ret < 0) {
-			dev_err(&pdev->dev, "no IRQ number for OCD GPIO\n");
-			return ret;
-		}
-
-		/* Update the IRQ number for the OCD */
-		pdata->irq[INTEL_MSIC_BLOCK_OCD] = ret;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(msic_devs); i++) {
-		if (!pdata->irq[i])
-			continue;
-
-		ret = mfd_add_devices(&pdev->dev, -1, &msic_devs[i], 1, NULL,
-				      pdata->irq[i], NULL);
-		if (ret)
-			goto fail;
-	}
-
-	ret = mfd_add_devices(&pdev->dev, 0, msic_other_devs,
-			      ARRAY_SIZE(msic_other_devs), NULL, 0, NULL);
-	if (ret)
-		goto fail;
-
-	return 0;
-
-fail:
-	mfd_remove_devices(&pdev->dev);
-
-	return ret;
-}
-
-static void intel_msic_remove_devices(struct intel_msic *msic)
-{
-	struct platform_device *pdev = msic->pdev;
-
-	mfd_remove_devices(&pdev->dev);
-}
-
-static int intel_msic_probe(struct platform_device *pdev)
-{
-	struct intel_msic_platform_data *pdata = dev_get_platdata(&pdev->dev);
-	struct intel_msic *msic;
-	struct resource *res;
-	u8 id0, id1;
-	int ret;
-
-	if (!pdata) {
-		dev_err(&pdev->dev, "no platform data passed\n");
-		return -EINVAL;
-	}
-
-	/* First validate that we have an MSIC in place */
-	ret = intel_scu_ipc_ioread8(INTEL_MSIC_ID0, &id0);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to identify the MSIC chip (ID0)\n");
-		return -ENXIO;
-	}
-
-	ret = intel_scu_ipc_ioread8(INTEL_MSIC_ID1, &id1);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to identify the MSIC chip (ID1)\n");
-		return -ENXIO;
-	}
-
-	if (MSIC_VENDOR(id0) != MSIC_VENDOR(id1)) {
-		dev_err(&pdev->dev, "invalid vendor ID: %x, %x\n", id0, id1);
-		return -ENXIO;
-	}
-
-	msic = devm_kzalloc(&pdev->dev, sizeof(*msic), GFP_KERNEL);
-	if (!msic)
-		return -ENOMEM;
-
-	msic->vendor = MSIC_VENDOR(id0);
-	msic->version = MSIC_VERSION(id0);
-	msic->pdev = pdev;
-
-	/*
-	 * Map in the MSIC interrupt tree area in SRAM. This is exposed to
-	 * the clients via intel_msic_irq_read().
-	 */
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	msic->irq_base = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(msic->irq_base))
-		return PTR_ERR(msic->irq_base);
-
-	platform_set_drvdata(pdev, msic);
-
-	ret = intel_msic_init_devices(msic);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize MSIC devices\n");
-		return ret;
-	}
-
-	dev_info(&pdev->dev, "Intel MSIC version %c%d (vendor %#x)\n",
-		 MSIC_MAJOR(msic->version), MSIC_MINOR(msic->version),
-		 msic->vendor);
-
-	return 0;
-}
-
-static int intel_msic_remove(struct platform_device *pdev)
-{
-	struct intel_msic *msic = platform_get_drvdata(pdev);
-
-	intel_msic_remove_devices(msic);
-
-	return 0;
-}
-
-static struct platform_driver intel_msic_driver = {
-	.probe		= intel_msic_probe,
-	.remove		= intel_msic_remove,
-	.driver		= {
-		.name	= "intel_msic",
-	},
-};
-builtin_platform_driver(intel_msic_driver);
diff --git a/include/linux/mfd/intel_msic.h b/include/linux/mfd/intel_msic.h
deleted file mode 100644
index 317e8608cf41..000000000000
--- a/include/linux/mfd/intel_msic.h
+++ /dev/null
@@ -1,453 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Core interface for Intel MSIC
- *
- * Copyright (C) 2011, Intel Corporation
- * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- */
-
-#ifndef __LINUX_MFD_INTEL_MSIC_H__
-#define __LINUX_MFD_INTEL_MSIC_H__
-
-/* ID */
-#define INTEL_MSIC_ID0			0x000	/* RO */
-#define INTEL_MSIC_ID1			0x001	/* RO */
-
-/* IRQ */
-#define INTEL_MSIC_IRQLVL1		0x002
-#define INTEL_MSIC_ADC1INT		0x003
-#define INTEL_MSIC_CCINT		0x004
-#define INTEL_MSIC_PWRSRCINT		0x005
-#define INTEL_MSIC_PWRSRCINT1		0x006
-#define INTEL_MSIC_CHRINT		0x007
-#define INTEL_MSIC_CHRINT1		0x008
-#define INTEL_MSIC_RTCIRQ		0x009
-#define INTEL_MSIC_GPIO0LVIRQ		0x00a
-#define INTEL_MSIC_GPIO1LVIRQ		0x00b
-#define INTEL_MSIC_GPIOHVIRQ		0x00c
-#define INTEL_MSIC_VRINT		0x00d
-#define INTEL_MSIC_OCAUDIO		0x00e
-#define INTEL_MSIC_ACCDET		0x00f
-#define INTEL_MSIC_RESETIRQ1		0x010
-#define INTEL_MSIC_RESETIRQ2		0x011
-#define INTEL_MSIC_MADC1INT		0x012
-#define INTEL_MSIC_MCCINT		0x013
-#define INTEL_MSIC_MPWRSRCINT		0x014
-#define INTEL_MSIC_MPWRSRCINT1		0x015
-#define INTEL_MSIC_MCHRINT		0x016
-#define INTEL_MSIC_MCHRINT1		0x017
-#define INTEL_MSIC_RTCIRQMASK		0x018
-#define INTEL_MSIC_GPIO0LVIRQMASK	0x019
-#define INTEL_MSIC_GPIO1LVIRQMASK	0x01a
-#define INTEL_MSIC_GPIOHVIRQMASK	0x01b
-#define INTEL_MSIC_VRINTMASK		0x01c
-#define INTEL_MSIC_OCAUDIOMASK		0x01d
-#define INTEL_MSIC_ACCDETMASK		0x01e
-#define INTEL_MSIC_RESETIRQ1MASK	0x01f
-#define INTEL_MSIC_RESETIRQ2MASK	0x020
-#define INTEL_MSIC_IRQLVL1MSK		0x021
-#define INTEL_MSIC_PBCONFIG		0x03e
-#define INTEL_MSIC_PBSTATUS		0x03f	/* RO */
-
-/* GPIO */
-#define INTEL_MSIC_GPIO0LV7CTLO		0x040
-#define INTEL_MSIC_GPIO0LV6CTLO		0x041
-#define INTEL_MSIC_GPIO0LV5CTLO		0x042
-#define INTEL_MSIC_GPIO0LV4CTLO		0x043
-#define INTEL_MSIC_GPIO0LV3CTLO		0x044
-#define INTEL_MSIC_GPIO0LV2CTLO		0x045
-#define INTEL_MSIC_GPIO0LV1CTLO		0x046
-#define INTEL_MSIC_GPIO0LV0CTLO		0x047
-#define INTEL_MSIC_GPIO1LV7CTLOS	0x048
-#define INTEL_MSIC_GPIO1LV6CTLO		0x049
-#define INTEL_MSIC_GPIO1LV5CTLO		0x04a
-#define INTEL_MSIC_GPIO1LV4CTLO		0x04b
-#define INTEL_MSIC_GPIO1LV3CTLO		0x04c
-#define INTEL_MSIC_GPIO1LV2CTLO		0x04d
-#define INTEL_MSIC_GPIO1LV1CTLO		0x04e
-#define INTEL_MSIC_GPIO1LV0CTLO		0x04f
-#define INTEL_MSIC_GPIO0LV7CTLI		0x050
-#define INTEL_MSIC_GPIO0LV6CTLI		0x051
-#define INTEL_MSIC_GPIO0LV5CTLI		0x052
-#define INTEL_MSIC_GPIO0LV4CTLI		0x053
-#define INTEL_MSIC_GPIO0LV3CTLI		0x054
-#define INTEL_MSIC_GPIO0LV2CTLI		0x055
-#define INTEL_MSIC_GPIO0LV1CTLI		0x056
-#define INTEL_MSIC_GPIO0LV0CTLI		0x057
-#define INTEL_MSIC_GPIO1LV7CTLIS	0x058
-#define INTEL_MSIC_GPIO1LV6CTLI		0x059
-#define INTEL_MSIC_GPIO1LV5CTLI		0x05a
-#define INTEL_MSIC_GPIO1LV4CTLI		0x05b
-#define INTEL_MSIC_GPIO1LV3CTLI		0x05c
-#define INTEL_MSIC_GPIO1LV2CTLI		0x05d
-#define INTEL_MSIC_GPIO1LV1CTLI		0x05e
-#define INTEL_MSIC_GPIO1LV0CTLI		0x05f
-#define INTEL_MSIC_PWM0CLKDIV1		0x061
-#define INTEL_MSIC_PWM0CLKDIV0		0x062
-#define INTEL_MSIC_PWM1CLKDIV1		0x063
-#define INTEL_MSIC_PWM1CLKDIV0		0x064
-#define INTEL_MSIC_PWM2CLKDIV1		0x065
-#define INTEL_MSIC_PWM2CLKDIV0		0x066
-#define INTEL_MSIC_PWM0DUTYCYCLE	0x067
-#define INTEL_MSIC_PWM1DUTYCYCLE	0x068
-#define INTEL_MSIC_PWM2DUTYCYCLE	0x069
-#define INTEL_MSIC_GPIO0HV3CTLO		0x06d
-#define INTEL_MSIC_GPIO0HV2CTLO		0x06e
-#define INTEL_MSIC_GPIO0HV1CTLO		0x06f
-#define INTEL_MSIC_GPIO0HV0CTLO		0x070
-#define INTEL_MSIC_GPIO1HV3CTLO		0x071
-#define INTEL_MSIC_GPIO1HV2CTLO		0x072
-#define INTEL_MSIC_GPIO1HV1CTLO		0x073
-#define INTEL_MSIC_GPIO1HV0CTLO		0x074
-#define INTEL_MSIC_GPIO0HV3CTLI		0x075
-#define INTEL_MSIC_GPIO0HV2CTLI		0x076
-#define INTEL_MSIC_GPIO0HV1CTLI		0x077
-#define INTEL_MSIC_GPIO0HV0CTLI		0x078
-#define INTEL_MSIC_GPIO1HV3CTLI		0x079
-#define INTEL_MSIC_GPIO1HV2CTLI		0x07a
-#define INTEL_MSIC_GPIO1HV1CTLI		0x07b
-#define INTEL_MSIC_GPIO1HV0CTLI		0x07c
-
-/* SVID */
-#define INTEL_MSIC_SVIDCTRL0		0x080
-#define INTEL_MSIC_SVIDCTRL1		0x081
-#define INTEL_MSIC_SVIDCTRL2		0x082
-#define INTEL_MSIC_SVIDTXLASTPKT3	0x083	/* RO */
-#define INTEL_MSIC_SVIDTXLASTPKT2	0x084	/* RO */
-#define INTEL_MSIC_SVIDTXLASTPKT1	0x085	/* RO */
-#define INTEL_MSIC_SVIDTXLASTPKT0	0x086	/* RO */
-#define INTEL_MSIC_SVIDPKTOUTBYTE3	0x087
-#define INTEL_MSIC_SVIDPKTOUTBYTE2	0x088
-#define INTEL_MSIC_SVIDPKTOUTBYTE1	0x089
-#define INTEL_MSIC_SVIDPKTOUTBYTE0	0x08a
-#define INTEL_MSIC_SVIDRXVPDEBUG1	0x08b
-#define INTEL_MSIC_SVIDRXVPDEBUG0	0x08c
-#define INTEL_MSIC_SVIDRXLASTPKT3	0x08d	/* RO */
-#define INTEL_MSIC_SVIDRXLASTPKT2	0x08e	/* RO */
-#define INTEL_MSIC_SVIDRXLASTPKT1	0x08f	/* RO */
-#define INTEL_MSIC_SVIDRXLASTPKT0	0x090	/* RO */
-#define INTEL_MSIC_SVIDRXCHKSTATUS3	0x091	/* RO */
-#define INTEL_MSIC_SVIDRXCHKSTATUS2	0x092	/* RO */
-#define INTEL_MSIC_SVIDRXCHKSTATUS1	0x093	/* RO */
-#define INTEL_MSIC_SVIDRXCHKSTATUS0	0x094	/* RO */
-
-/* VREG */
-#define INTEL_MSIC_VCCLATCH		0x0c0
-#define INTEL_MSIC_VNNLATCH		0x0c1
-#define INTEL_MSIC_VCCCNT		0x0c2
-#define INTEL_MSIC_SMPSRAMP		0x0c3
-#define INTEL_MSIC_VNNCNT		0x0c4
-#define INTEL_MSIC_VNNAONCNT		0x0c5
-#define INTEL_MSIC_VCC122AONCNT		0x0c6
-#define INTEL_MSIC_V180AONCNT		0x0c7
-#define INTEL_MSIC_V500CNT		0x0c8
-#define INTEL_MSIC_VIHFCNT		0x0c9
-#define INTEL_MSIC_LDORAMP1		0x0ca
-#define INTEL_MSIC_LDORAMP2		0x0cb
-#define INTEL_MSIC_VCC108AONCNT		0x0cc
-#define INTEL_MSIC_VCC108ASCNT		0x0cd
-#define INTEL_MSIC_VCC108CNT		0x0ce
-#define INTEL_MSIC_VCCA100ASCNT		0x0cf
-#define INTEL_MSIC_VCCA100CNT		0x0d0
-#define INTEL_MSIC_VCC180AONCNT		0x0d1
-#define INTEL_MSIC_VCC180CNT		0x0d2
-#define INTEL_MSIC_VCC330CNT		0x0d3
-#define INTEL_MSIC_VUSB330CNT		0x0d4
-#define INTEL_MSIC_VCCSDIOCNT		0x0d5
-#define INTEL_MSIC_VPROG1CNT		0x0d6
-#define INTEL_MSIC_VPROG2CNT		0x0d7
-#define INTEL_MSIC_VEMMCSCNT		0x0d8
-#define INTEL_MSIC_VEMMC1CNT		0x0d9
-#define INTEL_MSIC_VEMMC2CNT		0x0da
-#define INTEL_MSIC_VAUDACNT		0x0db
-#define INTEL_MSIC_VHSPCNT		0x0dc
-#define INTEL_MSIC_VHSNCNT		0x0dd
-#define INTEL_MSIC_VHDMICNT		0x0de
-#define INTEL_MSIC_VOTGCNT		0x0df
-#define INTEL_MSIC_V1P35CNT		0x0e0
-#define INTEL_MSIC_V330AONCNT		0x0e1
-
-/* RESET */
-#define INTEL_MSIC_CHIPCNTRL		0x100	/* WO */
-#define INTEL_MSIC_ERCONFIG		0x101
-
-/* BURST */
-#define INTEL_MSIC_BATCURRENTLIMIT12	0x102
-#define INTEL_MSIC_BATTIMELIMIT12	0x103
-#define INTEL_MSIC_BATTIMELIMIT3	0x104
-#define INTEL_MSIC_BATTIMEDB		0x105
-#define INTEL_MSIC_BRSTCONFIGOUTPUTS	0x106
-#define INTEL_MSIC_BRSTCONFIGACTIONS	0x107
-#define INTEL_MSIC_BURSTCONTROLSTATUS	0x108
-
-/* RTC */
-#define INTEL_MSIC_RTCB1		0x140	/* RO */
-#define INTEL_MSIC_RTCB2		0x141	/* RO */
-#define INTEL_MSIC_RTCB3		0x142	/* RO */
-#define INTEL_MSIC_RTCB4		0x143	/* RO */
-#define INTEL_MSIC_RTCOB1		0x144
-#define INTEL_MSIC_RTCOB2		0x145
-#define INTEL_MSIC_RTCOB3		0x146
-#define INTEL_MSIC_RTCOB4		0x147
-#define INTEL_MSIC_RTCAB1		0x148
-#define INTEL_MSIC_RTCAB2		0x149
-#define INTEL_MSIC_RTCAB3		0x14a
-#define INTEL_MSIC_RTCAB4		0x14b
-#define INTEL_MSIC_RTCWAB1		0x14c
-#define INTEL_MSIC_RTCWAB2		0x14d
-#define INTEL_MSIC_RTCWAB3		0x14e
-#define INTEL_MSIC_RTCWAB4		0x14f
-#define INTEL_MSIC_RTCSC1		0x150
-#define INTEL_MSIC_RTCSC2		0x151
-#define INTEL_MSIC_RTCSC3		0x152
-#define INTEL_MSIC_RTCSC4		0x153
-#define INTEL_MSIC_RTCSTATUS		0x154	/* RO */
-#define INTEL_MSIC_RTCCONFIG1		0x155
-#define INTEL_MSIC_RTCCONFIG2		0x156
-
-/* CHARGER */
-#define INTEL_MSIC_BDTIMER		0x180
-#define INTEL_MSIC_BATTRMV		0x181
-#define INTEL_MSIC_VBUSDET		0x182
-#define INTEL_MSIC_VBUSDET1		0x183
-#define INTEL_MSIC_ADPHVDET		0x184
-#define INTEL_MSIC_ADPLVDET		0x185
-#define INTEL_MSIC_ADPDETDBDM		0x186
-#define INTEL_MSIC_LOWBATTDET		0x187
-#define INTEL_MSIC_CHRCTRL		0x188
-#define INTEL_MSIC_CHRCVOLTAGE		0x189
-#define INTEL_MSIC_CHRCCURRENT		0x18a
-#define INTEL_MSIC_SPCHARGER		0x18b
-#define INTEL_MSIC_CHRTTIME		0x18c
-#define INTEL_MSIC_CHRCTRL1		0x18d
-#define INTEL_MSIC_PWRSRCLMT		0x18e
-#define INTEL_MSIC_CHRSTWDT		0x18f
-#define INTEL_MSIC_WDTWRITE		0x190	/* WO */
-#define INTEL_MSIC_CHRSAFELMT		0x191
-#define INTEL_MSIC_SPWRSRCINT		0x192	/* RO */
-#define INTEL_MSIC_SPWRSRCINT1		0x193	/* RO */
-#define INTEL_MSIC_CHRLEDPWM		0x194
-#define INTEL_MSIC_CHRLEDCTRL		0x195
-
-/* ADC */
-#define INTEL_MSIC_ADC1CNTL1		0x1c0
-#define INTEL_MSIC_ADC1CNTL2		0x1c1
-#define INTEL_MSIC_ADC1CNTL3		0x1c2
-#define INTEL_MSIC_ADC1OFFSETH		0x1c3	/* RO */
-#define INTEL_MSIC_ADC1OFFSETL		0x1c4	/* RO */
-#define INTEL_MSIC_ADC1ADDR0		0x1c5
-#define INTEL_MSIC_ADC1ADDR1		0x1c6
-#define INTEL_MSIC_ADC1ADDR2		0x1c7
-#define INTEL_MSIC_ADC1ADDR3		0x1c8
-#define INTEL_MSIC_ADC1ADDR4		0x1c9
-#define INTEL_MSIC_ADC1ADDR5		0x1ca
-#define INTEL_MSIC_ADC1ADDR6		0x1cb
-#define INTEL_MSIC_ADC1ADDR7		0x1cc
-#define INTEL_MSIC_ADC1ADDR8		0x1cd
-#define INTEL_MSIC_ADC1ADDR9		0x1ce
-#define INTEL_MSIC_ADC1ADDR10		0x1cf
-#define INTEL_MSIC_ADC1ADDR11		0x1d0
-#define INTEL_MSIC_ADC1ADDR12		0x1d1
-#define INTEL_MSIC_ADC1ADDR13		0x1d2
-#define INTEL_MSIC_ADC1ADDR14		0x1d3
-#define INTEL_MSIC_ADC1SNS0H		0x1d4	/* RO */
-#define INTEL_MSIC_ADC1SNS0L		0x1d5	/* RO */
-#define INTEL_MSIC_ADC1SNS1H		0x1d6	/* RO */
-#define INTEL_MSIC_ADC1SNS1L		0x1d7	/* RO */
-#define INTEL_MSIC_ADC1SNS2H		0x1d8	/* RO */
-#define INTEL_MSIC_ADC1SNS2L		0x1d9	/* RO */
-#define INTEL_MSIC_ADC1SNS3H		0x1da	/* RO */
-#define INTEL_MSIC_ADC1SNS3L		0x1db	/* RO */
-#define INTEL_MSIC_ADC1SNS4H		0x1dc	/* RO */
-#define INTEL_MSIC_ADC1SNS4L		0x1dd	/* RO */
-#define INTEL_MSIC_ADC1SNS5H		0x1de	/* RO */
-#define INTEL_MSIC_ADC1SNS5L		0x1df	/* RO */
-#define INTEL_MSIC_ADC1SNS6H		0x1e0	/* RO */
-#define INTEL_MSIC_ADC1SNS6L		0x1e1	/* RO */
-#define INTEL_MSIC_ADC1SNS7H		0x1e2	/* RO */
-#define INTEL_MSIC_ADC1SNS7L		0x1e3	/* RO */
-#define INTEL_MSIC_ADC1SNS8H		0x1e4	/* RO */
-#define INTEL_MSIC_ADC1SNS8L		0x1e5	/* RO */
-#define INTEL_MSIC_ADC1SNS9H		0x1e6	/* RO */
-#define INTEL_MSIC_ADC1SNS9L		0x1e7	/* RO */
-#define INTEL_MSIC_ADC1SNS10H		0x1e8	/* RO */
-#define INTEL_MSIC_ADC1SNS10L		0x1e9	/* RO */
-#define INTEL_MSIC_ADC1SNS11H		0x1ea	/* RO */
-#define INTEL_MSIC_ADC1SNS11L		0x1eb	/* RO */
-#define INTEL_MSIC_ADC1SNS12H		0x1ec	/* RO */
-#define INTEL_MSIC_ADC1SNS12L		0x1ed	/* RO */
-#define INTEL_MSIC_ADC1SNS13H		0x1ee	/* RO */
-#define INTEL_MSIC_ADC1SNS13L		0x1ef	/* RO */
-#define INTEL_MSIC_ADC1SNS14H		0x1f0	/* RO */
-#define INTEL_MSIC_ADC1SNS14L		0x1f1	/* RO */
-#define INTEL_MSIC_ADC1BV0H		0x1f2	/* RO */
-#define INTEL_MSIC_ADC1BV0L		0x1f3	/* RO */
-#define INTEL_MSIC_ADC1BV1H		0x1f4	/* RO */
-#define INTEL_MSIC_ADC1BV1L		0x1f5	/* RO */
-#define INTEL_MSIC_ADC1BV2H		0x1f6	/* RO */
-#define INTEL_MSIC_ADC1BV2L		0x1f7	/* RO */
-#define INTEL_MSIC_ADC1BV3H		0x1f8	/* RO */
-#define INTEL_MSIC_ADC1BV3L		0x1f9	/* RO */
-#define INTEL_MSIC_ADC1BI0H		0x1fa	/* RO */
-#define INTEL_MSIC_ADC1BI0L		0x1fb	/* RO */
-#define INTEL_MSIC_ADC1BI1H		0x1fc	/* RO */
-#define INTEL_MSIC_ADC1BI1L		0x1fd	/* RO */
-#define INTEL_MSIC_ADC1BI2H		0x1fe	/* RO */
-#define INTEL_MSIC_ADC1BI2L		0x1ff	/* RO */
-#define INTEL_MSIC_ADC1BI3H		0x200	/* RO */
-#define INTEL_MSIC_ADC1BI3L		0x201	/* RO */
-#define INTEL_MSIC_CCCNTL		0x202
-#define INTEL_MSIC_CCOFFSETH		0x203	/* RO */
-#define INTEL_MSIC_CCOFFSETL		0x204	/* RO */
-#define INTEL_MSIC_CCADCHA		0x205	/* RO */
-#define INTEL_MSIC_CCADCLA		0x206	/* RO */
-
-/* AUDIO */
-#define INTEL_MSIC_AUDPLLCTRL		0x240
-#define INTEL_MSIC_DMICBUF0123		0x241
-#define INTEL_MSIC_DMICBUF45		0x242
-#define INTEL_MSIC_DMICGPO		0x244
-#define INTEL_MSIC_DMICMUX		0x245
-#define INTEL_MSIC_DMICCLK		0x246
-#define INTEL_MSIC_MICBIAS		0x247
-#define INTEL_MSIC_ADCCONFIG		0x248
-#define INTEL_MSIC_MICAMP1		0x249
-#define INTEL_MSIC_MICAMP2		0x24a
-#define INTEL_MSIC_NOISEMUX		0x24b
-#define INTEL_MSIC_AUDIOMUX12		0x24c
-#define INTEL_MSIC_AUDIOMUX34		0x24d
-#define INTEL_MSIC_AUDIOSINC		0x24e
-#define INTEL_MSIC_AUDIOTXEN		0x24f
-#define INTEL_MSIC_HSEPRXCTRL		0x250
-#define INTEL_MSIC_IHFRXCTRL		0x251
-#define INTEL_MSIC_VOICETXVOL		0x252
-#define INTEL_MSIC_SIDETONEVOL		0x253
-#define INTEL_MSIC_MUSICSHARVOL		0x254
-#define INTEL_MSIC_VOICETXCTRL		0x255
-#define INTEL_MSIC_HSMIXER		0x256
-#define INTEL_MSIC_DACCONFIG		0x257
-#define INTEL_MSIC_SOFTMUTE		0x258
-#define INTEL_MSIC_HSLVOLCTRL		0x259
-#define INTEL_MSIC_HSRVOLCTRL		0x25a
-#define INTEL_MSIC_IHFLVOLCTRL		0x25b
-#define INTEL_MSIC_IHFRVOLCTRL		0x25c
-#define INTEL_MSIC_DRIVEREN		0x25d
-#define INTEL_MSIC_LINEOUTCTRL		0x25e
-#define INTEL_MSIC_VIB1CTRL1		0x25f
-#define INTEL_MSIC_VIB1CTRL2		0x260
-#define INTEL_MSIC_VIB1CTRL3		0x261
-#define INTEL_MSIC_VIB1SPIPCM_1		0x262
-#define INTEL_MSIC_VIB1SPIPCM_2		0x263
-#define INTEL_MSIC_VIB1CTRL5		0x264
-#define INTEL_MSIC_VIB2CTRL1		0x265
-#define INTEL_MSIC_VIB2CTRL2		0x266
-#define INTEL_MSIC_VIB2CTRL3		0x267
-#define INTEL_MSIC_VIB2SPIPCM_1		0x268
-#define INTEL_MSIC_VIB2SPIPCM_2		0x269
-#define INTEL_MSIC_VIB2CTRL5		0x26a
-#define INTEL_MSIC_BTNCTRL1		0x26b
-#define INTEL_MSIC_BTNCTRL2		0x26c
-#define INTEL_MSIC_PCM1TXSLOT01		0x26d
-#define INTEL_MSIC_PCM1TXSLOT23		0x26e
-#define INTEL_MSIC_PCM1TXSLOT45		0x26f
-#define INTEL_MSIC_PCM1RXSLOT0123	0x270
-#define INTEL_MSIC_PCM1RXSLOT045	0x271
-#define INTEL_MSIC_PCM2TXSLOT01		0x272
-#define INTEL_MSIC_PCM2TXSLOT23		0x273
-#define INTEL_MSIC_PCM2TXSLOT45		0x274
-#define INTEL_MSIC_PCM2RXSLOT01		0x275
-#define INTEL_MSIC_PCM2RXSLOT23		0x276
-#define INTEL_MSIC_PCM2RXSLOT45		0x277
-#define INTEL_MSIC_PCM1CTRL1		0x278
-#define INTEL_MSIC_PCM1CTRL2		0x279
-#define INTEL_MSIC_PCM1CTRL3		0x27a
-#define INTEL_MSIC_PCM2CTRL1		0x27b
-#define INTEL_MSIC_PCM2CTRL2		0x27c
-
-/* HDMI */
-#define INTEL_MSIC_HDMIPUEN		0x280
-#define INTEL_MSIC_HDMISTATUS		0x281	/* RO */
-
-/* Physical address of the start of the MSIC interrupt tree in SRAM */
-#define INTEL_MSIC_IRQ_PHYS_BASE	0xffff7fc0
-
-/**
- * struct intel_msic_gpio_pdata - platform data for the MSIC GPIO driver
- * @gpio_base: base number for the GPIOs
- */
-struct intel_msic_gpio_pdata {
-	unsigned	gpio_base;
-};
-
-/**
- * struct intel_msic_ocd_pdata - platform data for the MSIC OCD driver
- * @gpio: GPIO number used for OCD interrupts
- *
- * The MSIC MFD driver converts @gpio into an IRQ number and passes it to
- * the OCD driver as %IORESOURCE_IRQ.
- */
-struct intel_msic_ocd_pdata {
-	unsigned	gpio;
-};
-
-/* MSIC embedded blocks (subdevices) */
-enum intel_msic_block {
-	INTEL_MSIC_BLOCK_TOUCH,
-	INTEL_MSIC_BLOCK_ADC,
-	INTEL_MSIC_BLOCK_BATTERY,
-	INTEL_MSIC_BLOCK_GPIO,
-	INTEL_MSIC_BLOCK_AUDIO,
-	INTEL_MSIC_BLOCK_HDMI,
-	INTEL_MSIC_BLOCK_THERMAL,
-	INTEL_MSIC_BLOCK_POWER_BTN,
-	INTEL_MSIC_BLOCK_OCD,
-
-	INTEL_MSIC_BLOCK_LAST,
-};
-
-/**
- * struct intel_msic_platform_data - platform data for the MSIC driver
- * @irq: array of interrupt numbers, one per device. If @irq is set to %0
- *	 for a given block, the corresponding platform device is not
- *	 created. For devices which don't have an interrupt, use %0xff
- *	 (this is same as in SFI spec).
- * @gpio: platform data for the MSIC GPIO driver
- * @ocd: platform data for the MSIC OCD driver
- *
- * Once the MSIC driver is initialized, the register interface is ready to
- * use. All the platform devices for subdevices are created after the
- * register interface is ready so that we can guarantee its availability to
- * the subdevice drivers.
- *
- * Interrupt numbers are passed to the subdevices via %IORESOURCE_IRQ
- * resources of the created platform device.
- */
-struct intel_msic_platform_data {
-	int				irq[INTEL_MSIC_BLOCK_LAST];
-	struct intel_msic_gpio_pdata	*gpio;
-	struct intel_msic_ocd_pdata	*ocd;
-};
-
-struct intel_msic;
-
-extern int intel_msic_reg_read(unsigned short reg, u8 *val);
-extern int intel_msic_reg_write(unsigned short reg, u8 val);
-extern int intel_msic_reg_update(unsigned short reg, u8 val, u8 mask);
-extern int intel_msic_bulk_read(unsigned short *reg, u8 *buf, size_t count);
-extern int intel_msic_bulk_write(unsigned short *reg, u8 *buf, size_t count);
-
-/*
- * pdev_to_intel_msic - gets an MSIC instance from the platform device
- * @pdev: platform device pointer
- *
- * The client drivers need to have pointer to the MSIC instance if they
- * want to call intel_msic_irq_read(). This macro can be used for
- * convenience to get the MSIC pointer from @pdev where needed. This is
- * _only_ valid for devices which are managed by the MSIC.
- */
-#define pdev_to_intel_msic(pdev)	(dev_get_drvdata(pdev->dev.parent))
-
-extern int intel_msic_irq_read(struct intel_msic *msic, unsigned short reg,
-			       u8 *val);
-
-#endif /* __LINUX_MFD_INTEL_MSIC_H__ */
-- 
cgit v1.2.3


From 81d88ce55092edf1a1f928efb373f289c6b90efd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Nov 2020 16:38:40 +0100
Subject: dma-mapping: remove the {alloc,free}_noncoherent methods

It turns out allowing non-contigous allocations here was a rather bad
idea, as we'll now need to define ways to get the pages for mmaping
or dma_buf sharing.  Revert this change and stick to the original
concept.  A different API for the use case of non-contigous allocations
will be added back later.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Tested-by: Ricardo Ribalda <ribalda@chromium.org>:wq
---
 Documentation/core-api/dma-api.rst | 64 +++++++++++++-------------------------
 drivers/iommu/dma-iommu.c          | 30 ------------------
 include/linux/dma-map-ops.h        |  5 ---
 include/linux/dma-mapping.h        | 17 +++++++---
 kernel/dma/mapping.c               | 40 ------------------------
 5 files changed, 35 insertions(+), 121 deletions(-)

(limited to 'include')

diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst
index 75cb757bbff0..e6d23f117308 100644
--- a/Documentation/core-api/dma-api.rst
+++ b/Documentation/core-api/dma-api.rst
@@ -528,16 +528,14 @@ an I/O device, you should not be using this part of the API.
 
 ::
 
-	void *
-	dma_alloc_noncoherent(struct device *dev, size_t size,
-			dma_addr_t *dma_handle, enum dma_data_direction dir,
-			gfp_t gfp)
+	struct page *
+	dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle,
+			enum dma_data_direction dir, gfp_t gfp)
 
-This routine allocates a region of <size> bytes of consistent memory.  It
-returns a pointer to the allocated region (in the processor's virtual address
-space) or NULL if the allocation failed.  The returned memory may or may not
-be in the kernel direct mapping.  Drivers must not call virt_to_page on
-the returned memory region.
+This routine allocates a region of <size> bytes of non-coherent memory.  It
+returns a pointer to first struct page for the region, or NULL if the
+allocation failed. The resulting struct page can be used for everything a
+struct page is suitable for.
 
 It also returns a <dma_handle> which may be cast to an unsigned integer the
 same width as the bus and given to the device as the DMA address base of
@@ -558,51 +556,33 @@ reused.
 ::
 
 	void
-	dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
+	dma_free_pages(struct device *dev, size_t size, struct page *page,
 			dma_addr_t dma_handle, enum dma_data_direction dir)
 
-Free a region of memory previously allocated using dma_alloc_noncoherent().
-dev, size and dma_handle and dir must all be the same as those passed into
-dma_alloc_noncoherent().  cpu_addr must be the virtual address returned by
-dma_alloc_noncoherent().
+Free a region of memory previously allocated using dma_alloc_pages().
+dev, size, dma_handle and dir must all be the same as those passed into
+dma_alloc_pages().  page must be the pointer returned by dma_alloc_pages().
 
 ::
 
-	struct page *
-	dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle,
-			enum dma_data_direction dir, gfp_t gfp)
-
-This routine allocates a region of <size> bytes of non-coherent memory.  It
-returns a pointer to first struct page for the region, or NULL if the
-allocation failed. The resulting struct page can be used for everything a
-struct page is suitable for.
-
-It also returns a <dma_handle> which may be cast to an unsigned integer the
-same width as the bus and given to the device as the DMA address base of
-the region.
-
-The dir parameter specified if data is read and/or written by the device,
-see dma_map_single() for details.
-
-The gfp parameter allows the caller to specify the ``GFP_`` flags (see
-kmalloc()) for the allocation, but rejects flags used to specify a memory
-zone such as GFP_DMA or GFP_HIGHMEM.
+	void *
+	dma_alloc_noncoherent(struct device *dev, size_t size,
+			dma_addr_t *dma_handle, enum dma_data_direction dir,
+			gfp_t gfp)
 
-Before giving the memory to the device, dma_sync_single_for_device() needs
-to be called, and before reading memory written by the device,
-dma_sync_single_for_cpu(), just like for streaming DMA mappings that are
-reused.
+This routine is a convenient wrapper around dma_alloc_pages that returns the
+kernel virtual address for the allocated memory instead of the page structure.
 
 ::
 
 	void
-	dma_free_pages(struct device *dev, size_t size, struct page *page,
+	dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
 			dma_addr_t dma_handle, enum dma_data_direction dir)
 
-Free a region of memory previously allocated using dma_alloc_pages().
-dev, size and dma_handle and dir must all be the same as those passed into
-dma_alloc_noncoherent().  page must be the pointer returned by
-dma_alloc_pages().
+Free a region of memory previously allocated using dma_alloc_noncoherent().
+dev, size, dma_handle and dir must all be the same as those passed into
+dma_alloc_noncoherent().  cpu_addr must be the virtual address returned by
+dma_alloc_noncoherent().
 
 ::
 
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 4078358ed66e..255533faf905 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1197,34 +1197,6 @@ static void *iommu_dma_alloc(struct device *dev, size_t size,
 	return cpu_addr;
 }
 
-#ifdef CONFIG_DMA_REMAP
-static void *iommu_dma_alloc_noncoherent(struct device *dev, size_t size,
-		dma_addr_t *handle, enum dma_data_direction dir, gfp_t gfp)
-{
-	if (!gfpflags_allow_blocking(gfp)) {
-		struct page *page;
-
-		page = dma_common_alloc_pages(dev, size, handle, dir, gfp);
-		if (!page)
-			return NULL;
-		return page_address(page);
-	}
-
-	return iommu_dma_alloc_remap(dev, size, handle, gfp | __GFP_ZERO,
-				     PAGE_KERNEL, 0);
-}
-
-static void iommu_dma_free_noncoherent(struct device *dev, size_t size,
-		void *cpu_addr, dma_addr_t handle, enum dma_data_direction dir)
-{
-	__iommu_dma_unmap(dev, handle, size);
-	__iommu_dma_free(dev, size, cpu_addr);
-}
-#else
-#define iommu_dma_alloc_noncoherent		NULL
-#define iommu_dma_free_noncoherent		NULL
-#endif /* CONFIG_DMA_REMAP */
-
 static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 		void *cpu_addr, dma_addr_t dma_addr, size_t size,
 		unsigned long attrs)
@@ -1295,8 +1267,6 @@ static const struct dma_map_ops iommu_dma_ops = {
 	.free			= iommu_dma_free,
 	.alloc_pages		= dma_common_alloc_pages,
 	.free_pages		= dma_common_free_pages,
-	.alloc_noncoherent	= iommu_dma_alloc_noncoherent,
-	.free_noncoherent	= iommu_dma_free_noncoherent,
 	.mmap			= iommu_dma_mmap,
 	.get_sgtable		= iommu_dma_get_sgtable,
 	.map_page		= iommu_dma_map_page,
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 70fcd0f610ea..11e02537b9e0 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -22,11 +22,6 @@ struct dma_map_ops {
 			gfp_t gfp);
 	void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
 			dma_addr_t dma_handle, enum dma_data_direction dir);
-	void *(*alloc_noncoherent)(struct device *dev, size_t size,
-			dma_addr_t *dma_handle, enum dma_data_direction dir,
-			gfp_t gfp);
-	void (*free_noncoherent)(struct device *dev, size_t size, void *vaddr,
-			dma_addr_t dma_handle, enum dma_data_direction dir);
 	int (*mmap)(struct device *, struct vm_area_struct *,
 			void *, dma_addr_t, size_t, unsigned long attrs);
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 2e49996a8f39..fbfa3f5abd94 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -263,10 +263,19 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp);
 void dma_free_pages(struct device *dev, size_t size, struct page *page,
 		dma_addr_t dma_handle, enum dma_data_direction dir);
-void *dma_alloc_noncoherent(struct device *dev, size_t size,
-		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp);
-void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
-		dma_addr_t dma_handle, enum dma_data_direction dir);
+
+static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+	struct page *page = dma_alloc_pages(dev, size, dma_handle, dir, gfp);
+	return page ? page_address(page) : NULL;
+}
+
+static inline void dma_free_noncoherent(struct device *dev, size_t size,
+		void *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+	dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir);
+}
 
 static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f87a89d08654..68992e35c8c3 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -515,46 +515,6 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page,
 }
 EXPORT_SYMBOL_GPL(dma_free_pages);
 
-void *dma_alloc_noncoherent(struct device *dev, size_t size,
-		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-	void *vaddr;
-
-	if (!ops || !ops->alloc_noncoherent) {
-		struct page *page;
-
-		page = dma_alloc_pages(dev, size, dma_handle, dir, gfp);
-		if (!page)
-			return NULL;
-		return page_address(page);
-	}
-
-	size = PAGE_ALIGN(size);
-	vaddr = ops->alloc_noncoherent(dev, size, dma_handle, dir, gfp);
-	if (vaddr)
-		debug_dma_map_page(dev, virt_to_page(vaddr), 0, size, dir,
-				   *dma_handle);
-	return vaddr;
-}
-EXPORT_SYMBOL_GPL(dma_alloc_noncoherent);
-
-void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
-		dma_addr_t dma_handle, enum dma_data_direction dir)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (!ops || !ops->free_noncoherent) {
-		dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir);
-		return;
-	}
-
-	size = PAGE_ALIGN(size);
-	debug_dma_unmap_page(dev, dma_handle, size, dir);
-	ops->free_noncoherent(dev, size, vaddr, dma_handle, dir);
-}
-EXPORT_SYMBOL_GPL(dma_free_noncoherent);
-
 int dma_supported(struct device *dev, u64 mask)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-- 
cgit v1.2.3


From 8f1fc1c15329a9d53bde5636e85ca98ece2ec7bd Mon Sep 17 00:00:00 2001
From: Martin Hundebøll <mhu@silicom.dk>
Date: Mon, 8 Feb 2021 16:01:57 +0100
Subject: PCI: Add Silicom Denmark vendor ID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update pci_ids.h with the vendor ID for Silicom Denmark. The define is
going to be referenced in driver(s) for FPGA accelerated smart NICs.

Link: https://lore.kernel.org/r/20210208150158.2877414-1-mhu@silicom.dk
Signed-off-by: Martin Hundebøll <mhu@silicom.dk>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Tom Rix <trix@redhat.com>
Reviewed-by: Krzysztof Wilczyński <kw@linux.com>
---
 include/linux/pci_ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index d8156a5dbee8..aae07d512ca6 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2588,6 +2588,8 @@
 
 #define PCI_VENDOR_ID_REDHAT		0x1b36
 
+#define PCI_VENDOR_ID_SILICOM_DENMARK	0x1c2c
+
 #define PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS	0x1c36
 
 #define PCI_VENDOR_ID_CIRCUITCO		0x1cc8
-- 
cgit v1.2.3


From 1852ebd1354201cf4ab9564947ff2a17a918b294 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Wed, 10 Feb 2021 21:27:56 +1100
Subject: of: irq: make a stub for of_irq_parse_one()

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/r/20210210214720.02e6a6be@canb.auug.org.au
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/of_irq.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index e8b78139f78c..f898d838d201 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -33,8 +33,6 @@ static inline int of_irq_parse_oldworld(struct device_node *device, int index,
 #endif /* CONFIG_PPC32 && CONFIG_PPC_PMAC */
 
 extern int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq);
-extern int of_irq_parse_one(struct device_node *device, int index,
-			  struct of_phandle_args *out_irq);
 extern unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data);
 extern int of_irq_to_resource(struct device_node *dev, int index,
 			      struct resource *r);
@@ -42,6 +40,8 @@ extern int of_irq_to_resource(struct device_node *dev, int index,
 extern void of_irq_init(const struct of_device_id *matches);
 
 #ifdef CONFIG_OF_IRQ
+extern int of_irq_parse_one(struct device_node *device, int index,
+			  struct of_phandle_args *out_irq);
 extern int of_irq_count(struct device_node *dev);
 extern int of_irq_get(struct device_node *dev, int index);
 extern int of_irq_get_byname(struct device_node *dev, const char *name);
@@ -57,6 +57,11 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev,
 extern void of_msi_configure(struct device *dev, struct device_node *np);
 u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in);
 #else
+static inline int of_irq_parse_one(struct device_node *device, int index,
+				   struct of_phandle_args *out_irq)
+{
+	return 0;
+}
 static inline int of_irq_count(struct device_node *dev)
 {
 	return 0;
-- 
cgit v1.2.3


From cd1a41ceba8a4caef4d18a3a14d6d0f8c656efe4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 10 Feb 2021 00:40:52 +0100
Subject: softirq: Move __ARCH_HAS_DO_SOFTIRQ to Kconfig

To prepare for inlining do_softirq_own_stack() replace
__ARCH_HAS_DO_SOFTIRQ with a Kconfig switch and select it in the affected
architectures.

This allows in the next step to move the function prototype and the inline
stub into a seperate asm-generic header file which is required to avoid
include recursion.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210210002513.181713427@linutronix.de
---
 arch/Kconfig                      | 6 ++++++
 arch/parisc/Kconfig               | 1 +
 arch/parisc/include/asm/hardirq.h | 4 ----
 arch/powerpc/Kconfig              | 1 +
 arch/powerpc/include/asm/irq.h    | 2 --
 arch/s390/Kconfig                 | 1 +
 arch/s390/include/asm/hardirq.h   | 1 -
 arch/sh/Kconfig                   | 1 +
 arch/sh/include/asm/irq.h         | 1 -
 arch/sparc/Kconfig                | 1 +
 arch/sparc/include/asm/irq_64.h   | 1 -
 arch/x86/Kconfig                  | 1 +
 arch/x86/include/asm/irq.h        | 2 --
 include/linux/interrupt.h         | 2 +-
 14 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/Kconfig b/arch/Kconfig
index 24862d15f3a3..7eab9d0a4b43 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -759,6 +759,12 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
 	  This spares a stack switch and improves cache usage on softirq
 	  processing.
 
+config HAVE_SOFTIRQ_ON_OWN_STACK
+	bool
+	help
+	  Architecture provides a function to run __do_softirq() on a
+	  seperate stack.
+
 config PGTABLE_LEVELS
 	int
 	default 2
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 78b17621ee4a..7cd88c0a9a32 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -63,6 +63,7 @@ config PARISC
 	select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE
 	select HAVE_KPROBES_ON_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
+	select HAVE_SOFTIRQ_ON_OWN_STACK if IRQSTACKS
 	select SET_FS
 
 	help
diff --git a/arch/parisc/include/asm/hardirq.h b/arch/parisc/include/asm/hardirq.h
index fad29aa6f45f..1e4fbd0fd944 100644
--- a/arch/parisc/include/asm/hardirq.h
+++ b/arch/parisc/include/asm/hardirq.h
@@ -12,10 +12,6 @@
 #include <linux/threads.h>
 #include <linux/irq.h>
 
-#ifdef CONFIG_IRQSTACKS
-#define __ARCH_HAS_DO_SOFTIRQ
-#endif
-
 typedef struct {
 	unsigned int __softirq_pending;
 	unsigned int kernel_stack_usage;
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 107bb4319e0e..888d1ad436ce 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -237,6 +237,7 @@ config PPC
 	select MMU_GATHER_PAGE_SIZE
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
+	select HAVE_SOFTIRQ_ON_OWN_STACK
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_VIRT_CPU_ACCOUNTING
 	select HAVE_IRQ_TIME_ACCOUNTING
diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index 4f983ca4030a..f3f264e441a7 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -37,8 +37,6 @@ extern int distribute_irqs;
 
 struct pt_regs;
 
-#define __ARCH_HAS_DO_SOFTIRQ
-
 #if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
 /*
  * Per-cpu stacks for handling critical, debug and machine check
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index c72874f09741..e00c02ab5706 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -182,6 +182,7 @@ config S390
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE
 	select HAVE_RSEQ
+	select HAVE_SOFTIRQ_ON_OWN_STACK
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_VIRT_CPU_ACCOUNTING
 	select HAVE_VIRT_CPU_ACCOUNTING_IDLE
diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h
index dfbc3c6c0674..58668ffb5488 100644
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -18,7 +18,6 @@
 #define or_softirq_pending(x)  (S390_lowcore.softirq_pending |= (x))
 
 #define __ARCH_IRQ_STAT
-#define __ARCH_HAS_DO_SOFTIRQ
 #define __ARCH_IRQ_EXIT_IRQS_DISABLED
 
 static inline void ack_bad_irq(unsigned int irq)
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 5fa580219a86..b49d5e033e29 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -56,6 +56,7 @@ config SUPERH
 	select HAVE_PERF_EVENTS
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_UID16
+	select HAVE_SOFTIRQ_ON_OWN_STACK if IRQSTACKS
 	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
 	select IRQ_FORCED_THREADING
diff --git a/arch/sh/include/asm/irq.h b/arch/sh/include/asm/irq.h
index 6d44c32ef047..839551ce398c 100644
--- a/arch/sh/include/asm/irq.h
+++ b/arch/sh/include/asm/irq.h
@@ -51,7 +51,6 @@ asmlinkage int do_IRQ(unsigned int irq, struct pt_regs *regs);
 #ifdef CONFIG_IRQSTACKS
 extern void irq_ctx_init(int cpu);
 extern void irq_ctx_exit(int cpu);
-# define __ARCH_HAS_DO_SOFTIRQ
 #else
 # define irq_ctx_init(cpu) do { } while (0)
 # define irq_ctx_exit(cpu) do { } while (0)
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index c9c34dc52b7d..f0d22d530645 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -97,6 +97,7 @@ config SPARC64
 	select ARCH_HAS_PTE_SPECIAL
 	select PCI_DOMAINS if PCI
 	select ARCH_HAS_GIGANTIC_PAGE
+	select HAVE_SOFTIRQ_ON_OWN_STACK
 
 config ARCH_PROC_KCORE_TEXT
 	def_bool y
diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h
index 4d748e93b974..154df2cf19f4 100644
--- a/arch/sparc/include/asm/irq_64.h
+++ b/arch/sparc/include/asm/irq_64.h
@@ -93,7 +93,6 @@ void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
 
 extern void *hardirq_stack[NR_CPUS];
 extern void *softirq_stack[NR_CPUS];
-#define __ARCH_HAS_DO_SOFTIRQ
 
 #define NO_IRQ		0xffffffff
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e17ce871bb19..019e0e1a5ee0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -221,6 +221,7 @@ config X86
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
 	select HAVE_FUNCTION_ARG_ACCESS_API
+	select HAVE_SOFTIRQ_ON_OWN_STACK
 	select HAVE_STACKPROTECTOR		if CC_HAS_SANE_STACKPROTECTOR
 	select HAVE_STACK_VALIDATION		if X86_64
 	select HAVE_STATIC_CALL
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 528c8a71fe7f..f70d2ca3887e 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -25,8 +25,6 @@ static inline int irq_canonicalize(int irq)
 
 extern int irq_init_percpu_irqstack(unsigned int cpu);
 
-#define __ARCH_HAS_DO_SOFTIRQ
-
 struct irq_desc;
 
 extern void fixup_irqs(void);
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index bb8ff9083e7d..f0b918f393a2 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -569,7 +569,7 @@ struct softirq_action
 asmlinkage void do_softirq(void);
 asmlinkage void __do_softirq(void);
 
-#ifdef __ARCH_HAS_DO_SOFTIRQ
+#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK
 void do_softirq_own_stack(void);
 #else
 static inline void do_softirq_own_stack(void)
-- 
cgit v1.2.3


From db1cc7aede37eb9235759131ddfefd9c0ea5136f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 10 Feb 2021 00:40:53 +0100
Subject: softirq: Move do_softirq_own_stack() to generic asm header

To avoid include recursion hell move the do_softirq_own_stack() related
content into a generic asm header and include it from all places in arch/
which need the prototype.

This allows architectures to provide an inline implementation of
do_softirq_own_stack() without introducing a lot of #ifdeffery all over the
place.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210210002513.289960691@linutronix.de
---
 arch/parisc/kernel/irq.c            |  1 +
 arch/powerpc/kernel/irq.c           |  1 +
 arch/s390/kernel/irq.c              |  1 +
 arch/sh/kernel/irq.c                |  1 +
 arch/sparc/kernel/irq_64.c          |  1 +
 arch/x86/kernel/irq_32.c            |  1 +
 arch/x86/kernel/irq_64.c            |  1 +
 include/asm-generic/Kbuild          |  1 +
 include/asm-generic/softirq_stack.h | 14 ++++++++++++++
 include/linux/interrupt.h           |  9 ---------
 kernel/softirq.c                    |  2 ++
 11 files changed, 24 insertions(+), 9 deletions(-)
 create mode 100644 include/asm-generic/softirq_stack.h

(limited to 'include')

diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 49cd6d2caefb..1632d525ad64 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -17,6 +17,7 @@
 #include <linux/types.h>
 #include <asm/io.h>
 
+#include <asm/softirq_stack.h>
 #include <asm/smp.h>
 #include <asm/ldcw.h>
 
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 6b1eca53e36c..96296d3383a7 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -65,6 +65,7 @@
 #include <asm/livepatch.h>
 #include <asm/asm-prototypes.h>
 #include <asm/hw_irq.h>
+#include <asm/softirq_stack.h>
 
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index f8a8b9428ae2..a1a2f75f3c09 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -27,6 +27,7 @@
 #include <asm/irq.h>
 #include <asm/hw_irq.h>
 #include <asm/stacktrace.h>
+#include <asm/softirq_stack.h>
 #include "entry.h"
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index ab5f790b0cd2..ef0f0827cf57 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -20,6 +20,7 @@
 #include <linux/uaccess.h>
 #include <asm/thread_info.h>
 #include <cpu/mmu_context.h>
+#include <asm/softirq_stack.h>
 
 atomic_t irq_err_count;
 
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 3ec9f1402aad..c8848bb681a1 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -42,6 +42,7 @@
 #include <asm/head.h>
 #include <asm/hypervisor.h>
 #include <asm/cacheflush.h>
+#include <asm/softirq_stack.h>
 
 #include "entry.h"
 #include "cpumap.h"
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 0b79efc87be5..044902d5a3c4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -22,6 +22,7 @@
 
 #include <asm/apic.h>
 #include <asm/nospec-branch.h>
+#include <asm/softirq_stack.h>
 
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index b88fdb9686e6..f335c3910834 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,6 +20,7 @@
 #include <linux/sched/task_stack.h>
 
 #include <asm/cpu_entry_area.h>
+#include <asm/softirq_stack.h>
 #include <asm/irq_stack.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index 267f6dfb8960..bd684188e2dc 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -51,6 +51,7 @@ mandatory-y += sections.h
 mandatory-y += serial.h
 mandatory-y += shmparam.h
 mandatory-y += simd.h
+mandatory-y += softirq_stack.h
 mandatory-y += switch_to.h
 mandatory-y += timex.h
 mandatory-y += tlbflush.h
diff --git a/include/asm-generic/softirq_stack.h b/include/asm-generic/softirq_stack.h
new file mode 100644
index 000000000000..eceeecf6a5bd
--- /dev/null
+++ b/include/asm-generic/softirq_stack.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_GENERIC_SOFTIRQ_STACK_H
+#define __ASM_GENERIC_SOFTIRQ_STACK_H
+
+#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK
+void do_softirq_own_stack(void);
+#else
+static inline void do_softirq_own_stack(void)
+{
+	__do_softirq();
+}
+#endif
+
+#endif
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index f0b918f393a2..967e25767153 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -569,15 +569,6 @@ struct softirq_action
 asmlinkage void do_softirq(void);
 asmlinkage void __do_softirq(void);
 
-#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK
-void do_softirq_own_stack(void);
-#else
-static inline void do_softirq_own_stack(void)
-{
-	__do_softirq();
-}
-#endif
-
 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
 extern void softirq_init(void);
 extern void __raise_softirq_irqoff(unsigned int nr);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9d71046ea247..9908ec4a9bfe 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -26,6 +26,8 @@
 #include <linux/tick.h>
 #include <linux/irq.h>
 
+#include <asm/softirq_stack.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
 
-- 
cgit v1.2.3


From 8c0381f55bbf70a3b8ab24e4f5ac62125c44c804 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Wed, 10 Feb 2021 12:00:49 -0800
Subject: of: irq: Fix the return value for of_irq_parse_one() stub

When commit 1852ebd13542 ("of: irq: make a stub for of_irq_parse_one()")
added a stub for of_irq_parse_one() it set the return value to 0. Return
value of 0 in this instance means the call succeeded and the out_irq
pointer was filled with valid data. So, fix it to return an error value.

Fixes: 1852ebd13542 ("of: irq: make a stub for of_irq_parse_one()")
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20210210200050.4106032-1-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/of_irq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index f898d838d201..aaf219bd0354 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -60,7 +60,7 @@ u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in);
 static inline int of_irq_parse_one(struct device_node *device, int index,
 				   struct of_phandle_args *out_irq)
 {
-	return 0;
+	return -EINVAL;
 }
 static inline int of_irq_count(struct device_node *dev)
 {
-- 
cgit v1.2.3


From 53abf3fe831756261f399dad03ccc07235296acf Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@linaro.org>
Date: Thu, 11 Feb 2021 10:20:36 -0700
Subject: coresight: etm-perf: Clarify comment on perf options

In theory, the options should be arbitrary values and are neutral for
any ETM version; so far perf tool uses ETMv3.5/PTM ETMCR config bits
except for register's bit definitions, also uses as options.

This can introduce confusion, especially if we want to add a new option
but the new option is not supported by ETMv3.5/PTM ETMCR.  But on the
other hand, we cannot change options since these options are generic
CoreSight PMU ABI.

For easier maintenance and avoid confusion, this patch refines the
comment to clarify perf options, and gives out the background info for
these bits are coming from ETMv3.5/PTM.  Afterwards, we should take
these options as general knobs, and if there have any confliction with
ETMv3.5/PTM, should consider to define saperate macros for ETMv3.5/PTM
ETMCR config bits.

Suggested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Link: https://lore.kernel.org/r/20210206150833.42120-2-leo.yan@linaro.org
Link: https://lore.kernel.org/r/20210211172038.2483517-2-mathieu.poirier@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-etm-perf.c |  5 ++++-
 include/linux/coresight-pmu.h                    | 17 ++++++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
index bdc34ca449f7..465ef1aa8c82 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -27,7 +27,10 @@ static bool etm_perf_up;
 static DEFINE_PER_CPU(struct perf_output_handle, ctx_handle);
 static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
 
-/* ETMv3.5/PTM's ETMCR is 'config' */
+/*
+ * The PMU formats were orignally for ETMv3.5/PTM's ETMCR 'config';
+ * now take them as general formats and apply on all ETMs.
+ */
 PMU_FORMAT_ATTR(cycacc,		"config:" __stringify(ETM_OPT_CYCACC));
 PMU_FORMAT_ATTR(contextid,	"config:" __stringify(ETM_OPT_CTXTID));
 PMU_FORMAT_ATTR(timestamp,	"config:" __stringify(ETM_OPT_TS));
diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h
index b0e35eec6499..5dc47cfdcf07 100644
--- a/include/linux/coresight-pmu.h
+++ b/include/linux/coresight-pmu.h
@@ -10,11 +10,18 @@
 #define CORESIGHT_ETM_PMU_NAME "cs_etm"
 #define CORESIGHT_ETM_PMU_SEED  0x10
 
-/* ETMv3.5/PTM's ETMCR config bit */
-#define ETM_OPT_CYCACC  12
-#define ETM_OPT_CTXTID	14
-#define ETM_OPT_TS      28
-#define ETM_OPT_RETSTK	29
+/*
+ * Below are the definition of bit offsets for perf option, and works as
+ * arbitrary values for all ETM versions.
+ *
+ * Most of them are orignally from ETMv3.5/PTM's ETMCR config, therefore,
+ * ETMv3.5/PTM doesn't define ETMCR config bits with prefix "ETM3_" and
+ * directly use below macros as config bits.
+ */
+#define ETM_OPT_CYCACC		12
+#define ETM_OPT_CTXTID		14
+#define ETM_OPT_TS		28
+#define ETM_OPT_RETSTK		29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC	4
-- 
cgit v1.2.3


From 88f11864cf1d1324f620059ec747d74b72d9d736 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Thu, 11 Feb 2021 10:20:37 -0700
Subject: coresight: etm-perf: Support PID tracing for kernel at EL2

When the kernel is running at EL2, the PID is stored in CONTEXTIDR_EL2.
So, tracing CONTEXTIDR_EL1 doesn't give us the pid of the process.
Thus we should trace the VMID with VMIDOPT set to trace CONTEXTIDR_EL2
instead of CONTEXTIDR_EL1.  Given that we have an existing config
option "contextid" and this will be useful for tracing virtual machines
(when we get to support virtualization).

So instead, this patch extends option CTXTID with an extra bit
ETM_OPT_CTXTID2 (bit 15), thus on an EL2 kernel, we will have another
bit available for the perf tool: ETM_OPT_CTXTID is for kernel running in
EL1, ETM_OPT_CTXTID2 is used when kernel runs in EL2 with VHE enabled.

The tool must be backward compatible for users, i.e, "contextid" today
traces PID and that should remain the same; for this purpose, the perf
tool is updated to automatically set corresponding bit for the
"contextid" config, therefore, the user doesn't have to bother which EL
the kernel is running.

  i.e, perf record -e cs_etm/contextid/u --

will always do the "pid" tracing, independent of the kernel EL.

The driver parses the format "contextid", which traces CONTEXTIDR_EL1
for ETM_OPT_CTXTID (on EL1 kernel) and traces CONTEXTIDR_EL2 for
ETM_OPT_CTXTID2 (on EL2 kernel).

Besides the enhancement for format "contexid", extra two formats are
introduced: "contextid1" and "contextid2".  This considers to support
tracing both CONTEXTIDR_EL1 and CONTEXTIDR_EL2 when the kernel is
running at EL2.  Finally, the PMU formats are defined as follow:

  "contextid1": Available on both EL1 kernel and EL2 kernel.  When the
                kernel is running at EL1, "contextid1" enables the PID
		tracing; when the kernel is running at EL2, this enables
		tracing the PID of guest applications.

  "contextid2": Only usable when the kernel is running at EL2.  When
                selected, enables PID tracing on EL2 kernel.

  "contextid":  Will be an alias for the option that enables PID
                tracing.  I.e,
                contextid == contextid1, on EL1 kernel.
                contextid == contextid2, on EL2 kernel.

Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Al Grant <al.grant@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Leo Yan <leo.yan@linaro.org>
Reviewed-by: Mike Leach <mike.leach@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
[ Added two config formats: contextid1, contextid2 ]
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Link: https://lore.kernel.org/r/20210206150833.42120-4-leo.yan@linaro.org
Link: https://lore.kernel.org/r/20210211172038.2483517-3-mathieu.poirier@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-etm-perf.c   | 27 +++++++++++++++++++++-
 drivers/hwtracing/coresight/coresight-etm4x-core.c | 13 +++++++++++
 include/linux/coresight-pmu.h                      |  3 +++
 3 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 465ef1aa8c82..0f603b4094f2 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -32,15 +32,40 @@ static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
  * now take them as general formats and apply on all ETMs.
  */
 PMU_FORMAT_ATTR(cycacc,		"config:" __stringify(ETM_OPT_CYCACC));
-PMU_FORMAT_ATTR(contextid,	"config:" __stringify(ETM_OPT_CTXTID));
+/* contextid1 enables tracing CONTEXTIDR_EL1 for ETMv4 */
+PMU_FORMAT_ATTR(contextid1,	"config:" __stringify(ETM_OPT_CTXTID));
+/* contextid2 enables tracing CONTEXTIDR_EL2 for ETMv4 */
+PMU_FORMAT_ATTR(contextid2,	"config:" __stringify(ETM_OPT_CTXTID2));
 PMU_FORMAT_ATTR(timestamp,	"config:" __stringify(ETM_OPT_TS));
 PMU_FORMAT_ATTR(retstack,	"config:" __stringify(ETM_OPT_RETSTK));
 /* Sink ID - same for all ETMs */
 PMU_FORMAT_ATTR(sinkid,		"config2:0-31");
 
+/*
+ * contextid always traces the "PID".  The PID is in CONTEXTIDR_EL1
+ * when the kernel is running at EL1; when the kernel is at EL2,
+ * the PID is in CONTEXTIDR_EL2.
+ */
+static ssize_t format_attr_contextid_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *page)
+{
+	int pid_fmt = ETM_OPT_CTXTID;
+
+#if defined(CONFIG_CORESIGHT_SOURCE_ETM4X)
+	pid_fmt = is_kernel_in_hyp_mode() ? ETM_OPT_CTXTID2 : ETM_OPT_CTXTID;
+#endif
+	return sprintf(page, "config:%d\n", pid_fmt);
+}
+
+struct device_attribute format_attr_contextid =
+	__ATTR(contextid, 0444, format_attr_contextid_show, NULL);
+
 static struct attribute *etm_config_formats_attr[] = {
 	&format_attr_cycacc.attr,
 	&format_attr_contextid.attr,
+	&format_attr_contextid1.attr,
+	&format_attr_contextid2.attr,
 	&format_attr_timestamp.attr,
 	&format_attr_retstack.attr,
 	&format_attr_sinkid.attr,
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index c8ecd91e289e..15016f757828 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c
@@ -550,6 +550,19 @@ static int etm4_parse_event_config(struct etmv4_drvdata *drvdata,
 		/* bit[6], Context ID tracing bit */
 		config->cfg |= BIT(ETM4_CFG_BIT_CTXTID);
 
+	/*
+	 * If set bit ETM_OPT_CTXTID2 in perf config, this asks to trace VMID
+	 * for recording CONTEXTIDR_EL2.  Do not enable VMID tracing if the
+	 * kernel is not running in EL2.
+	 */
+	if (attr->config & BIT(ETM_OPT_CTXTID2)) {
+		if (!is_kernel_in_hyp_mode()) {
+			ret = -EINVAL;
+			goto out;
+		}
+		config->cfg |= BIT(ETM4_CFG_BIT_VMID) | BIT(ETM4_CFG_BIT_VMID_OPT);
+	}
+
 	/* return stack - enable if selected and supported */
 	if ((attr->config & BIT(ETM_OPT_RETSTK)) && drvdata->retstack)
 		/* bit[12], Return stack enable bit */
diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h
index 5dc47cfdcf07..4ac5c081af93 100644
--- a/include/linux/coresight-pmu.h
+++ b/include/linux/coresight-pmu.h
@@ -20,14 +20,17 @@
  */
 #define ETM_OPT_CYCACC		12
 #define ETM_OPT_CTXTID		14
+#define ETM_OPT_CTXTID2		15
 #define ETM_OPT_TS		28
 #define ETM_OPT_RETSTK		29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC	4
 #define ETM4_CFG_BIT_CTXTID	6
+#define ETM4_CFG_BIT_VMID	7
 #define ETM4_CFG_BIT_TS		11
 #define ETM4_CFG_BIT_RETSTK	12
+#define ETM4_CFG_BIT_VMID_OPT	15
 
 static inline int coresight_get_trace_id(int cpu)
 {
-- 
cgit v1.2.3


From bb90d4bc7b6a536b2e4db45f4763e467c2008251 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 9 Feb 2021 22:22:14 -0800
Subject: mm/highmem: Lift memcpy_[to|from]_page to core

Working through a conversion to a call kmap_local_page() instead of
kmap() revealed many places where the pattern kmap/memcpy/kunmap
occurred.

Eric Biggers, Matthew Wilcox, Christoph Hellwig, Dan Williams, and Al
Viro all suggested putting this code into helper functions.  Al Viro
further pointed out that these functions already existed in the iov_iter
code.[1]

Various locations for the lifted functions were considered.

Headers like mm.h or string.h seem ok but don't really portray the
functionality well.  pagemap.h made some sense but is for page cache
functionality.[2]

Another alternative would be to create a new header for the promoted
memcpy functions, but it masks the fact that these are designed to copy
to/from pages using the kernel direct mappings and complicates matters
with a new header.

Placing these functions in 'highmem.h' is suboptimal especially with the
changes being proposed in the functionality of kmap.  From a caller
perspective including/using 'highmem.h' implies that the functions
defined in that header are only required when highmem is in use which is
increasingly not the case with modern processors.  However, highmem.h is
where all the current functions like this reside (zero_user(),
clear_highpage(), clear_user_highpage(), copy_user_highpage(), and
copy_highpage()).  So it makes the most sense even though it is
distasteful for some.[3]

Lift memcpy_to_page() and memcpy_from_page() to pagemap.h.

[1] https://lore.kernel.org/lkml/20201013200149.GI3576660@ZenIV.linux.org.uk/
    https://lore.kernel.org/lkml/20201013112544.GA5249@infradead.org/

[2] https://lore.kernel.org/lkml/20201208122316.GH7338@casper.infradead.org/

[3] https://lore.kernel.org/lkml/20201013200149.GI3576660@ZenIV.linux.org.uk/#t
    https://lore.kernel.org/lkml/20201208163814.GN1563847@iweiny-DESK2.sc.intel.com/

Cc: Boris Pismenny <borisp@mellanox.com>
Cc: Or Gerlitz <gerlitz.or@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Suggested-by: Eric Biggers <ebiggers@kernel.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/highmem.h | 18 ++++++++++++++++++
 lib/iov_iter.c          | 14 --------------
 2 files changed, 18 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index d2c70d3772a3..736b6a9f144d 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -276,4 +276,22 @@ static inline void copy_highpage(struct page *to, struct page *from)
 
 #endif
 
+static inline void memcpy_from_page(char *to, struct page *page,
+				    size_t offset, size_t len)
+{
+	char *from = kmap_atomic(page);
+
+	memcpy(to, from + offset, len);
+	kunmap_atomic(from);
+}
+
+static inline void memcpy_to_page(struct page *page, size_t offset,
+				  const char *from, size_t len)
+{
+	char *to = kmap_atomic(page);
+
+	memcpy(to + offset, from, len);
+	kunmap_atomic(to);
+}
+
 #endif /* _LINUX_HIGHMEM_H */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index a21e6a5792c5..9889e9903cdf 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -466,20 +466,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_init);
 
-static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
-{
-	char *from = kmap_atomic(page);
-	memcpy(to, from + offset, len);
-	kunmap_atomic(from);
-}
-
-static void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len)
-{
-	char *to = kmap_atomic(page);
-	memcpy(to + offset, from, len);
-	kunmap_atomic(to);
-}
-
 static void memzero_page(struct page *page, size_t offset, size_t len)
 {
 	char *addr = kmap_atomic(page);
-- 
cgit v1.2.3


From 61b205f579911a11f0b576f73275eca2aed0d108 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 9 Feb 2021 22:22:15 -0800
Subject: mm/highmem: Convert memcpy_[to|from]_page() to kmap_local_page()

kmap_local_page() is more efficient and is well suited for these calls.
Convert the kmap() to kmap_local_page()

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/highmem.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 736b6a9f144d..c17a175fe5fe 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -279,19 +279,19 @@ static inline void copy_highpage(struct page *to, struct page *from)
 static inline void memcpy_from_page(char *to, struct page *page,
 				    size_t offset, size_t len)
 {
-	char *from = kmap_atomic(page);
+	char *from = kmap_local_page(page);
 
 	memcpy(to, from + offset, len);
-	kunmap_atomic(from);
+	kunmap_local(from);
 }
 
 static inline void memcpy_to_page(struct page *page, size_t offset,
 				  const char *from, size_t len)
 {
-	char *to = kmap_atomic(page);
+	char *to = kmap_local_page(page);
 
 	memcpy(to + offset, from, len);
-	kunmap_atomic(to);
+	kunmap_local(to);
 }
 
 #endif /* _LINUX_HIGHMEM_H */
-- 
cgit v1.2.3


From 6a0996db6879cf09f989c5f44f9edd38240cb346 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 9 Feb 2021 22:22:16 -0800
Subject: mm/highmem: Introduce memcpy_page(), memmove_page(), and
 memset_page()

3 more common kmap patterns are kmap/memcpy/kunmap, kmap/memmove/kunmap.
and kmap/memset/kunmap.

Add helper functions for those patterns which use kmap_local_page().

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/highmem.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index c17a175fe5fe..0b5d89621cb9 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -276,6 +276,39 @@ static inline void copy_highpage(struct page *to, struct page *from)
 
 #endif
 
+static inline void memcpy_page(struct page *dst_page, size_t dst_off,
+			       struct page *src_page, size_t src_off,
+			       size_t len)
+{
+	char *dst = kmap_local_page(dst_page);
+	char *src = kmap_local_page(src_page);
+
+	memcpy(dst + dst_off, src + src_off, len);
+	kunmap_local(src);
+	kunmap_local(dst);
+}
+
+static inline void memmove_page(struct page *dst_page, size_t dst_off,
+			       struct page *src_page, size_t src_off,
+			       size_t len)
+{
+	char *dst = kmap_local_page(dst_page);
+	char *src = kmap_local_page(src_page);
+
+	memmove(dst + dst_off, src + src_off, len);
+	kunmap_local(src);
+	kunmap_local(dst);
+}
+
+static inline void memset_page(struct page *page, size_t offset, int val,
+			       size_t len)
+{
+	char *addr = kmap_local_page(page);
+
+	memset(addr + offset, val, len);
+	kunmap_local(addr);
+}
+
 static inline void memcpy_from_page(char *to, struct page *page,
 				    size_t offset, size_t len)
 {
-- 
cgit v1.2.3


From ca18f6ea012bf30236b76c3480ac2c97131b6f8f Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 10 Feb 2021 09:49:28 -0800
Subject: mm/highmem: Add VM_BUG_ON() to mem*_page() calls

Add VM_BUG_ON bounds checks to ensure the newly lifted and created page
memory operations do not result in corrupted data in neighbor pages.[1][2]

[1] https://lore.kernel.org/lkml/20201210053502.GS1563847@iweiny-DESK2.sc.intel.com/
[2] https://lore.kernel.org/lkml/20210209110931.00f00e47d9a0529fcee2ff01@linux-foundation.org/

Suggested-by: Matthew Wilcox <willy@infradead.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/highmem.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 0b5d89621cb9..44170f312ae7 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -283,6 +283,7 @@ static inline void memcpy_page(struct page *dst_page, size_t dst_off,
 	char *dst = kmap_local_page(dst_page);
 	char *src = kmap_local_page(src_page);
 
+	VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE);
 	memcpy(dst + dst_off, src + src_off, len);
 	kunmap_local(src);
 	kunmap_local(dst);
@@ -295,6 +296,7 @@ static inline void memmove_page(struct page *dst_page, size_t dst_off,
 	char *dst = kmap_local_page(dst_page);
 	char *src = kmap_local_page(src_page);
 
+	VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE);
 	memmove(dst + dst_off, src + src_off, len);
 	kunmap_local(src);
 	kunmap_local(dst);
@@ -305,6 +307,7 @@ static inline void memset_page(struct page *page, size_t offset, int val,
 {
 	char *addr = kmap_local_page(page);
 
+	VM_BUG_ON(offset + len > PAGE_SIZE);
 	memset(addr + offset, val, len);
 	kunmap_local(addr);
 }
@@ -314,6 +317,7 @@ static inline void memcpy_from_page(char *to, struct page *page,
 {
 	char *from = kmap_local_page(page);
 
+	VM_BUG_ON(offset + len > PAGE_SIZE);
 	memcpy(to, from + offset, len);
 	kunmap_local(from);
 }
@@ -323,6 +327,7 @@ static inline void memcpy_to_page(struct page *page, size_t offset,
 {
 	char *to = kmap_local_page(page);
 
+	VM_BUG_ON(offset + len > PAGE_SIZE);
 	memcpy(to + offset, from, len);
 	kunmap_local(to);
 }
-- 
cgit v1.2.3


From 258e0815e2b1706e87c0d874211097aa8a7aa52f Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Sun, 14 Feb 2021 17:16:33 +0000
Subject: percpu: fix clang modpost section mismatch

pcpu_build_alloc_info() is an __init function that makes a call to
cpumask_clear_cpu(). With CONFIG_GCOV_PROFILE_ALL enabled, the inline
heuristics are modified and such cpumask_clear_cpu() which is marked
inline doesn't get inlined. Because it works on mask in __initdata,
modpost throws a section mismatch error.

Arnd sent a patch with the flatten attribute as an alternative [2]. I've
added it to compiler_attributes.h.

modpost complaint:
  WARNING: modpost: vmlinux.o(.text+0x735425): Section mismatch in reference from the function cpumask_clear_cpu() to the variable .init.data:pcpu_build_alloc_info.mask
  The function cpumask_clear_cpu() references
  the variable __initdata pcpu_build_alloc_info.mask.
  This is often because cpumask_clear_cpu lacks a __initdata
  annotation or the annotation of pcpu_build_alloc_info.mask is wrong.

clang output:
  mm/percpu.c:2724:5: remark: cpumask_clear_cpu not inlined into pcpu_build_alloc_info because too costly to inline (cost=725, threshold=325) [-Rpass-missed=inline]

[1] https://lore.kernel.org/linux-mm/202012220454.9F6Bkz9q-lkp@intel.com/
[2] https://lore.kernel.org/lkml/CAK8P3a2ZWfNeXKSm8K_SUhhwkor17jFo3xApLXjzfPqX0eUDUA@mail.gmail.com/

Reported-by: kernel test robot <lkp@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
---
 include/linux/compiler_attributes.h | 6 ++++++
 mm/percpu.c                         | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index ea5e04e75845..c043b8d2b17b 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -210,6 +210,12 @@
 # define fallthrough                    do {} while (0)  /* fallthrough */
 #endif
 
+/*
+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#Common-Function-Attributes
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#flatten
+ */
+# define __flatten			__attribute__((flatten))
+
 /*
  * Note the missing underscores.
  *
diff --git a/mm/percpu.c b/mm/percpu.c
index 80f8f885a990..6596a0a4286e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2663,7 +2663,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
  * On success, pointer to the new allocation_info is returned.  On
  * failure, ERR_PTR value is returned.
  */
-static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
 				size_t reserved_size, size_t dyn_size,
 				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
-- 
cgit v1.2.3


From 4590d98f5a4f466d17e5c81d7c9fc796da9a8cee Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 11 Feb 2021 15:40:02 +0200
Subject: sfi: Remove framework for deprecated firmware

SFI-based platforms are gone. So does this framework.

This removes mention of SFI through the drivers and other code as well.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/ABI/testing/sysfs-firmware-sfi       |  15 -
 Documentation/ABI/testing/sysfs-platform-kim       |   2 +-
 MAINTAINERS                                        |   7 -
 arch/x86/Kconfig                                   |   7 +-
 arch/x86/include/asm/intel-mid.h                   |  37 --
 arch/x86/include/asm/intel_scu_ipc_legacy.h        |  58 +--
 arch/x86/include/asm/platform_sst_audio.h          |   2 -
 arch/x86/kernel/apic/io_apic.c                     |   4 +-
 arch/x86/kernel/setup.c                            |   2 -
 arch/x86/pci/mmconfig-shared.c                     |   6 +-
 arch/x86/platform/Makefile                         |   1 -
 arch/x86/platform/intel-mid/Makefile               |   5 -
 arch/x86/platform/intel-mid/device_libs/Makefile   |  23 -
 .../intel-mid/device_libs/platform_bcm43xx.c       | 101 ----
 .../intel-mid/device_libs/platform_bma023.c        |  16 -
 .../platform/intel-mid/device_libs/platform_bt.c   | 101 ----
 .../intel-mid/device_libs/platform_emc1403.c       |  39 --
 .../intel-mid/device_libs/platform_gpio_keys.c     |  81 ----
 .../intel-mid/device_libs/platform_lis331.c        |  37 --
 .../intel-mid/device_libs/platform_max7315.c       |  77 ---
 .../intel-mid/device_libs/platform_mpu3050.c       |  32 --
 .../intel-mid/device_libs/platform_mrfld_pinctrl.c |  39 --
 .../intel-mid/device_libs/platform_mrfld_rtc.c     |  44 --
 .../intel-mid/device_libs/platform_mrfld_sd.c      |  43 --
 .../intel-mid/device_libs/platform_mrfld_spidev.c  |  50 --
 .../intel-mid/device_libs/platform_pcal9555a.c     |  95 ----
 .../intel-mid/device_libs/platform_tc35876x.c      |  42 --
 .../intel-mid/device_libs/platform_tca6416.c       |  53 ---
 arch/x86/platform/intel-mid/intel-mid.c            |   1 -
 arch/x86/platform/intel-mid/sfi.c                  | 419 -----------------
 arch/x86/platform/sfi/Makefile                     |   2 -
 arch/x86/platform/sfi/sfi.c                        | 100 ----
 drivers/Makefile                                   |   2 +-
 drivers/platform/x86/intel_scu_pcidrv.c            |  22 +-
 drivers/sfi/Kconfig                                |  18 -
 drivers/sfi/Makefile                               |   4 -
 drivers/sfi/sfi_acpi.c                             | 214 ---------
 drivers/sfi/sfi_core.c                             | 522 ---------------------
 drivers/sfi/sfi_core.h                             |  81 ----
 include/linux/sfi.h                                | 210 ---------
 include/linux/sfi_acpi.h                           |  93 ----
 init/main.c                                        |   2 -
 42 files changed, 14 insertions(+), 2695 deletions(-)
 delete mode 100644 Documentation/ABI/testing/sysfs-firmware-sfi
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/Makefile
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_bma023.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_bt.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_lis331.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_max7315.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
 delete mode 100644 arch/x86/platform/intel-mid/sfi.c
 delete mode 100644 arch/x86/platform/sfi/Makefile
 delete mode 100644 arch/x86/platform/sfi/sfi.c
 delete mode 100644 drivers/sfi/Kconfig
 delete mode 100644 drivers/sfi/Makefile
 delete mode 100644 drivers/sfi/sfi_acpi.c
 delete mode 100644 drivers/sfi/sfi_core.c
 delete mode 100644 drivers/sfi/sfi_core.h
 delete mode 100644 include/linux/sfi.h
 delete mode 100644 include/linux/sfi_acpi.h

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-firmware-sfi b/Documentation/ABI/testing/sysfs-firmware-sfi
deleted file mode 100644
index 5210e0f06ddb..000000000000
--- a/Documentation/ABI/testing/sysfs-firmware-sfi
+++ /dev/null
@@ -1,15 +0,0 @@
-What:		/sys/firmware/sfi/tables/
-Date:		May 2010
-Contact:	Len Brown <lenb@kernel.org>
-Description:
-		SFI defines a number of small static memory tables
-		so the kernel can get platform information from firmware.
-
-		The tables are defined in the latest SFI specification:
-		http://simplefirmware.org/documentation
-
-		While the tables are used by the kernel, user-space
-		can observe them this way::
-
-		  # cd /sys/firmware/sfi/tables
-		  # cat $TABLENAME > $TABLENAME.bin
diff --git a/Documentation/ABI/testing/sysfs-platform-kim b/Documentation/ABI/testing/sysfs-platform-kim
index a7f81de68046..6a52d6d2b601 100644
--- a/Documentation/ABI/testing/sysfs-platform-kim
+++ b/Documentation/ABI/testing/sysfs-platform-kim
@@ -7,7 +7,7 @@ Description:
 		is connected. example: "/dev/ttyS0".
 
 		The device name flows down to architecture specific board
-		initialization file from the SFI/ATAGS bootloader
+		initialization file from the ATAGS bootloader
 		firmware. The name exposed is read from the user-space
 		dameon and opens the device when install is requested.
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 8fdb7fe27537..fd41b96d71bb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16234,13 +16234,6 @@ S:	Maintained
 F:	Documentation/fb/sm712fb.rst
 F:	drivers/video/fbdev/sm712*
 
-SIMPLE FIRMWARE INTERFACE (SFI)
-S:	Obsolete
-W:	http://simplefirmware.org/
-F:	arch/x86/platform/sfi/
-F:	drivers/sfi/
-F:	include/linux/sfi*.h
-
 SIMPLEFB FB DRIVER
 M:	Hans de Goede <hdegoede@redhat.com>
 L:	linux-fbdev@vger.kernel.org
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc73f98c3e42..7f5b77055266 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -444,7 +444,7 @@ config X86_X2APIC
 	  If you don't know what to do here, say N.
 
 config X86_MPPARSE
-	bool "Enable MPS table" if ACPI || SFI
+	bool "Enable MPS table" if ACPI
 	default y
 	depends on X86_LOCAL_APIC
 	help
@@ -603,7 +603,6 @@ config X86_INTEL_MID
 	depends on PCI
 	depends on X86_64 || (PCI_GOANY && X86_32)
 	depends on X86_IO_APIC
-	select SFI
 	select I2C
 	select DW_APB_TIMER
 	select APB_TIMER
@@ -2457,8 +2456,6 @@ source "kernel/power/Kconfig"
 
 source "drivers/acpi/Kconfig"
 
-source "drivers/sfi/Kconfig"
-
 config X86_APM_BOOT
 	def_bool y
 	depends on APM
@@ -2645,7 +2642,7 @@ config PCI_DIRECT
 config PCI_MMCONFIG
 	bool "Support mmconfig PCI config space access" if X86_64
 	default y
-	depends on PCI && (ACPI || SFI || JAILHOUSE_GUEST)
+	depends on PCI && (ACPI || JAILHOUSE_GUEST)
 	depends on X86_64 || (PCI_GOANY || PCI_GOMMCONFIG)
 
 config PCI_OLPC
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index 7bda0587cf70..6306fe3e5c4a 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -7,7 +7,6 @@
 #ifndef _ASM_X86_INTEL_MID_H
 #define _ASM_X86_INTEL_MID_H
 
-#include <linux/sfi.h>
 #include <linux/pci.h>
 #include <linux/platform_device.h>
 
@@ -22,39 +21,6 @@ extern void intel_mid_pwr_power_off(void);
 
 extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev);
 
-extern int get_gpio_by_name(const char *name);
-
-/*
- * Here defines the array of devices platform data that IAFW would export
- * through SFI "DEVS" table, we use name and type to match the device and
- * its platform data.
- */
-struct devs_id {
-	char name[SFI_NAME_LEN + 1];
-	u8 type;
-	u8 delay;
-	void *(*get_platform_data)(void *info);
-};
-
-#define sfi_device(i)								\
-	static const struct devs_id *const __intel_mid_sfi_##i##_dev __used	\
-	__section(".x86_intel_mid_dev.init") = &i
-
-/**
-* struct mid_sd_board_info - template for SD device creation
-* @name:		identifies the driver
-* @bus_num:		board-specific identifier for a given SD controller
-* @max_clk:		the maximum frequency device supports
-* @platform_data:	the particular data stored there is driver-specific
-*/
-struct mid_sd_board_info {
-	char		name[SFI_NAME_LEN];
-	int		bus_num;
-	unsigned short	addr;
-	u32		max_clk;
-	void		*platform_data;
-};
-
 /*
  * Medfield is the follow-up of Moorestown, it combines two chip solution into
  * one. Other than that it also added always-on and constant tsc and lapic
@@ -99,7 +65,4 @@ static inline void intel_scu_devices_destroy(void) { }
 /* FSB 83MHz */
 #define BSEL_SOC_FUSE_111		0x7
 
-/* The offset for the mapping of global gpio pin to irq */
-#define INTEL_MID_IRQ_OFFSET		0x100
-
 #endif /* _ASM_X86_INTEL_MID_H */
diff --git a/arch/x86/include/asm/intel_scu_ipc_legacy.h b/arch/x86/include/asm/intel_scu_ipc_legacy.h
index f8b87d8face7..fa529e5ec142 100644
--- a/arch/x86/include/asm/intel_scu_ipc_legacy.h
+++ b/arch/x86/include/asm/intel_scu_ipc_legacy.h
@@ -2,73 +2,17 @@
 #ifndef _ASM_X86_INTEL_SCU_IPC_LEGACY_H_
 #define _ASM_X86_INTEL_SCU_IPC_LEGACY_H_
 
-#include <linux/notifier.h>
-
-#define IPCMSG_INDIRECT_READ	0x02
-#define IPCMSG_INDIRECT_WRITE	0x05
+#include <linux/types.h>
 
 #define IPCMSG_COLD_OFF		0x80	/* Only for Tangier */
-
-#define IPCMSG_WARM_RESET	0xF0
 #define IPCMSG_COLD_RESET	0xF1
-#define IPCMSG_SOFT_RESET	0xF2
-#define IPCMSG_COLD_BOOT	0xF3
 
 /* Don't call these in new code - they will be removed eventually */
 
-/* Read a vector */
-static inline int intel_scu_ipc_readv(u16 *addr, u8 *data, int len)
-{
-	return intel_scu_ipc_dev_readv(NULL, addr, data, len);
-}
-
-/* Write a vector */
-static inline int intel_scu_ipc_writev(u16 *addr, u8 *data, int len)
-{
-	return intel_scu_ipc_dev_writev(NULL, addr, data, len);
-}
-
-/* Update single register based on the mask */
-static inline int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask)
-{
-	return intel_scu_ipc_dev_update(NULL, addr, data, mask);
-}
-
 /* Issue commands to the SCU with or without data */
 static inline int intel_scu_ipc_simple_command(int cmd, int sub)
 {
 	return intel_scu_ipc_dev_simple_command(NULL, cmd, sub);
 }
 
-static inline int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
-					u32 *out, int outlen)
-{
-	/* New API takes both inlen and outlen as bytes so convert here */
-	size_t inbytes = inlen * sizeof(u32);
-	size_t outbytes = outlen * sizeof(u32);
-
-	return intel_scu_ipc_dev_command_with_size(NULL, cmd, sub, in, inbytes,
-						   inlen, out, outbytes);
-}
-
-extern struct blocking_notifier_head intel_scu_notifier;
-
-static inline void intel_scu_notifier_add(struct notifier_block *nb)
-{
-	blocking_notifier_chain_register(&intel_scu_notifier, nb);
-}
-
-static inline void intel_scu_notifier_remove(struct notifier_block *nb)
-{
-	blocking_notifier_chain_unregister(&intel_scu_notifier, nb);
-}
-
-static inline int intel_scu_notifier_post(unsigned long v, void *p)
-{
-	return blocking_notifier_call_chain(&intel_scu_notifier, v, p);
-}
-
-#define		SCU_AVAILABLE		1
-#define		SCU_DOWN		2
-
 #endif
diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h
index 16b9f220bdeb..40f92270515b 100644
--- a/arch/x86/include/asm/platform_sst_audio.h
+++ b/arch/x86/include/asm/platform_sst_audio.h
@@ -10,8 +10,6 @@
 #ifndef _PLATFORM_SST_AUDIO_H_
 #define _PLATFORM_SST_AUDIO_H_
 
-#include <linux/sfi.h>
-
 #define MAX_NUM_STREAMS_MRFLD	25
 #define MAX_NUM_STREAMS	MAX_NUM_STREAMS_MRFLD
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e4ab4804b20d..c3b60c37c728 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -198,7 +198,7 @@ static int __init parse_noapic(char *str)
 }
 early_param("noapic", parse_noapic);
 
-/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
+/* Will be called in mpparse/ACPI codes for saving IRQ info */
 void mp_save_irq(struct mpc_intsrc *m)
 {
 	int i;
@@ -2863,7 +2863,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
 
 	/*
 	 * If mp_register_ioapic() is called during early boot stage when
-	 * walking ACPI/SFI/DT tables, it's too early to create irqdomain,
+	 * walking ACPI/DT tables, it's too early to create irqdomain,
 	 * we are still using bootmem allocator. So delay it to setup_IO_APIC().
 	 */
 	if (hotplug) {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 740f3bdb3f61..d883176ef2ce 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -16,7 +16,6 @@
 #include <linux/memblock.h>
 #include <linux/pci.h>
 #include <linux/root_dev.h>
-#include <linux/sfi.h>
 #include <linux/hugetlb.h>
 #include <linux/tboot.h>
 #include <linux/usb/xhci-dbgp.h>
@@ -1185,7 +1184,6 @@ void __init setup_arch(char **cmdline_p)
 	 * Read APIC and some other early information from ACPI tables.
 	 */
 	acpi_boot_init();
-	sfi_init();
 	x86_dtb_init();
 
 	/*
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 234998f196d4..de6bf0e7e8f8 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -11,9 +11,9 @@
  * themselves.
  */
 
+#include <linux/acpi.h>
 #include <linux/pci.h>
 #include <linux/init.h>
-#include <linux/sfi_acpi.h>
 #include <linux/bitmap.h>
 #include <linux/dmi.h>
 #include <linux/slab.h>
@@ -665,7 +665,7 @@ void __init pci_mmcfg_early_init(void)
 		if (pci_mmcfg_check_hostbridge())
 			known_bridge = 1;
 		else
-			acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+			acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
 		__pci_mmcfg_init(1);
 
 		set_apei_filter();
@@ -683,7 +683,7 @@ void __init pci_mmcfg_late_init(void)
 
 	/* MMCONFIG hasn't been enabled yet, try again */
 	if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) {
-		acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+		acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
 		__pci_mmcfg_init(0);
 	}
 }
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index d0e835470d01..d1f5228225d9 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -11,6 +11,5 @@ obj-y	+= intel-mid/
 obj-y	+= intel-quark/
 obj-y	+= olpc/
 obj-y	+= scx200/
-obj-y	+= sfi/
 obj-y	+= ts5500/
 obj-y	+= uv/
diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile
index 5794e661050c..ddfc08783fb8 100644
--- a/arch/x86/platform/intel-mid/Makefile
+++ b/arch/x86/platform/intel-mid/Makefile
@@ -1,7 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o pwr.o
-
-# SFI specific code
-ifdef CONFIG_X86_INTEL_MID
-obj-$(CONFIG_SFI) += sfi.o device_libs/
-endif
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
deleted file mode 100644
index 4d008b053ac8..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Family-Level Interface Shim (FLIS)
-obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o
-# SDHCI Devices
-obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o
-# WiFi + BT
-obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o
-obj-$(subst m,y,$(CONFIG_BT_HCIUART_BCM)) += platform_bt.o
-# SPI Devices
-obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o
-# I2C Devices
-obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o
-obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o
-obj-$(subst m,y,$(CONFIG_MPU3050_I2C)) += platform_mpu3050.o
-obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o
-obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o
-# I2C GPIO Expanders
-obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_max7315.o
-obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o
-obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o
-# MISC Devices
-obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o
-obj-$(subst m,y,$(CONFIG_RTC_DRV_CMOS)) += platform_mrfld_rtc.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
deleted file mode 100644
index 564c47c53f3a..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_bcm43xx.c: bcm43xx platform data initialization file
- *
- * (C) Copyright 2016 Intel Corporation
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/gpio/machine.h>
-#include <linux/platform_device.h>
-#include <linux/regulator/machine.h>
-#include <linux/regulator/fixed.h>
-#include <linux/sfi.h>
-
-#include <asm/intel-mid.h>
-
-#define WLAN_SFI_GPIO_IRQ_NAME		"WLAN-interrupt"
-#define WLAN_SFI_GPIO_ENABLE_NAME	"WLAN-enable"
-
-#define WLAN_DEV_NAME			"0000:00:01.3"
-
-static struct regulator_consumer_supply bcm43xx_vmmc_supply = {
-	.dev_name		= WLAN_DEV_NAME,
-	.supply			= "vmmc",
-};
-
-static struct regulator_init_data bcm43xx_vmmc_data = {
-	.constraints = {
-		.valid_ops_mask		= REGULATOR_CHANGE_STATUS,
-	},
-	.num_consumer_supplies	= 1,
-	.consumer_supplies	= &bcm43xx_vmmc_supply,
-};
-
-static struct fixed_voltage_config bcm43xx_vmmc = {
-	.supply_name		= "bcm43xx-vmmc-regulator",
-	/*
-	 * Announce 2.0V here to be compatible with SDIO specification. The
-	 * real voltage and signaling are still 1.8V.
-	 */
-	.microvolts		= 2000000,		/* 1.8V */
-	.startup_delay		= 250 * 1000,		/* 250ms */
-	.enabled_at_boot	= 0,			/* disabled at boot */
-	.init_data		= &bcm43xx_vmmc_data,
-};
-
-static struct platform_device bcm43xx_vmmc_regulator = {
-	.name		= "reg-fixed-voltage",
-	.id		= PLATFORM_DEVID_AUTO,
-	.dev = {
-		.platform_data	= &bcm43xx_vmmc,
-	},
-};
-
-static struct gpiod_lookup_table bcm43xx_vmmc_gpio_table = {
-	.dev_id	= "reg-fixed-voltage.0",
-	.table	= {
-		GPIO_LOOKUP("0000:00:0c.0", -1, NULL, GPIO_ACTIVE_LOW),
-		{}
-	},
-};
-
-static int __init bcm43xx_regulator_register(void)
-{
-	struct gpiod_lookup_table *table = &bcm43xx_vmmc_gpio_table;
-	struct gpiod_lookup *lookup = table->table;
-	int ret;
-
-	lookup[0].chip_hwnum = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME);
-	gpiod_add_lookup_table(table);
-
-	ret = platform_device_register(&bcm43xx_vmmc_regulator);
-	if (ret) {
-		pr_err("%s: vmmc regulator register failed\n", __func__);
-		return ret;
-	}
-
-	return 0;
-}
-
-static void __init *bcm43xx_platform_data(void *info)
-{
-	int ret;
-
-	ret = bcm43xx_regulator_register();
-	if (ret)
-		return NULL;
-
-	pr_info("Using generic wifi platform data\n");
-
-	/* For now it's empty */
-	return NULL;
-}
-
-static const struct devs_id bcm43xx_clk_vmmc_dev_id __initconst = {
-	.name			= "bcm43xx_clk_vmmc",
-	.type			= SFI_DEV_TYPE_SD,
-	.get_platform_data	= &bcm43xx_platform_data,
-};
-
-sfi_device(bcm43xx_clk_vmmc_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
deleted file mode 100644
index 32912a17f68e..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
+++ /dev/null
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_bma023.c: bma023 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- */
-
-#include <asm/intel-mid.h>
-
-static const struct devs_id bma023_dev_id __initconst = {
-	.name = "bma023",
-	.type = SFI_DEV_TYPE_I2C,
-	.delay = 1,
-};
-
-sfi_device(bma023_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
deleted file mode 100644
index 31dda18bb370..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Bluetooth platform data initialization file
- *
- * (C) Copyright 2017 Intel Corporation
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/gpio/machine.h>
-#include <linux/pci.h>
-#include <linux/platform_device.h>
-
-#include <asm/cpu_device_id.h>
-#include <asm/intel-family.h>
-#include <asm/intel-mid.h>
-
-struct bt_sfi_data {
-	struct device *dev;
-	const char *name;
-	int (*setup)(struct bt_sfi_data *ddata);
-};
-
-static struct gpiod_lookup_table tng_bt_sfi_gpio_table = {
-	.dev_id	= "hci_bcm",
-	.table	= {
-		GPIO_LOOKUP("0000:00:0c.0", -1, "device-wakeup", GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP("0000:00:0c.0", -1, "shutdown",      GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP("0000:00:0c.0", -1, "host-wakeup",   GPIO_ACTIVE_HIGH),
-		{ },
-	},
-};
-
-#define TNG_BT_SFI_GPIO_DEVICE_WAKEUP	"bt_wakeup"
-#define TNG_BT_SFI_GPIO_SHUTDOWN	"BT-reset"
-#define TNG_BT_SFI_GPIO_HOST_WAKEUP	"bt_uart_enable"
-
-static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata)
-{
-	struct gpiod_lookup_table *table = &tng_bt_sfi_gpio_table;
-	struct gpiod_lookup *lookup = table->table;
-	struct pci_dev *pdev;
-
-	/* Connected to /dev/ttyS0 */
-	pdev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(4, 1));
-	if (!pdev)
-		return -ENODEV;
-
-	ddata->dev = &pdev->dev;
-	ddata->name = table->dev_id;
-
-	lookup[0].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_DEVICE_WAKEUP);
-	lookup[1].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_SHUTDOWN);
-	lookup[2].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_HOST_WAKEUP);
-
-	gpiod_add_lookup_table(table);
-	return 0;
-}
-
-static struct bt_sfi_data tng_bt_sfi_data __initdata = {
-	.setup	= tng_bt_sfi_setup,
-};
-
-static const struct x86_cpu_id bt_sfi_cpu_ids[] = {
-	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID,	&tng_bt_sfi_data),
-	{}
-};
-
-static int __init bt_sfi_init(void)
-{
-	struct platform_device_info info;
-	struct platform_device *pdev;
-	const struct x86_cpu_id *id;
-	struct bt_sfi_data *ddata;
-	int ret;
-
-	id = x86_match_cpu(bt_sfi_cpu_ids);
-	if (!id)
-		return -ENODEV;
-
-	ddata = (struct bt_sfi_data *)id->driver_data;
-	if (!ddata)
-		return -ENODEV;
-
-	ret = ddata->setup(ddata);
-	if (ret)
-		return ret;
-
-	memset(&info, 0, sizeof(info));
-	info.fwnode	= ddata->dev->fwnode;
-	info.parent	= ddata->dev;
-	info.name	= ddata->name,
-	info.id		= PLATFORM_DEVID_NONE,
-
-	pdev = platform_device_register_full(&info);
-	if (IS_ERR(pdev))
-		return PTR_ERR(pdev);
-
-	dev_info(ddata->dev, "Registered Bluetooth device: %s\n", ddata->name);
-	return 0;
-}
-device_initcall(bt_sfi_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
deleted file mode 100644
index a2508582a0b1..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_emc1403.c: emc1403 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/gpio.h>
-#include <linux/i2c.h>
-#include <asm/intel-mid.h>
-
-static void __init *emc1403_platform_data(void *info)
-{
-	static short intr2nd_pdata;
-	struct i2c_board_info *i2c_info = info;
-	int intr = get_gpio_by_name("thermal_int");
-	int intr2nd = get_gpio_by_name("thermal_alert");
-
-	if (intr < 0)
-		return NULL;
-	if (intr2nd < 0)
-		return NULL;
-
-	i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-	intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET;
-
-	return &intr2nd_pdata;
-}
-
-static const struct devs_id emc1403_dev_id __initconst = {
-	.name = "emc1403",
-	.type = SFI_DEV_TYPE_I2C,
-	.delay = 1,
-	.get_platform_data = &emc1403_platform_data,
-};
-
-sfi_device(emc1403_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
deleted file mode 100644
index d9435d2196a4..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
+++ /dev/null
@@ -1,81 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_gpio_keys.c: gpio_keys platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/input.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/gpio.h>
-#include <linux/gpio_keys.h>
-#include <linux/platform_device.h>
-#include <asm/intel-mid.h>
-
-#define DEVICE_NAME "gpio-keys"
-
-/*
- * we will search these buttons in SFI GPIO table (by name)
- * and register them dynamically. Please add all possible
- * buttons here, we will shrink them if no GPIO found.
- */
-static struct gpio_keys_button gpio_button[] = {
-	{KEY_POWER,		-1, 1, "power_btn",	EV_KEY, 0, 3000},
-	{KEY_PROG1,		-1, 1, "prog_btn1",	EV_KEY, 0, 20},
-	{KEY_PROG2,		-1, 1, "prog_btn2",	EV_KEY, 0, 20},
-	{SW_LID,		-1, 1, "lid_switch",	EV_SW,  0, 20},
-	{KEY_VOLUMEUP,		-1, 1, "vol_up",	EV_KEY, 0, 20},
-	{KEY_VOLUMEDOWN,	-1, 1, "vol_down",	EV_KEY, 0, 20},
-	{KEY_MUTE,		-1, 1, "mute_enable",	EV_KEY, 0, 20},
-	{KEY_VOLUMEUP,		-1, 1, "volume_up",	EV_KEY, 0, 20},
-	{KEY_VOLUMEDOWN,	-1, 1, "volume_down",	EV_KEY, 0, 20},
-	{KEY_CAMERA,		-1, 1, "camera_full",	EV_KEY, 0, 20},
-	{KEY_CAMERA_FOCUS,	-1, 1, "camera_half",	EV_KEY, 0, 20},
-	{SW_KEYPAD_SLIDE,	-1, 1, "MagSw1",	EV_SW,  0, 20},
-	{SW_KEYPAD_SLIDE,	-1, 1, "MagSw2",	EV_SW,  0, 20},
-};
-
-static struct gpio_keys_platform_data gpio_keys = {
-	.buttons	= gpio_button,
-	.rep		= 1,
-	.nbuttons	= -1, /* will fill it after search */
-};
-
-static struct platform_device pb_device = {
-	.name		= DEVICE_NAME,
-	.id		= -1,
-	.dev		= {
-		.platform_data	= &gpio_keys,
-	},
-};
-
-/*
- * Shrink the non-existent buttons, register the gpio button
- * device if there is some
- */
-static int __init pb_keys_init(void)
-{
-	struct gpio_keys_button *gb = gpio_button;
-	int i, good = 0;
-
-	for (i = 0; i < ARRAY_SIZE(gpio_button); i++) {
-		gb[i].gpio = get_gpio_by_name(gb[i].desc);
-		pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc,
-					gb[i].gpio);
-		if (gb[i].gpio < 0)
-			continue;
-
-		if (i != good)
-			gb[good] = gb[i];
-		good++;
-	}
-
-	if (good) {
-		gpio_keys.nbuttons = good;
-		return platform_device_register(&pb_device);
-	}
-	return 0;
-}
-late_initcall(pb_keys_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
deleted file mode 100644
index a4485cd638c6..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_lis331.c:  lis331 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/i2c.h>
-#include <linux/gpio.h>
-#include <asm/intel-mid.h>
-
-static void __init *lis331dl_platform_data(void *info)
-{
-	static short intr2nd_pdata;
-	struct i2c_board_info *i2c_info = info;
-	int intr = get_gpio_by_name("accel_int");
-	int intr2nd = get_gpio_by_name("accel_2");
-
-	if (intr < 0)
-		return NULL;
-	if (intr2nd < 0)
-		return NULL;
-
-	i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-	intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET;
-
-	return &intr2nd_pdata;
-}
-
-static const struct devs_id lis331dl_dev_id __initconst = {
-	.name = "i2c_accel",
-	.type = SFI_DEV_TYPE_I2C,
-	.get_platform_data = &lis331dl_platform_data,
-};
-
-sfi_device(lis331dl_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
deleted file mode 100644
index e9287c3184da..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_max7315.c: max7315 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/gpio.h>
-#include <linux/i2c.h>
-#include <linux/platform_data/pca953x.h>
-#include <asm/intel-mid.h>
-
-#define MAX7315_NUM 2
-
-static void __init *max7315_platform_data(void *info)
-{
-	static struct pca953x_platform_data max7315_pdata[MAX7315_NUM];
-	static int nr;
-	struct pca953x_platform_data *max7315 = &max7315_pdata[nr];
-	struct i2c_board_info *i2c_info = info;
-	int gpio_base, intr;
-	char base_pin_name[SFI_NAME_LEN + 1];
-	char intr_pin_name[SFI_NAME_LEN + 1];
-
-	if (nr == MAX7315_NUM) {
-		pr_err("too many max7315s, we only support %d\n",
-				MAX7315_NUM);
-		return NULL;
-	}
-	/* we have several max7315 on the board, we only need load several
-	 * instances of the same pca953x driver to cover them
-	 */
-	strcpy(i2c_info->type, "max7315");
-	if (nr++) {
-		snprintf(base_pin_name, sizeof(base_pin_name),
-			 "max7315_%d_base", nr);
-		snprintf(intr_pin_name, sizeof(intr_pin_name),
-			 "max7315_%d_int", nr);
-	} else {
-		strcpy(base_pin_name, "max7315_base");
-		strcpy(intr_pin_name, "max7315_int");
-	}
-
-	gpio_base = get_gpio_by_name(base_pin_name);
-	intr = get_gpio_by_name(intr_pin_name);
-
-	if (gpio_base < 0)
-		return NULL;
-	max7315->gpio_base = gpio_base;
-	if (intr != -1) {
-		i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-		max7315->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
-	} else {
-		i2c_info->irq = -1;
-		max7315->irq_base = -1;
-	}
-	return max7315;
-}
-
-static const struct devs_id max7315_dev_id __initconst = {
-	.name = "i2c_max7315",
-	.type = SFI_DEV_TYPE_I2C,
-	.delay = 1,
-	.get_platform_data = &max7315_platform_data,
-};
-
-static const struct devs_id max7315_2_dev_id __initconst = {
-	.name = "i2c_max7315_2",
-	.type = SFI_DEV_TYPE_I2C,
-	.delay = 1,
-	.get_platform_data = &max7315_platform_data,
-};
-
-sfi_device(max7315_dev_id);
-sfi_device(max7315_2_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
deleted file mode 100644
index 28a182713934..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_mpu3050.c: mpu3050 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/gpio.h>
-#include <linux/i2c.h>
-#include <asm/intel-mid.h>
-
-static void *mpu3050_platform_data(void *info)
-{
-	struct i2c_board_info *i2c_info = info;
-	int intr = get_gpio_by_name("mpu3050_int");
-
-	if (intr < 0)
-		return NULL;
-
-	i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-	return NULL;
-}
-
-static const struct devs_id mpu3050_dev_id __initconst = {
-	.name = "mpu3050",
-	.type = SFI_DEV_TYPE_I2C,
-	.delay = 1,
-	.get_platform_data = &mpu3050_platform_data,
-};
-
-sfi_device(mpu3050_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c
deleted file mode 100644
index 605e1f94ad89..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel Merrifield FLIS platform device initialization file
- *
- * Copyright (C) 2016, Intel Corporation
- *
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/ioport.h>
-#include <linux/platform_device.h>
-
-#include <asm/intel-mid.h>
-
-#define FLIS_BASE_ADDR			0xff0c0000
-#define FLIS_LENGTH			0x8000
-
-static struct resource mrfld_pinctrl_mmio_resource = {
-	.start		= FLIS_BASE_ADDR,
-	.end		= FLIS_BASE_ADDR + FLIS_LENGTH - 1,
-	.flags		= IORESOURCE_MEM,
-};
-
-static struct platform_device mrfld_pinctrl_device = {
-	.name		= "pinctrl-merrifield",
-	.id		= PLATFORM_DEVID_NONE,
-	.resource	= &mrfld_pinctrl_mmio_resource,
-	.num_resources	= 1,
-};
-
-static int __init mrfld_pinctrl_init(void)
-{
-	if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
-		return platform_device_register(&mrfld_pinctrl_device);
-
-	return -ENODEV;
-}
-arch_initcall(mrfld_pinctrl_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c
deleted file mode 100644
index 40e9808a9634..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel Merrifield legacy RTC initialization file
- *
- * (C) Copyright 2017 Intel Corporation
- *
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/init.h>
-
-#include <asm/hw_irq.h>
-#include <asm/intel-mid.h>
-#include <asm/io_apic.h>
-#include <asm/time.h>
-#include <asm/x86_init.h>
-
-static int __init mrfld_legacy_rtc_alloc_irq(void)
-{
-	struct irq_alloc_info info;
-	int ret;
-
-	if (!x86_platform.legacy.rtc)
-		return -ENODEV;
-
-	ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, 0);
-	ret = mp_map_gsi_to_irq(RTC_IRQ, IOAPIC_MAP_ALLOC, &info);
-	if (ret < 0) {
-		pr_info("Failed to allocate RTC interrupt. Disabling RTC\n");
-		x86_platform.legacy.rtc = 0;
-		return ret;
-	}
-
-	return 0;
-}
-
-static int __init mrfld_legacy_rtc_init(void)
-{
-	if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
-		return -ENODEV;
-
-	return mrfld_legacy_rtc_alloc_irq();
-}
-arch_initcall(mrfld_legacy_rtc_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
deleted file mode 100644
index fe3b7ff975f3..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * SDHCI platform data initilisation file
- *
- * (C) Copyright 2016 Intel Corporation
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/pci.h>
-
-#include <linux/mmc/sdhci-pci-data.h>
-
-#include <asm/intel-mid.h>
-
-#define INTEL_MRFLD_SD			2
-#define INTEL_MRFLD_SD_CD_GPIO		77
-
-static struct sdhci_pci_data mrfld_sdhci_pci_data = {
-	.rst_n_gpio	= -EINVAL,
-	.cd_gpio	= INTEL_MRFLD_SD_CD_GPIO,
-};
-
-static struct sdhci_pci_data *
-mrfld_sdhci_pci_get_data(struct pci_dev *pdev, int slotno)
-{
-	unsigned int func = PCI_FUNC(pdev->devfn);
-
-	if (func == INTEL_MRFLD_SD)
-		return &mrfld_sdhci_pci_data;
-
-	return NULL;
-}
-
-static int __init mrfld_sd_init(void)
-{
-	if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
-		return -ENODEV;
-
-	sdhci_pci_get_data = mrfld_sdhci_pci_get_data;
-	return 0;
-}
-arch_initcall(mrfld_sd_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c
deleted file mode 100644
index b828f4fd40be..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * spidev platform data initialization file
- *
- * (C) Copyright 2014, 2016 Intel Corporation
- * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- *	    Dan O'Donovan <dan@emutex.com>
- */
-
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/spi/pxa2xx_spi.h>
-#include <linux/spi/spi.h>
-
-#include <asm/intel-mid.h>
-
-#define MRFLD_SPI_DEFAULT_DMA_BURST	8
-#define MRFLD_SPI_DEFAULT_TIMEOUT	500
-
-/* GPIO pin for spidev chipselect */
-#define MRFLD_SPIDEV_GPIO_CS		111
-
-static struct pxa2xx_spi_chip spidev_spi_chip = {
-	.dma_burst_size		= MRFLD_SPI_DEFAULT_DMA_BURST,
-	.timeout		= MRFLD_SPI_DEFAULT_TIMEOUT,
-	.gpio_cs		= MRFLD_SPIDEV_GPIO_CS,
-};
-
-static void __init *spidev_platform_data(void *info)
-{
-	struct spi_board_info *spi_info = info;
-
-	if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
-		return ERR_PTR(-ENODEV);
-
-	spi_info->mode = SPI_MODE_0;
-	spi_info->controller_data = &spidev_spi_chip;
-
-	return NULL;
-}
-
-static const struct devs_id spidev_dev_id __initconst = {
-	.name			= "spidev",
-	.type			= SFI_DEV_TYPE_SPI,
-	.delay			= 0,
-	.get_platform_data	= &spidev_platform_data,
-};
-
-sfi_device(spidev_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c b/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c
deleted file mode 100644
index 5609d8da3978..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c
+++ /dev/null
@@ -1,95 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * PCAL9555a platform data initialization file
- *
- * Copyright (C) 2016, Intel Corporation
- *
- * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- *	    Dan O'Donovan <dan@emutex.com>
- */
-
-#include <linux/gpio.h>
-#include <linux/init.h>
-#include <linux/i2c.h>
-#include <linux/platform_data/pca953x.h>
-#include <linux/sfi.h>
-
-#include <asm/intel-mid.h>
-
-#define PCAL9555A_NUM	4
-
-static struct pca953x_platform_data pcal9555a_pdata[PCAL9555A_NUM];
-static int nr;
-
-static void __init *pcal9555a_platform_data(void *info)
-{
-	struct i2c_board_info *i2c_info = info;
-	char *type = i2c_info->type;
-	struct pca953x_platform_data *pcal9555a;
-	char base_pin_name[SFI_NAME_LEN + 1];
-	char intr_pin_name[SFI_NAME_LEN + 1];
-	int gpio_base, intr;
-
-	snprintf(base_pin_name, sizeof(base_pin_name), "%s_base", type);
-	snprintf(intr_pin_name, sizeof(intr_pin_name), "%s_int", type);
-
-	gpio_base = get_gpio_by_name(base_pin_name);
-	intr = get_gpio_by_name(intr_pin_name);
-
-	/* Check if the SFI record valid */
-	if (gpio_base == -1)
-		return NULL;
-
-	if (nr >= PCAL9555A_NUM) {
-		pr_err("%s: Too many instances, only %d supported\n", __func__,
-		       PCAL9555A_NUM);
-		return NULL;
-	}
-
-	pcal9555a = &pcal9555a_pdata[nr++];
-	pcal9555a->gpio_base = gpio_base;
-
-	if (intr >= 0) {
-		i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-		pcal9555a->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
-	} else {
-		i2c_info->irq = -1;
-		pcal9555a->irq_base = -1;
-	}
-
-	strcpy(type, "pcal9555a");
-	return pcal9555a;
-}
-
-static const struct devs_id pcal9555a_1_dev_id __initconst = {
-	.name			= "pcal9555a-1",
-	.type			= SFI_DEV_TYPE_I2C,
-	.delay			= 1,
-	.get_platform_data	= &pcal9555a_platform_data,
-};
-
-static const struct devs_id pcal9555a_2_dev_id __initconst = {
-	.name			= "pcal9555a-2",
-	.type			= SFI_DEV_TYPE_I2C,
-	.delay			= 1,
-	.get_platform_data	= &pcal9555a_platform_data,
-};
-
-static const struct devs_id pcal9555a_3_dev_id __initconst = {
-	.name			= "pcal9555a-3",
-	.type			= SFI_DEV_TYPE_I2C,
-	.delay			= 1,
-	.get_platform_data	= &pcal9555a_platform_data,
-};
-
-static const struct devs_id pcal9555a_4_dev_id __initconst = {
-	.name			= "pcal9555a-4",
-	.type			= SFI_DEV_TYPE_I2C,
-	.delay			= 1,
-	.get_platform_data	= &pcal9555a_platform_data,
-};
-
-sfi_device(pcal9555a_1_dev_id);
-sfi_device(pcal9555a_2_dev_id);
-sfi_device(pcal9555a_3_dev_id);
-sfi_device(pcal9555a_4_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
deleted file mode 100644
index 139738bbdd36..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
+++ /dev/null
@@ -1,42 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_tc35876x.c: tc35876x platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/gpio/machine.h>
-#include <asm/intel-mid.h>
-
-static struct gpiod_lookup_table tc35876x_gpio_table = {
-	.dev_id	= "i2c_disp_brig",
-	.table	= {
-		GPIO_LOOKUP("0000:00:0c.0", -1, "bridge-reset", GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP("0000:00:0c.0", -1, "bl-en", GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP("0000:00:0c.0", -1, "vadd", GPIO_ACTIVE_HIGH),
-		{ },
-	},
-};
-
-/*tc35876x DSI_LVDS bridge chip and panel platform data*/
-static void *tc35876x_platform_data(void *data)
-{
-	struct gpiod_lookup_table *table = &tc35876x_gpio_table;
-	struct gpiod_lookup *lookup = table->table;
-
-	lookup[0].chip_hwnum = get_gpio_by_name("LCMB_RXEN");
-	lookup[1].chip_hwnum = get_gpio_by_name("6S6P_BL_EN");
-	lookup[2].chip_hwnum = get_gpio_by_name("EN_VREG_LCD_V3P3");
-	gpiod_add_lookup_table(table);
-
-	return NULL;
-}
-
-static const struct devs_id tc35876x_dev_id __initconst = {
-	.name = "i2c_disp_brig",
-	.type = SFI_DEV_TYPE_I2C,
-	.get_platform_data = &tc35876x_platform_data,
-};
-
-sfi_device(tc35876x_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
deleted file mode 100644
index e689d8f61059..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_tca6416.c: tca6416 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/platform_data/pca953x.h>
-#include <linux/i2c.h>
-#include <linux/gpio.h>
-#include <asm/intel-mid.h>
-
-#define TCA6416_NAME	"tca6416"
-#define TCA6416_BASE	"tca6416_base"
-#define TCA6416_INTR	"tca6416_int"
-
-static void *tca6416_platform_data(void *info)
-{
-	static struct pca953x_platform_data tca6416;
-	struct i2c_board_info *i2c_info = info;
-	int gpio_base, intr;
-	char base_pin_name[SFI_NAME_LEN + 1];
-	char intr_pin_name[SFI_NAME_LEN + 1];
-
-	strcpy(i2c_info->type, TCA6416_NAME);
-	strcpy(base_pin_name, TCA6416_BASE);
-	strcpy(intr_pin_name, TCA6416_INTR);
-
-	gpio_base = get_gpio_by_name(base_pin_name);
-	intr = get_gpio_by_name(intr_pin_name);
-
-	if (gpio_base < 0)
-		return NULL;
-	tca6416.gpio_base = gpio_base;
-	if (intr >= 0) {
-		i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-		tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
-	} else {
-		i2c_info->irq = -1;
-		tca6416.irq_base = -1;
-	}
-	return &tca6416;
-}
-
-static const struct devs_id tca6416_dev_id __initconst = {
-	.name = "tca6416",
-	.type = SFI_DEV_TYPE_I2C,
-	.delay = 1,
-	.get_platform_data = &tca6416_platform_data,
-};
-
-sfi_device(tca6416_dev_id);
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 864b0c158b2f..2c044e260b4b 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -14,7 +14,6 @@
 #include <linux/interrupt.h>
 #include <linux/regulator/machine.h>
 #include <linux/scatterlist.h>
-#include <linux/sfi.h>
 #include <linux/irq.h>
 #include <linux/export.h>
 #include <linux/notifier.h>
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
deleted file mode 100644
index 63ae342ffb12..000000000000
--- a/arch/x86/platform/intel-mid/sfi.c
+++ /dev/null
@@ -1,419 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * intel_mid_sfi.c: Intel MID SFI initialization code
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/scatterlist.h>
-#include <linux/sfi.h>
-#include <linux/spi/spi.h>
-#include <linux/i2c.h>
-#include <linux/skbuff.h>
-#include <linux/gpio.h>
-#include <linux/gpio_keys.h>
-#include <linux/input.h>
-#include <linux/platform_device.h>
-#include <linux/irq.h>
-#include <linux/export.h>
-#include <linux/notifier.h>
-#include <linux/mmc/core.h>
-#include <linux/mmc/card.h>
-#include <linux/blkdev.h>
-
-#include <asm/setup.h>
-#include <asm/mpspec_def.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/io_apic.h>
-#include <asm/intel-mid.h>
-#include <asm/io.h>
-#include <asm/i8259.h>
-#include <asm/intel_scu_ipc.h>
-#include <asm/reboot.h>
-
-#define	SFI_SIG_OEM0	"OEM0"
-#define MAX_IPCDEVS	24
-#define MAX_SCU_SPI	24
-#define MAX_SCU_I2C	24
-
-static struct platform_device *ipc_devs[MAX_IPCDEVS];
-static struct spi_board_info *spi_devs[MAX_SCU_SPI];
-static struct i2c_board_info *i2c_devs[MAX_SCU_I2C];
-static struct sfi_gpio_table_entry *gpio_table;
-static int ipc_next_dev;
-static int spi_next_dev;
-static int i2c_next_dev;
-static int i2c_bus[MAX_SCU_I2C];
-static int gpio_num_entry;
-
-struct blocking_notifier_head intel_scu_notifier =
-			BLOCKING_NOTIFIER_INIT(intel_scu_notifier);
-EXPORT_SYMBOL_GPL(intel_scu_notifier);
-
-#define intel_mid_sfi_get_pdata(dev, priv)	\
-	((dev)->get_platform_data ? (dev)->get_platform_data(priv) : NULL)
-
-/*
- * Parsing GPIO table first, since the DEVS table will need this table
- * to map the pin name to the actual pin.
- */
-static int __init sfi_parse_gpio(struct sfi_table_header *table)
-{
-	struct sfi_table_simple *sb;
-	struct sfi_gpio_table_entry *pentry;
-	int num, i;
-
-	if (gpio_table)
-		return 0;
-	sb = (struct sfi_table_simple *)table;
-	num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
-	pentry = (struct sfi_gpio_table_entry *)sb->pentry;
-
-	gpio_table = kmemdup(pentry, num * sizeof(*pentry), GFP_KERNEL);
-	if (!gpio_table)
-		return -1;
-	gpio_num_entry = num;
-
-	pr_debug("GPIO pin info:\n");
-	for (i = 0; i < num; i++, pentry++)
-		pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s,"
-		" pin = %d\n", i,
-			pentry->controller_name,
-			pentry->pin_name,
-			pentry->pin_no);
-	return 0;
-}
-
-int get_gpio_by_name(const char *name)
-{
-	struct sfi_gpio_table_entry *pentry = gpio_table;
-	int i;
-
-	if (!pentry)
-		return -1;
-	for (i = 0; i < gpio_num_entry; i++, pentry++) {
-		if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
-			return pentry->pin_no;
-	}
-	return -EINVAL;
-}
-
-static void __init intel_scu_ipc_device_register(struct platform_device *pdev)
-{
-	if (ipc_next_dev == MAX_IPCDEVS)
-		pr_err("too many SCU IPC devices");
-	else
-		ipc_devs[ipc_next_dev++] = pdev;
-}
-
-static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
-{
-	struct spi_board_info *new_dev;
-
-	if (spi_next_dev == MAX_SCU_SPI) {
-		pr_err("too many SCU SPI devices");
-		return;
-	}
-
-	new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL);
-	if (!new_dev) {
-		pr_err("failed to alloc mem for delayed spi dev %s\n",
-			sdev->modalias);
-		return;
-	}
-	*new_dev = *sdev;
-
-	spi_devs[spi_next_dev++] = new_dev;
-}
-
-static void __init intel_scu_i2c_device_register(int bus,
-						struct i2c_board_info *idev)
-{
-	struct i2c_board_info *new_dev;
-
-	if (i2c_next_dev == MAX_SCU_I2C) {
-		pr_err("too many SCU I2C devices");
-		return;
-	}
-
-	new_dev = kzalloc(sizeof(*idev), GFP_KERNEL);
-	if (!new_dev) {
-		pr_err("failed to alloc mem for delayed i2c dev %s\n",
-			idev->type);
-		return;
-	}
-	*new_dev = *idev;
-
-	i2c_bus[i2c_next_dev] = bus;
-	i2c_devs[i2c_next_dev++] = new_dev;
-}
-
-/* Called by IPC driver */
-void intel_scu_devices_create(void)
-{
-	int i;
-
-	for (i = 0; i < ipc_next_dev; i++)
-		platform_device_add(ipc_devs[i]);
-
-	for (i = 0; i < spi_next_dev; i++)
-		spi_register_board_info(spi_devs[i], 1);
-
-	for (i = 0; i < i2c_next_dev; i++) {
-		struct i2c_adapter *adapter;
-		struct i2c_client *client;
-
-		adapter = i2c_get_adapter(i2c_bus[i]);
-		if (adapter) {
-			client = i2c_new_client_device(adapter, i2c_devs[i]);
-			if (IS_ERR(client))
-				pr_err("can't create i2c device %s\n",
-					i2c_devs[i]->type);
-		} else
-			i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1);
-	}
-	intel_scu_notifier_post(SCU_AVAILABLE, NULL);
-}
-EXPORT_SYMBOL_GPL(intel_scu_devices_create);
-
-/* Called by IPC driver */
-void intel_scu_devices_destroy(void)
-{
-	int i;
-
-	intel_scu_notifier_post(SCU_DOWN, NULL);
-
-	for (i = 0; i < ipc_next_dev; i++)
-		platform_device_del(ipc_devs[i]);
-}
-EXPORT_SYMBOL_GPL(intel_scu_devices_destroy);
-
-static void __init install_irq_resource(struct platform_device *pdev, int irq)
-{
-	/* Single threaded */
-	static struct resource res __initdata = {
-		.name = "IRQ",
-		.flags = IORESOURCE_IRQ,
-	};
-	res.start = irq;
-	platform_device_add_resources(pdev, &res, 1);
-}
-
-static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry,
-					struct devs_id *dev)
-{
-	struct platform_device *pdev;
-	void *pdata = NULL;
-
-	pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n",
-		pentry->name, pentry->irq);
-
-	/*
-	 * We need to call platform init of IPC devices to fill misc_pdata
-	 * structure. It will be used in msic_init for initialization.
-	 */
-	pdata = intel_mid_sfi_get_pdata(dev, pentry);
-	if (IS_ERR(pdata))
-		return;
-
-	pdev = platform_device_alloc(pentry->name, 0);
-	if (pdev == NULL) {
-		pr_err("out of memory for SFI platform device '%s'.\n",
-			pentry->name);
-		return;
-	}
-	install_irq_resource(pdev, pentry->irq);
-
-	pdev->dev.platform_data = pdata;
-	if (dev->delay)
-		intel_scu_ipc_device_register(pdev);
-	else
-		platform_device_add(pdev);
-}
-
-static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry,
-					struct devs_id *dev)
-{
-	struct spi_board_info spi_info;
-	void *pdata = NULL;
-
-	memset(&spi_info, 0, sizeof(spi_info));
-	strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
-	spi_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq);
-	spi_info.bus_num = pentry->host_num;
-	spi_info.chip_select = pentry->addr;
-	spi_info.max_speed_hz = pentry->max_freq;
-	pr_debug("SPI bus=%d, name=%16.16s, irq=0x%2x, max_freq=%d, cs=%d\n",
-		spi_info.bus_num,
-		spi_info.modalias,
-		spi_info.irq,
-		spi_info.max_speed_hz,
-		spi_info.chip_select);
-
-	pdata = intel_mid_sfi_get_pdata(dev, &spi_info);
-	if (IS_ERR(pdata))
-		return;
-
-	spi_info.platform_data = pdata;
-	if (dev->delay)
-		intel_scu_spi_device_register(&spi_info);
-	else
-		spi_register_board_info(&spi_info, 1);
-}
-
-static void __init sfi_handle_i2c_dev(struct sfi_device_table_entry *pentry,
-					struct devs_id *dev)
-{
-	struct i2c_board_info i2c_info;
-	void *pdata = NULL;
-
-	memset(&i2c_info, 0, sizeof(i2c_info));
-	strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
-	i2c_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq);
-	i2c_info.addr = pentry->addr;
-	pr_debug("I2C bus = %d, name = %16.16s, irq = 0x%2x, addr = 0x%x\n",
-		pentry->host_num,
-		i2c_info.type,
-		i2c_info.irq,
-		i2c_info.addr);
-	pdata = intel_mid_sfi_get_pdata(dev, &i2c_info);
-	i2c_info.platform_data = pdata;
-	if (IS_ERR(pdata))
-		return;
-
-	if (dev->delay)
-		intel_scu_i2c_device_register(pentry->host_num, &i2c_info);
-	else
-		i2c_register_board_info(pentry->host_num, &i2c_info, 1);
-}
-
-static void __init sfi_handle_sd_dev(struct sfi_device_table_entry *pentry,
-					struct devs_id *dev)
-{
-	struct mid_sd_board_info sd_info;
-	void *pdata;
-
-	memset(&sd_info, 0, sizeof(sd_info));
-	strncpy(sd_info.name, pentry->name, SFI_NAME_LEN);
-	sd_info.bus_num = pentry->host_num;
-	sd_info.max_clk = pentry->max_freq;
-	sd_info.addr = pentry->addr;
-	pr_debug("SD bus = %d, name = %16.16s, max_clk = %d, addr = 0x%x\n",
-		 sd_info.bus_num,
-		 sd_info.name,
-		 sd_info.max_clk,
-		 sd_info.addr);
-	pdata = intel_mid_sfi_get_pdata(dev, &sd_info);
-	if (IS_ERR(pdata))
-		return;
-
-	/* Nothing we can do with this for now */
-	sd_info.platform_data = pdata;
-
-	pr_debug("Successfully registered %16.16s", sd_info.name);
-}
-
-extern struct devs_id *const __x86_intel_mid_dev_start[],
-		      *const __x86_intel_mid_dev_end[];
-
-static struct devs_id __init *get_device_id(u8 type, char *name)
-{
-	struct devs_id *const *dev_table;
-
-	for (dev_table = __x86_intel_mid_dev_start;
-			dev_table < __x86_intel_mid_dev_end; dev_table++) {
-		struct devs_id *dev = *dev_table;
-		if (dev->type == type &&
-			!strncmp(dev->name, name, SFI_NAME_LEN)) {
-			return dev;
-		}
-	}
-
-	return NULL;
-}
-
-static int __init sfi_parse_devs(struct sfi_table_header *table)
-{
-	struct sfi_table_simple *sb;
-	struct sfi_device_table_entry *pentry;
-	struct devs_id *dev = NULL;
-	int num, i, ret;
-	int polarity;
-	struct irq_alloc_info info;
-
-	sb = (struct sfi_table_simple *)table;
-	num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
-	pentry = (struct sfi_device_table_entry *)sb->pentry;
-
-	for (i = 0; i < num; i++, pentry++) {
-		int irq = pentry->irq;
-
-		if (irq != (u8)0xff) { /* native RTE case */
-			/* these SPI2 devices are not exposed to system as PCI
-			 * devices, but they have separate RTE entry in IOAPIC
-			 * so we have to enable them one by one here
-			 */
-			if (intel_mid_identify_cpu() ==
-					INTEL_MID_CPU_CHIP_TANGIER) {
-				if (!strncmp(pentry->name, "r69001-ts-i2c", 13))
-					/* active low */
-					polarity = 1;
-				else if (!strncmp(pentry->name,
-						"synaptics_3202", 14))
-					/* active low */
-					polarity = 1;
-				else if (irq == 41)
-					/* fast_int_1 */
-					polarity = 1;
-				else
-					/* active high */
-					polarity = 0;
-			} else {
-				/* PNW and CLV go with active low */
-				polarity = 1;
-			}
-
-			ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity);
-			ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info);
-			WARN_ON(ret < 0);
-		}
-
-		dev = get_device_id(pentry->type, pentry->name);
-
-		if (!dev)
-			continue;
-
-		switch (pentry->type) {
-		case SFI_DEV_TYPE_IPC:
-			sfi_handle_ipc_dev(pentry, dev);
-			break;
-		case SFI_DEV_TYPE_SPI:
-			sfi_handle_spi_dev(pentry, dev);
-			break;
-		case SFI_DEV_TYPE_I2C:
-			sfi_handle_i2c_dev(pentry, dev);
-			break;
-		case SFI_DEV_TYPE_SD:
-			sfi_handle_sd_dev(pentry, dev);
-			break;
-		case SFI_DEV_TYPE_UART:
-		case SFI_DEV_TYPE_HSI:
-		default:
-			break;
-		}
-	}
-	return 0;
-}
-
-static int __init intel_mid_platform_init(void)
-{
-	sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio);
-	sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs);
-	return 0;
-}
-arch_initcall(intel_mid_platform_init);
diff --git a/arch/x86/platform/sfi/Makefile b/arch/x86/platform/sfi/Makefile
deleted file mode 100644
index 4eba24c2af67..000000000000
--- a/arch/x86/platform/sfi/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_SFI)		+= sfi.o
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
deleted file mode 100644
index 6259563760f9..000000000000
--- a/arch/x86/platform/sfi/sfi.c
+++ /dev/null
@@ -1,100 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sfi.c - x86 architecture SFI support.
- *
- * Copyright (c) 2009, Intel Corporation.
- */
-
-#define KMSG_COMPONENT "SFI"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/acpi.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/io.h>
-
-#include <asm/irqdomain.h>
-#include <asm/io_apic.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
-
-/* All CPUs enumerated by SFI must be present and enabled */
-static void __init mp_sfi_register_lapic(u8 id)
-{
-	if (MAX_LOCAL_APIC - id <= 0) {
-		pr_warn("Processor #%d invalid (max %d)\n", id, MAX_LOCAL_APIC);
-		return;
-	}
-
-	pr_info("registering lapic[%d]\n", id);
-
-	generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
-}
-
-static int __init sfi_parse_cpus(struct sfi_table_header *table)
-{
-	struct sfi_table_simple *sb;
-	struct sfi_cpu_table_entry *pentry;
-	int i;
-	int cpu_num;
-
-	sb = (struct sfi_table_simple *)table;
-	cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry);
-	pentry = (struct sfi_cpu_table_entry *)sb->pentry;
-
-	for (i = 0; i < cpu_num; i++) {
-		mp_sfi_register_lapic(pentry->apic_id);
-		pentry++;
-	}
-
-	smp_found_config = 1;
-	return 0;
-}
-#endif /* CONFIG_X86_LOCAL_APIC */
-
-#ifdef CONFIG_X86_IO_APIC
-
-static int __init sfi_parse_ioapic(struct sfi_table_header *table)
-{
-	struct sfi_table_simple *sb;
-	struct sfi_apic_table_entry *pentry;
-	int i, num;
-	struct ioapic_domain_cfg cfg = {
-		.type = IOAPIC_DOMAIN_STRICT,
-		.ops = &mp_ioapic_irqdomain_ops,
-	};
-
-	sb = (struct sfi_table_simple *)table;
-	num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
-	pentry = (struct sfi_apic_table_entry *)sb->pentry;
-
-	for (i = 0; i < num; i++) {
-		mp_register_ioapic(i, pentry->phys_addr, gsi_top, &cfg);
-		pentry++;
-	}
-
-	WARN(pic_mode, KERN_WARNING
-		"SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n");
-	pic_mode = 0;
-	return 0;
-}
-#endif /* CONFIG_X86_IO_APIC */
-
-/*
- * sfi_platform_init(): register lapics & io-apics
- */
-int __init sfi_platform_init(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-	register_lapic_address(sfi_lapic_addr);
-	sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
-#endif
-#ifdef CONFIG_X86_IO_APIC
-	sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic);
-#endif
-	return 0;
-}
diff --git a/drivers/Makefile b/drivers/Makefile
index fd11b9ac4cc3..b2af1514a6b3 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -27,7 +27,7 @@ obj-y				+= idle/
 obj-y				+= char/ipmi/
 
 obj-$(CONFIG_ACPI)		+= acpi/
-obj-$(CONFIG_SFI)		+= sfi/
+
 # PnP must come after ACPI since it will eventually need to check if acpi
 # was used and do nothing if so
 obj-$(CONFIG_PNP)		+= pnp/
diff --git a/drivers/platform/x86/intel_scu_pcidrv.c b/drivers/platform/x86/intel_scu_pcidrv.c
index 8c5fd8240da9..80abc708e4f2 100644
--- a/drivers/platform/x86/intel_scu_pcidrv.c
+++ b/drivers/platform/x86/intel_scu_pcidrv.c
@@ -17,7 +17,6 @@
 static int intel_scu_pci_probe(struct pci_dev *pdev,
 			       const struct pci_device_id *id)
 {
-	void (*setup_fn)(void) = (void (*)(void))id->driver_data;
 	struct intel_scu_ipc_data scu_data = {};
 	struct intel_scu_ipc_dev *scu;
 	int ret;
@@ -30,27 +29,14 @@ static int intel_scu_pci_probe(struct pci_dev *pdev,
 	scu_data.irq = pdev->irq;
 
 	scu = intel_scu_ipc_register(&pdev->dev, &scu_data);
-	if (IS_ERR(scu))
-		return PTR_ERR(scu);
-
-	if (setup_fn)
-		setup_fn();
-	return 0;
-}
-
-static void intel_mid_scu_setup(void)
-{
-	intel_scu_devices_create();
+	return PTR_ERR_OR_ZERO(scu);
 }
 
 static const struct pci_device_id pci_ids[] = {
-	{ PCI_VDEVICE(INTEL, 0x080e),
-	  .driver_data = (kernel_ulong_t)intel_mid_scu_setup },
-	{ PCI_VDEVICE(INTEL, 0x08ea),
-	  .driver_data = (kernel_ulong_t)intel_mid_scu_setup },
+	{ PCI_VDEVICE(INTEL, 0x080e) },
+	{ PCI_VDEVICE(INTEL, 0x08ea) },
 	{ PCI_VDEVICE(INTEL, 0x0a94) },
-	{ PCI_VDEVICE(INTEL, 0x11a0),
-	  .driver_data = (kernel_ulong_t)intel_mid_scu_setup },
+	{ PCI_VDEVICE(INTEL, 0x11a0) },
 	{ PCI_VDEVICE(INTEL, 0x1a94) },
 	{ PCI_VDEVICE(INTEL, 0x5a94) },
 	{}
diff --git a/drivers/sfi/Kconfig b/drivers/sfi/Kconfig
deleted file mode 100644
index 3d0b64db214e..000000000000
--- a/drivers/sfi/Kconfig
+++ /dev/null
@@ -1,18 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# SFI Configuration
-#
-
-menuconfig SFI
-	bool "SFI (Simple Firmware Interface) Support"
-	help
-	The Simple Firmware Interface (SFI) provides a lightweight method
-	for platform firmware to pass information to the operating system
-	via static tables in memory.  Kernel SFI support is required to
-	boot on SFI-only platforms.  Currently, all SFI-only platforms are
-	based on the 2nd generation Intel Atom processor platform,
-	code-named Moorestown.
-
-	For more information, see http://simplefirmware.org
-
-	Say 'Y' here to enable the kernel to boot on SFI-only platforms.
diff --git a/drivers/sfi/Makefile b/drivers/sfi/Makefile
deleted file mode 100644
index ca9436b12386..000000000000
--- a/drivers/sfi/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-y	+= sfi_acpi.o
-obj-y	+= sfi_core.o
-
diff --git a/drivers/sfi/sfi_acpi.c b/drivers/sfi/sfi_acpi.c
deleted file mode 100644
index d277b36eb389..000000000000
--- a/drivers/sfi/sfi_acpi.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/* sfi_acpi.c Simple Firmware Interface - ACPI extensions */
-
-/*
-
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
-
-  GPL LICENSE SUMMARY
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  BSD LICENSE
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#define KMSG_COMPONENT "SFI"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/sfi_acpi.h>
-#include "sfi_core.h"
-
-/*
- * SFI can access ACPI-defined tables via an optional ACPI XSDT.
- *
- * This allows re-use, and avoids re-definition, of standard tables.
- * For example, the "MCFG" table is defined by PCI, reserved by ACPI,
- * and is expected to be present many SFI-only systems.
- */
-
-static struct acpi_table_xsdt *xsdt_va __read_mostly;
-
-#define XSDT_GET_NUM_ENTRIES(ptable, entry_type) \
-	((ptable->header.length - sizeof(struct acpi_table_header)) / \
-	(sizeof(entry_type)))
-
-static inline struct sfi_table_header *acpi_to_sfi_th(
-				struct acpi_table_header *th)
-{
-	return (struct sfi_table_header *)th;
-}
-
-static inline struct acpi_table_header *sfi_to_acpi_th(
-				struct sfi_table_header *th)
-{
-	return (struct acpi_table_header *)th;
-}
-
-/*
- * sfi_acpi_parse_xsdt()
- *
- * Parse the ACPI XSDT for later access by sfi_acpi_table_parse().
- */
-static int __init sfi_acpi_parse_xsdt(struct sfi_table_header *th)
-{
-	struct sfi_table_key key = SFI_ANY_KEY;
-	int tbl_cnt, i;
-	void *ret;
-
-	xsdt_va = (struct acpi_table_xsdt *)th;
-	tbl_cnt = XSDT_GET_NUM_ENTRIES(xsdt_va, u64);
-	for (i = 0; i < tbl_cnt; i++) {
-		ret = sfi_check_table(xsdt_va->table_offset_entry[i], &key);
-		if (IS_ERR(ret)) {
-			disable_sfi();
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-int __init sfi_acpi_init(void)
-{
-	struct sfi_table_key xsdt_key = { .sig = SFI_SIG_XSDT };
-
-	sfi_table_parse(SFI_SIG_XSDT, NULL, NULL, sfi_acpi_parse_xsdt);
-
-	/* Only call the get_table to keep the table mapped */
-	xsdt_va = (struct acpi_table_xsdt *)sfi_get_table(&xsdt_key);
-	return 0;
-}
-
-static struct acpi_table_header *sfi_acpi_get_table(struct sfi_table_key *key)
-{
-	u32 tbl_cnt, i;
-	void *ret;
-
-	tbl_cnt = XSDT_GET_NUM_ENTRIES(xsdt_va, u64);
-	for (i = 0; i < tbl_cnt; i++) {
-		ret = sfi_check_table(xsdt_va->table_offset_entry[i], key);
-		if (!IS_ERR(ret) && ret)
-			return sfi_to_acpi_th(ret);
-	}
-
-	return NULL;
-}
-
-static void sfi_acpi_put_table(struct acpi_table_header *table)
-{
-	sfi_put_table(acpi_to_sfi_th(table));
-}
-
-/*
- * sfi_acpi_table_parse()
- *
- * Find specified table in XSDT, run handler on it and return its return value
- */
-int sfi_acpi_table_parse(char *signature, char *oem_id, char *oem_table_id,
-			int(*handler)(struct acpi_table_header *))
-{
-	struct acpi_table_header *table = NULL;
-	struct sfi_table_key key;
-	int ret = 0;
-
-	if (sfi_disabled)
-		return -1;
-
-	key.sig = signature;
-	key.oem_id = oem_id;
-	key.oem_table_id = oem_table_id;
-
-	table = sfi_acpi_get_table(&key);
-	if (!table)
-		return -EINVAL;
-
-	ret = handler(table);
-	sfi_acpi_put_table(table);
-	return ret;
-}
-
-static ssize_t sfi_acpi_table_show(struct file *filp, struct kobject *kobj,
-			       struct bin_attribute *bin_attr, char *buf,
-			       loff_t offset, size_t count)
-{
-	struct sfi_table_attr *tbl_attr =
-	    container_of(bin_attr, struct sfi_table_attr, attr);
-	struct acpi_table_header *th = NULL;
-	struct sfi_table_key key;
-	ssize_t cnt;
-
-	key.sig = tbl_attr->name;
-	key.oem_id = NULL;
-	key.oem_table_id = NULL;
-
-	th = sfi_acpi_get_table(&key);
-	if (!th)
-		return 0;
-
-	cnt =  memory_read_from_buffer(buf, count, &offset,
-					th, th->length);
-	sfi_acpi_put_table(th);
-
-	return cnt;
-}
-
-
-void __init sfi_acpi_sysfs_init(void)
-{
-	u32 tbl_cnt, i;
-	struct sfi_table_attr *tbl_attr;
-
-	tbl_cnt = XSDT_GET_NUM_ENTRIES(xsdt_va, u64);
-	for (i = 0; i < tbl_cnt; i++) {
-		tbl_attr =
-			sfi_sysfs_install_table(xsdt_va->table_offset_entry[i]);
-		tbl_attr->attr.read = sfi_acpi_table_show;
-	}
-
-	return;
-}
diff --git a/drivers/sfi/sfi_core.c b/drivers/sfi/sfi_core.c
deleted file mode 100644
index a5136901dd8a..000000000000
--- a/drivers/sfi/sfi_core.c
+++ /dev/null
@@ -1,522 +0,0 @@
-/* sfi_core.c Simple Firmware Interface - core internals */
-
-/*
-
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
-
-  GPL LICENSE SUMMARY
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  BSD LICENSE
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#define KMSG_COMPONENT "SFI"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/memblock.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/acpi.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/slab.h>
-#include <linux/io.h>
-
-#include "sfi_core.h"
-
-#define ON_SAME_PAGE(addr1, addr2) \
-	(((unsigned long)(addr1) & PAGE_MASK) == \
-	((unsigned long)(addr2) & PAGE_MASK))
-#define TABLE_ON_PAGE(page, table, size) (ON_SAME_PAGE(page, table) && \
-				ON_SAME_PAGE(page, table + size))
-
-int sfi_disabled __read_mostly;
-EXPORT_SYMBOL(sfi_disabled);
-
-static u64 syst_pa __read_mostly;
-static struct sfi_table_simple *syst_va __read_mostly;
-
-/*
- * FW creates and saves the SFI tables in memory. When these tables get
- * used, they may need to be mapped to virtual address space, and the mapping
- * can happen before or after the memremap() is ready, so a flag is needed
- * to indicating this
- */
-static u32 sfi_use_memremap __read_mostly;
-
-/*
- * sfi_un/map_memory calls early_memremap/memunmap which is a __init function
- * and introduces section mismatch. So use __ref to make it calm.
- */
-static void __iomem * __ref sfi_map_memory(u64 phys, u32 size)
-{
-	if (!phys || !size)
-		return NULL;
-
-	if (sfi_use_memremap)
-		return memremap(phys, size, MEMREMAP_WB);
-	else
-		return early_memremap(phys, size);
-}
-
-static void __ref sfi_unmap_memory(void __iomem *virt, u32 size)
-{
-	if (!virt || !size)
-		return;
-
-	if (sfi_use_memremap)
-		memunmap(virt);
-	else
-		early_memunmap(virt, size);
-}
-
-static void sfi_print_table_header(unsigned long long pa,
-				struct sfi_table_header *header)
-{
-	pr_info("%4.4s %llX, %04X (v%d %6.6s %8.8s)\n",
-		header->sig, pa,
-		header->len, header->rev, header->oem_id,
-		header->oem_table_id);
-}
-
-/*
- * sfi_verify_table()
- * Sanity check table lengh, calculate checksum
- */
-static int sfi_verify_table(struct sfi_table_header *table)
-{
-
-	u8 checksum = 0;
-	u8 *puchar = (u8 *)table;
-	u32 length = table->len;
-
-	/* Sanity check table length against arbitrary 1MB limit */
-	if (length > 0x100000) {
-		pr_err("Invalid table length 0x%x\n", length);
-		return -1;
-	}
-
-	while (length--)
-		checksum += *puchar++;
-
-	if (checksum) {
-		pr_err("Checksum %2.2X should be %2.2X\n",
-			table->csum, table->csum - checksum);
-		return -1;
-	}
-	return 0;
-}
-
-/*
- * sfi_map_table()
- *
- * Return address of mapped table
- * Check for common case that we can re-use mapping to SYST,
- * which requires syst_pa, syst_va to be initialized.
- */
-static struct sfi_table_header *sfi_map_table(u64 pa)
-{
-	struct sfi_table_header *th;
-	u32 length;
-
-	if (!TABLE_ON_PAGE(syst_pa, pa, sizeof(struct sfi_table_header)))
-		th = sfi_map_memory(pa, sizeof(struct sfi_table_header));
-	else
-		th = (void *)syst_va + (pa - syst_pa);
-
-	 /* If table fits on same page as its header, we are done */
-	if (TABLE_ON_PAGE(th, th, th->len))
-		return th;
-
-	/* Entire table does not fit on same page as SYST */
-	length = th->len;
-	if (!TABLE_ON_PAGE(syst_pa, pa, sizeof(struct sfi_table_header)))
-		sfi_unmap_memory(th, sizeof(struct sfi_table_header));
-
-	return sfi_map_memory(pa, length);
-}
-
-/*
- * sfi_unmap_table()
- *
- * Undoes effect of sfi_map_table() by unmapping table
- * if it did not completely fit on same page as SYST.
- */
-static void sfi_unmap_table(struct sfi_table_header *th)
-{
-	if (!TABLE_ON_PAGE(syst_va, th, th->len))
-		sfi_unmap_memory(th, TABLE_ON_PAGE(th, th, th->len) ?
-					sizeof(*th) : th->len);
-}
-
-static int sfi_table_check_key(struct sfi_table_header *th,
-				struct sfi_table_key *key)
-{
-
-	if (strncmp(th->sig, key->sig, SFI_SIGNATURE_SIZE)
-		|| (key->oem_id && strncmp(th->oem_id,
-				key->oem_id, SFI_OEM_ID_SIZE))
-		|| (key->oem_table_id && strncmp(th->oem_table_id,
-				key->oem_table_id, SFI_OEM_TABLE_ID_SIZE)))
-		return -1;
-
-	return 0;
-}
-
-/*
- * This function will be used in 2 cases:
- * 1. used to enumerate and verify the tables addressed by SYST/XSDT,
- *    thus no signature will be given (in kernel boot phase)
- * 2. used to parse one specific table, signature must exist, and
- *    the mapped virt address will be returned, and the virt space
- *    will be released by call sfi_put_table() later
- *
- * This two cases are from two different functions with two different
- * sections and causes section mismatch warning. So use __ref to tell
- * modpost not to make any noise.
- *
- * Return value:
- *	NULL:			when can't find a table matching the key
- *	ERR_PTR(error):		error value
- *	virt table address:	when a matched table is found
- */
-struct sfi_table_header *
- __ref sfi_check_table(u64 pa, struct sfi_table_key *key)
-{
-	struct sfi_table_header *th;
-	void *ret = NULL;
-
-	th = sfi_map_table(pa);
-	if (!th)
-		return ERR_PTR(-ENOMEM);
-
-	if (!key->sig) {
-		sfi_print_table_header(pa, th);
-		if (sfi_verify_table(th))
-			ret = ERR_PTR(-EINVAL);
-	} else {
-		if (!sfi_table_check_key(th, key))
-			return th;	/* Success */
-	}
-
-	sfi_unmap_table(th);
-	return ret;
-}
-
-/*
- * sfi_get_table()
- *
- * Search SYST for the specified table with the signature in
- * the key, and return the mapped table
- */
-struct sfi_table_header *sfi_get_table(struct sfi_table_key *key)
-{
-	struct sfi_table_header *th;
-	u32 tbl_cnt, i;
-
-	tbl_cnt = SFI_GET_NUM_ENTRIES(syst_va, u64);
-	for (i = 0; i < tbl_cnt; i++) {
-		th = sfi_check_table(syst_va->pentry[i], key);
-		if (!IS_ERR(th) && th)
-			return th;
-	}
-
-	return NULL;
-}
-
-void sfi_put_table(struct sfi_table_header *th)
-{
-	sfi_unmap_table(th);
-}
-
-/* Find table with signature, run handler on it */
-int sfi_table_parse(char *signature, char *oem_id, char *oem_table_id,
-			sfi_table_handler handler)
-{
-	struct sfi_table_header *table = NULL;
-	struct sfi_table_key key;
-	int ret = -EINVAL;
-
-	if (sfi_disabled || !handler || !signature)
-		goto exit;
-
-	key.sig = signature;
-	key.oem_id = oem_id;
-	key.oem_table_id = oem_table_id;
-
-	table = sfi_get_table(&key);
-	if (!table)
-		goto exit;
-
-	ret = handler(table);
-	sfi_put_table(table);
-exit:
-	return ret;
-}
-EXPORT_SYMBOL_GPL(sfi_table_parse);
-
-/*
- * sfi_parse_syst()
- * Checksum all the tables in SYST and print their headers
- *
- * success: set syst_va, return 0
- */
-static int __init sfi_parse_syst(void)
-{
-	struct sfi_table_key key = SFI_ANY_KEY;
-	int tbl_cnt, i;
-	void *ret;
-
-	syst_va = sfi_map_memory(syst_pa, sizeof(struct sfi_table_simple));
-	if (!syst_va)
-		return -ENOMEM;
-
-	tbl_cnt = SFI_GET_NUM_ENTRIES(syst_va, u64);
-	for (i = 0; i < tbl_cnt; i++) {
-		ret = sfi_check_table(syst_va->pentry[i], &key);
-		if (IS_ERR(ret))
-			return PTR_ERR(ret);
-	}
-
-	return 0;
-}
-
-/*
- * The OS finds the System Table by searching 16-byte boundaries between
- * physical address 0x000E0000 and 0x000FFFFF. The OS shall search this region
- * starting at the low address and shall stop searching when the 1st valid SFI
- * System Table is found.
- *
- * success: set syst_pa, return 0
- * fail: return -1
- */
-static __init int sfi_find_syst(void)
-{
-	unsigned long offset, len;
-	void *start;
-
-	len = SFI_SYST_SEARCH_END - SFI_SYST_SEARCH_BEGIN;
-	start = sfi_map_memory(SFI_SYST_SEARCH_BEGIN, len);
-	if (!start)
-		return -1;
-
-	for (offset = 0; offset < len; offset += 16) {
-		struct sfi_table_header *syst_hdr;
-
-		syst_hdr = start + offset;
-		if (strncmp(syst_hdr->sig, SFI_SIG_SYST,
-				SFI_SIGNATURE_SIZE))
-			continue;
-
-		if (syst_hdr->len > PAGE_SIZE)
-			continue;
-
-		sfi_print_table_header(SFI_SYST_SEARCH_BEGIN + offset,
-					syst_hdr);
-
-		if (sfi_verify_table(syst_hdr))
-			continue;
-
-		/*
-		 * Enforce SFI spec mandate that SYST reside within a page.
-		 */
-		if (!ON_SAME_PAGE(syst_pa, syst_pa + syst_hdr->len)) {
-			pr_info("SYST 0x%llx + 0x%x crosses page\n",
-					syst_pa, syst_hdr->len);
-			continue;
-		}
-
-		/* Success */
-		syst_pa = SFI_SYST_SEARCH_BEGIN + offset;
-		sfi_unmap_memory(start, len);
-		return 0;
-	}
-
-	sfi_unmap_memory(start, len);
-	return -1;
-}
-
-static struct kobject *sfi_kobj;
-static struct kobject *tables_kobj;
-
-static ssize_t sfi_table_show(struct file *filp, struct kobject *kobj,
-			       struct bin_attribute *bin_attr, char *buf,
-			       loff_t offset, size_t count)
-{
-	struct sfi_table_attr *tbl_attr =
-	    container_of(bin_attr, struct sfi_table_attr, attr);
-	struct sfi_table_header *th = NULL;
-	struct sfi_table_key key;
-	ssize_t cnt;
-
-	key.sig = tbl_attr->name;
-	key.oem_id = NULL;
-	key.oem_table_id = NULL;
-
-	if (strncmp(SFI_SIG_SYST, tbl_attr->name, SFI_SIGNATURE_SIZE)) {
-		th = sfi_get_table(&key);
-		if (!th)
-			return 0;
-
-		cnt =  memory_read_from_buffer(buf, count, &offset,
-						th, th->len);
-		sfi_put_table(th);
-	} else
-		cnt =  memory_read_from_buffer(buf, count, &offset,
-					syst_va, syst_va->header.len);
-
-	return cnt;
-}
-
-struct sfi_table_attr __init *sfi_sysfs_install_table(u64 pa)
-{
-	struct sfi_table_attr *tbl_attr;
-	struct sfi_table_header *th;
-	int ret;
-
-	tbl_attr = kzalloc(sizeof(struct sfi_table_attr), GFP_KERNEL);
-	if (!tbl_attr)
-		return NULL;
-
-	th = sfi_map_table(pa);
-	if (!th || !th->sig[0]) {
-		kfree(tbl_attr);
-		return NULL;
-	}
-
-	sysfs_attr_init(&tbl_attr->attr.attr);
-	memcpy(tbl_attr->name, th->sig, SFI_SIGNATURE_SIZE);
-
-	tbl_attr->attr.size = 0;
-	tbl_attr->attr.read = sfi_table_show;
-	tbl_attr->attr.attr.name = tbl_attr->name;
-	tbl_attr->attr.attr.mode = 0400;
-
-	ret = sysfs_create_bin_file(tables_kobj,
-				  &tbl_attr->attr);
-	if (ret) {
-		kfree(tbl_attr);
-		tbl_attr = NULL;
-	}
-
-	sfi_unmap_table(th);
-	return tbl_attr;
-}
-
-static int __init sfi_sysfs_init(void)
-{
-	int tbl_cnt, i;
-
-	if (sfi_disabled)
-		return 0;
-
-	sfi_kobj = kobject_create_and_add("sfi", firmware_kobj);
-	if (!sfi_kobj)
-		return 0;
-
-	tables_kobj = kobject_create_and_add("tables", sfi_kobj);
-	if (!tables_kobj) {
-		kobject_put(sfi_kobj);
-		return 0;
-	}
-
-	sfi_sysfs_install_table(syst_pa);
-
-	tbl_cnt = SFI_GET_NUM_ENTRIES(syst_va, u64);
-
-	for (i = 0; i < tbl_cnt; i++)
-		sfi_sysfs_install_table(syst_va->pentry[i]);
-
-	sfi_acpi_sysfs_init();
-	kobject_uevent(sfi_kobj, KOBJ_ADD);
-	kobject_uevent(tables_kobj, KOBJ_ADD);
-	pr_info("SFI sysfs interfaces init success\n");
-	return 0;
-}
-
-void __init sfi_init(void)
-{
-	if (!acpi_disabled)
-		disable_sfi();
-
-	if (sfi_disabled)
-		return;
-
-	pr_info("Simple Firmware Interface v0.81 http://simplefirmware.org\n");
-
-	if (sfi_find_syst() || sfi_parse_syst() || sfi_platform_init())
-		disable_sfi();
-
-	return;
-}
-
-void __init sfi_init_late(void)
-{
-	int length;
-
-	if (sfi_disabled)
-		return;
-
-	length = syst_va->header.len;
-	sfi_unmap_memory(syst_va, sizeof(struct sfi_table_simple));
-
-	/* Use memremap now after it is ready */
-	sfi_use_memremap = 1;
-	syst_va = sfi_map_memory(syst_pa, length);
-
-	sfi_acpi_init();
-}
-
-/*
- * The reason we put it here because we need wait till the /sys/firmware
- * is setup, then our interface can be registered in /sys/firmware/sfi
- */
-core_initcall(sfi_sysfs_init);
diff --git a/drivers/sfi/sfi_core.h b/drivers/sfi/sfi_core.h
deleted file mode 100644
index 1d5cfe854cf7..000000000000
--- a/drivers/sfi/sfi_core.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* sfi_core.h Simple Firmware Interface, internal header */
-
-/*
-
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
-
-  GPL LICENSE SUMMARY
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  BSD LICENSE
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <linux/sysfs.h>
-
-struct sfi_table_key{
-	char	*sig;
-	char	*oem_id;
-	char	*oem_table_id;
-};
-
-/* sysfs interface */
-struct sfi_table_attr {
-	struct bin_attribute attr;
-	char name[8];
-};
-
-#define SFI_ANY_KEY { .sig = NULL, .oem_id = NULL, .oem_table_id = NULL }
-
-extern int __init sfi_acpi_init(void);
-extern  struct sfi_table_header *sfi_check_table(u64 paddr,
-					struct sfi_table_key *key);
-struct sfi_table_header *sfi_get_table(struct sfi_table_key *key);
-extern void sfi_put_table(struct sfi_table_header *table);
-extern struct sfi_table_attr __init *sfi_sysfs_install_table(u64 pa);
-extern void __init sfi_acpi_sysfs_init(void);
diff --git a/include/linux/sfi.h b/include/linux/sfi.h
deleted file mode 100644
index e0e1597ef9e6..000000000000
--- a/include/linux/sfi.h
+++ /dev/null
@@ -1,210 +0,0 @@
-/* sfi.h Simple Firmware Interface */
-
-/*
-
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
-
-  GPL LICENSE SUMMARY
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  BSD LICENSE
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef _LINUX_SFI_H
-#define _LINUX_SFI_H
-
-#include <linux/init.h>
-#include <linux/types.h>
-
-/* Table signatures reserved by the SFI specification */
-#define SFI_SIG_SYST		"SYST"
-#define SFI_SIG_FREQ		"FREQ"
-#define SFI_SIG_IDLE		"IDLE"
-#define SFI_SIG_CPUS		"CPUS"
-#define SFI_SIG_MTMR		"MTMR"
-#define SFI_SIG_MRTC		"MRTC"
-#define SFI_SIG_MMAP		"MMAP"
-#define SFI_SIG_APIC		"APIC"
-#define SFI_SIG_XSDT		"XSDT"
-#define SFI_SIG_WAKE		"WAKE"
-#define SFI_SIG_DEVS		"DEVS"
-#define SFI_SIG_GPIO		"GPIO"
-
-#define SFI_SIGNATURE_SIZE	4
-#define SFI_OEM_ID_SIZE		6
-#define SFI_OEM_TABLE_ID_SIZE	8
-
-#define SFI_NAME_LEN		16
-
-#define SFI_SYST_SEARCH_BEGIN		0x000E0000
-#define SFI_SYST_SEARCH_END		0x000FFFFF
-
-#define SFI_GET_NUM_ENTRIES(ptable, entry_type) \
-	((ptable->header.len - sizeof(struct sfi_table_header)) / \
-	(sizeof(entry_type)))
-/*
- * Table structures must be byte-packed to match the SFI specification,
- * as they are provided by the BIOS.
- */
-struct sfi_table_header {
-	char	sig[SFI_SIGNATURE_SIZE];
-	u32	len;
-	u8	rev;
-	u8	csum;
-	char	oem_id[SFI_OEM_ID_SIZE];
-	char	oem_table_id[SFI_OEM_TABLE_ID_SIZE];
-} __packed;
-
-struct sfi_table_simple {
-	struct sfi_table_header		header;
-	u64				pentry[1];
-} __packed;
-
-/* Comply with UEFI spec 2.1 */
-struct sfi_mem_entry {
-	u32	type;
-	u64	phys_start;
-	u64	virt_start;
-	u64	pages;
-	u64	attrib;
-} __packed;
-
-struct sfi_cpu_table_entry {
-	u32	apic_id;
-} __packed;
-
-struct sfi_cstate_table_entry {
-	u32	hint;		/* MWAIT hint */
-	u32	latency;	/* latency in ms */
-} __packed;
-
-struct sfi_apic_table_entry {
-	u64	phys_addr;	/* phy base addr for APIC reg */
-} __packed;
-
-struct sfi_freq_table_entry {
-	u32	freq_mhz;	/* in MHZ */
-	u32	latency;	/* transition latency in ms */
-	u32	ctrl_val;	/* value to write to PERF_CTL */
-} __packed;
-
-struct sfi_wake_table_entry {
-	u64	phys_addr;	/* pointer to where the wake vector locates */
-} __packed;
-
-struct sfi_timer_table_entry {
-	u64	phys_addr;	/* phy base addr for the timer */
-	u32	freq_hz;	/* in HZ */
-	u32	irq;
-} __packed;
-
-struct sfi_rtc_table_entry {
-	u64	phys_addr;	/* phy base addr for the RTC */
-	u32	irq;
-} __packed;
-
-struct sfi_device_table_entry {
-	u8	type;		/* bus type, I2C, SPI or ...*/
-#define SFI_DEV_TYPE_SPI	0
-#define SFI_DEV_TYPE_I2C	1
-#define SFI_DEV_TYPE_UART	2
-#define SFI_DEV_TYPE_HSI	3
-#define SFI_DEV_TYPE_IPC	4
-#define SFI_DEV_TYPE_SD		5
-
-	u8	host_num;	/* attached to host 0, 1...*/
-	u16	addr;
-	u8	irq;
-	u32	max_freq;
-	char	name[SFI_NAME_LEN];
-} __packed;
-
-struct sfi_gpio_table_entry {
-	char	controller_name[SFI_NAME_LEN];
-	u16	pin_no;
-	char	pin_name[SFI_NAME_LEN];
-} __packed;
-
-typedef int (*sfi_table_handler) (struct sfi_table_header *table);
-
-#ifdef CONFIG_SFI
-extern void __init sfi_init(void);
-extern int __init sfi_platform_init(void);
-extern void __init sfi_init_late(void);
-extern int sfi_table_parse(char *signature, char *oem_id, char *oem_table_id,
-				sfi_table_handler handler);
-
-extern int sfi_disabled;
-static inline void disable_sfi(void)
-{
-	sfi_disabled = 1;
-}
-
-#else /* !CONFIG_SFI */
-
-static inline void sfi_init(void)
-{
-}
-
-static inline void sfi_init_late(void)
-{
-}
-
-#define sfi_disabled	0
-
-static inline int sfi_table_parse(char *signature, char *oem_id,
-					char *oem_table_id,
-					sfi_table_handler handler)
-{
-	return -1;
-}
-
-#endif /* !CONFIG_SFI */
-
-#endif /*_LINUX_SFI_H*/
diff --git a/include/linux/sfi_acpi.h b/include/linux/sfi_acpi.h
deleted file mode 100644
index a6e555cbe05c..000000000000
--- a/include/linux/sfi_acpi.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* sfi.h Simple Firmware Interface */
-
-/*
-
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
-
-  GPL LICENSE SUMMARY
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  BSD LICENSE
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef _LINUX_SFI_ACPI_H
-#define _LINUX_SFI_ACPI_H
-
-#include <linux/acpi.h>
-#include <linux/sfi.h>
-
-#ifdef CONFIG_SFI
-extern int sfi_acpi_table_parse(char *signature, char *oem_id,
-				char *oem_table_id,
-				int (*handler)(struct acpi_table_header *));
-
-static inline int __init acpi_sfi_table_parse(char *signature,
-				int (*handler)(struct acpi_table_header *))
-{
-	if (!acpi_table_parse(signature, handler))
-		return 0;
-
-	return sfi_acpi_table_parse(signature, NULL, NULL, handler);
-}
-#else /* !CONFIG_SFI */
-static inline int sfi_acpi_table_parse(char *signature, char *oem_id,
-				char *oem_table_id,
-				int (*handler)(struct acpi_table_header *))
-{
-	return -1;
-}
-
-static inline int __init acpi_sfi_table_parse(char *signature,
-				int (*handler)(struct acpi_table_header *))
-{
-	return acpi_table_parse(signature, handler);
-}
-#endif /* !CONFIG_SFI */
-
-#endif /*_LINUX_SFI_ACPI_H*/
diff --git a/init/main.c b/init/main.c
index a626e78dbf06..e9933cbf60d4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -74,7 +74,6 @@
 #include <linux/kgdb.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
-#include <linux/sfi.h>
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/perf_event.h>
@@ -1054,7 +1053,6 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
 
 	acpi_subsystem_init();
 	arch_post_acpi_subsys_init();
-	sfi_init_late();
 	kcsan_init();
 
 	/* Do the rest non-__init'ed, we're now alive */
-- 
cgit v1.2.3


From aec6c60a01d3a3170242d6a99372a388e1136dc6 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sat, 16 Jan 2021 08:35:42 +0900
Subject: kbuild: check the minimum compiler version in Kconfig

Paul Gortmaker reported a regression in the GCC version check. [1]
If you use GCC 4.8, the build breaks before showing the error message
"error Sorry, your version of GCC is too old - please use 4.9 or newer."

I do not want to apply his fix-up since it implies we would not be able
to remove any cc-option test. Anyway, I admit checking the GCC version
in <linux/compiler-gcc.h> is too late.

Almost at the same time, Linus also suggested to move the compiler
version error to Kconfig time. [2]

I unified the two similar scripts, gcc-version.sh and clang-version.sh
into cc-version.sh. The old scripts invoked the compiler multiple times
(3 times for gcc-version.sh, 4 times for clang-version.sh). I refactored
the code so the new one invokes the compiler just once, and also tried
my best to use shell-builtin commands where possible.

The new script runs faster.

  $ time ./scripts/clang-version.sh clang
  120000

  real    0m0.029s
  user    0m0.012s
  sys     0m0.021s

  $ time ./scripts/cc-version.sh clang
  Clang 120000

  real    0m0.009s
  user    0m0.006s
  sys     0m0.004s

cc-version.sh also shows an error message if the compiler is too old:

  $ make defconfig CC=clang-9
  *** Default configuration is based on 'x86_64_defconfig'
  ***
  *** Compiler is too old.
  ***   Your Clang version:    9.0.1
  ***   Minimum Clang version: 10.0.1
  ***
  scripts/Kconfig.include:46: Sorry, this compiler is not supported.
  make[1]: *** [scripts/kconfig/Makefile:81: defconfig] Error 1
  make: *** [Makefile:602: defconfig] Error 2

The new script takes care of ICC because we have <linux/compiler-intel.h>
although I am not sure if building the kernel with ICC is well-supported.

[1]: https://lore.kernel.org/r/20210110190807.134996-1-paul.gortmaker@windriver.com
[2]: https://lore.kernel.org/r/CAHk-=wh-+TMHPTFo1qs-MYyK7tZh-OQovA=pP3=e06aCVp6_kA@mail.gmail.com

Fixes: 87de84c9140e ("kbuild: remove cc-option test of -Werror=date-time")
Reported-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Nathan Chancellor <natechancellor@gmail.com>
Tested-by: Nathan Chancellor <natechancellor@gmail.com>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Tested-by: Miguel Ojeda <ojeda@kernel.org>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 MAINTAINERS                    |  1 -
 include/linux/compiler-clang.h | 10 ------
 include/linux/compiler-gcc.h   | 11 ------
 init/Kconfig                   |  9 ++---
 scripts/Kconfig.include        |  6 ++++
 scripts/cc-version.sh          | 82 ++++++++++++++++++++++++++++++++++++++++++
 scripts/clang-version.sh       | 19 ----------
 scripts/gcc-version.sh         | 20 -----------
 8 files changed, 93 insertions(+), 65 deletions(-)
 create mode 100755 scripts/cc-version.sh
 delete mode 100755 scripts/clang-version.sh
 delete mode 100755 scripts/gcc-version.sh

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 667d03852191..df820969be1f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4314,7 +4314,6 @@ C:	irc://chat.freenode.net/clangbuiltlinux
 F:	Documentation/kbuild/llvm.rst
 F:	include/linux/compiler-clang.h
 F:	scripts/clang-tools/
-F:	scripts/clang-version.sh
 F:	scripts/lld-version.sh
 K:	\b(?i:clang|llvm)\b
 
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 98cff1b4b088..04c0a5a717f7 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -3,16 +3,6 @@
 #error "Please don't include <linux/compiler-clang.h> directly, include <linux/compiler.h> instead."
 #endif
 
-#define CLANG_VERSION (__clang_major__ * 10000	\
-		     + __clang_minor__ * 100	\
-		     + __clang_patchlevel__)
-
-#if CLANG_VERSION < 100001
-#ifndef __BPF_TRACING__
-# error Sorry, your version of Clang is too old - please use 10.0.1 or newer.
-#endif
-#endif
-
 /* Compiler specific definitions for Clang compiler */
 
 /* same as gcc, this was present in clang-2.6 so we can assume it works
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 555ab0fddbef..48750243db4c 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -10,17 +10,6 @@
 		     + __GNUC_MINOR__ * 100	\
 		     + __GNUC_PATCHLEVEL__)
 
-/* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 */
-#if GCC_VERSION < 40900
-# error Sorry, your version of GCC is too old - please use 4.9 or newer.
-#elif defined(CONFIG_ARM64) && GCC_VERSION < 50100
-/*
- * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63293
- * https://lore.kernel.org/r/20210107111841.GN1551@shell.armlinux.org.uk
- */
-# error Sorry, your version of GCC is too old - please use 5.1 or newer.
-#endif
-
 /*
  * This macro obfuscates arithmetic on a variable address so that gcc
  * shouldn't recognize the original var, and make assumptions about it.
diff --git a/init/Kconfig b/init/Kconfig
index 29ad68325028..7bcfa24524c2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -26,11 +26,11 @@ config CC_VERSION_TEXT
 	    and then every file will be rebuilt.
 
 config CC_IS_GCC
-	def_bool $(success,echo "$(CC_VERSION_TEXT)" | grep -q gcc)
+	def_bool $(success,test "$(cc-name)" = GCC)
 
 config GCC_VERSION
 	int
-	default $(shell,$(srctree)/scripts/gcc-version.sh $(CC)) if CC_IS_GCC
+	default $(cc-version) if CC_IS_GCC
 	default 0
 
 config LD_VERSION
@@ -38,14 +38,15 @@ config LD_VERSION
 	default $(shell,$(LD) --version | $(srctree)/scripts/ld-version.sh)
 
 config CC_IS_CLANG
-	def_bool $(success,echo "$(CC_VERSION_TEXT)" | grep -q clang)
+	def_bool $(success,test "$(cc-name)" = Clang)
 
 config LD_IS_LLD
 	def_bool $(success,$(LD) -v | head -n 1 | grep -q LLD)
 
 config CLANG_VERSION
 	int
-	default $(shell,$(srctree)/scripts/clang-version.sh $(CC))
+	default $(cc-version) if CC_IS_CLANG
+	default 0
 
 config LLD_VERSION
 	int
diff --git a/scripts/Kconfig.include b/scripts/Kconfig.include
index a5fe72c504ff..0228cb9c74aa 100644
--- a/scripts/Kconfig.include
+++ b/scripts/Kconfig.include
@@ -39,6 +39,12 @@ as-instr = $(success,printf "%b\n" "$(1)" | $(CC) $(CLANG_FLAGS) -c -x assembler
 $(error-if,$(failure,command -v $(CC)),compiler '$(CC)' not found)
 $(error-if,$(failure,command -v $(LD)),linker '$(LD)' not found)
 
+# Get the compiler name, version, and error out if it is not supported.
+cc-info := $(shell,$(srctree)/scripts/cc-version.sh $(CC))
+$(error-if,$(success,test -z "$(cc-info)"),Sorry$(comma) this compiler is not supported.)
+cc-name := $(shell,set -- $(cc-info) && echo $1)
+cc-version := $(shell,set -- $(cc-info) && echo $2)
+
 # Fail if the linker is gold as it's not capable of linking the kernel proper
 $(error-if,$(success, $(LD) -v | grep -q gold), gold linker '$(LD)' not supported)
 
diff --git a/scripts/cc-version.sh b/scripts/cc-version.sh
new file mode 100755
index 000000000000..3f2ee885b116
--- /dev/null
+++ b/scripts/cc-version.sh
@@ -0,0 +1,82 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Print the compiler name and its version in a 5 or 6-digit form.
+# Also, perform the minimum version check.
+
+set -e
+
+# When you raise the minimum compiler version, please update
+# Documentation/process/changes.rst as well.
+gcc_min_version=4.9.0
+clang_min_version=10.0.1
+icc_min_version=16.0.3 # temporary
+
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63293
+# https://lore.kernel.org/r/20210107111841.GN1551@shell.armlinux.org.uk
+if [ "$SRCARCH" = arm64 ]; then
+	gcc_min_version=5.1.0
+fi
+
+# Print the compiler name and some version components.
+get_compiler_info()
+{
+	cat <<- EOF | "$@" -E -P -x c - 2>/dev/null
+	#if defined(__clang__)
+	Clang	__clang_major__  __clang_minor__  __clang_patchlevel__
+	#elif defined(__INTEL_COMPILER)
+	ICC	__INTEL_COMPILER  __INTEL_COMPILER_UPDATE
+	#elif defined(__GNUC__)
+	GCC	__GNUC__  __GNUC_MINOR__  __GNUC_PATCHLEVEL__
+	#else
+	unknown
+	#endif
+	EOF
+}
+
+# Convert the version string x.y.z to a canonical 5 or 6-digit form.
+get_canonical_version()
+{
+	IFS=.
+	set -- $1
+	echo $((10000 * $1 + 100 * $2 + $3))
+}
+
+# $@ instead of $1 because multiple words might be given, e.g. CC="ccache gcc".
+orig_args="$@"
+set -- $(get_compiler_info "$@")
+
+name=$1
+
+case "$name" in
+GCC)
+	version=$2.$3.$4
+	min_version=$gcc_min_version
+	;;
+Clang)
+	version=$2.$3.$4
+	min_version=$clang_min_version
+	;;
+ICC)
+	version=$(($2 / 100)).$(($2 % 100)).$3
+	min_version=$icc_min_version
+	;;
+*)
+	echo "$orig_args: unknown compiler" >&2
+	exit 1
+	;;
+esac
+
+cversion=$(get_canonical_version $version)
+min_cversion=$(get_canonical_version $min_version)
+
+if [ "$cversion" -lt "$min_cversion" ]; then
+	echo >&2 "***"
+	echo >&2 "*** Compiler is too old."
+	echo >&2 "***   Your $name version:    $version"
+	echo >&2 "***   Minimum $name version: $min_version"
+	echo >&2 "***"
+	exit 1
+fi
+
+echo $name $cversion
diff --git a/scripts/clang-version.sh b/scripts/clang-version.sh
deleted file mode 100755
index 6fabf0695761..000000000000
--- a/scripts/clang-version.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-#
-# clang-version clang-command
-#
-# Print the compiler version of `clang-command' in a 5 or 6-digit form
-# such as `50001' for clang-5.0.1 etc.
-
-compiler="$*"
-
-if ! ( $compiler --version | grep -q clang) ; then
-	echo 0
-	exit 1
-fi
-
-MAJOR=$(echo __clang_major__ | $compiler -E -x c - | tail -n 1)
-MINOR=$(echo __clang_minor__ | $compiler -E -x c - | tail -n 1)
-PATCHLEVEL=$(echo __clang_patchlevel__ | $compiler -E -x c - | tail -n 1)
-printf "%d%02d%02d\\n" $MAJOR $MINOR $PATCHLEVEL
diff --git a/scripts/gcc-version.sh b/scripts/gcc-version.sh
deleted file mode 100755
index ae353432539b..000000000000
--- a/scripts/gcc-version.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-#
-# gcc-version gcc-command
-#
-# Print the gcc version of `gcc-command' in a 5 or 6-digit form
-# such as `29503' for gcc-2.95.3, `30301' for gcc-3.3.1, etc.
-
-compiler="$*"
-
-if [ ${#compiler} -eq 0 ]; then
-	echo "Error: No compiler specified." >&2
-	printf "Usage:\n\t$0 <gcc-command>\n" >&2
-	exit 1
-fi
-
-MAJOR=$(echo __GNUC__ | $compiler -E -x c - | tail -n 1)
-MINOR=$(echo __GNUC_MINOR__ | $compiler -E -x c - | tail -n 1)
-PATCHLEVEL=$(echo __GNUC_PATCHLEVEL__ | $compiler -E -x c - | tail -n 1)
-printf "%d%02d%02d\\n" $MAJOR $MINOR $PATCHLEVEL
-- 
cgit v1.2.3


From 3c4fa46b30c551b1df2fb1574a684f68bc22067c Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Fri, 5 Feb 2021 12:22:18 -0800
Subject: vmlinux.lds.h: add DWARF v5 sections

We expect toolchains to produce these new debug info sections as part of
DWARF v5. Add explicit placements to prevent the linker warnings from
--orphan-section=warn.

Compilers may produce such sections with explicit -gdwarf-5, or based on
the implicit default version of DWARF when -g is used via DEBUG_INFO.
This implicit default changes over time, and has changed to DWARF v5
with GCC 11.

.debug_sup was mentioned in review, but without compilers producing it
today, let's wait to add it until it becomes necessary.

Cc: stable@vger.kernel.org
Link: https://bugzilla.redhat.com/show_bug.cgi?id=1922707
Reported-by: Chris Murphy <lists@colorremedies.com>
Suggested-by: Fangrui Song <maskray@google.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Mark Wielaard <mark@klomp.org>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/asm-generic/vmlinux.lds.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index b2b3d81b1535..b3d4ceaf19ed 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -828,8 +828,13 @@
 		/* DWARF 4 */						\
 		.debug_types	0 : { *(.debug_types) }			\
 		/* DWARF 5 */						\
+		.debug_addr	0 : { *(.debug_addr) }			\
+		.debug_line_str	0 : { *(.debug_line_str) }		\
+		.debug_loclists	0 : { *(.debug_loclists) }		\
 		.debug_macro	0 : { *(.debug_macro) }			\
-		.debug_addr	0 : { *(.debug_addr) }
+		.debug_names	0 : { *(.debug_names) }			\
+		.debug_rnglists	0 : { *(.debug_rnglists) }		\
+		.debug_str_offsets	0 : { *(.debug_str_offsets) }
 
 /* Stabs debugging sections. */
 #define STABS_DEBUG							\
-- 
cgit v1.2.3


From 88a686728b3739d3598851e729c0e81f194e5c53 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sashal@kernel.org>
Date: Fri, 12 Feb 2021 11:29:24 -0500
Subject: kbuild: simplify access to the kernel's version

Instead of storing the version in a single integer and having various
kernel (and userspace) code how it's constructed, export individual
(major, patchlevel, sublevel) components and simplify kernel code that
uses it.

This should also make it easier on userspace.

Signed-off-by: Sasha Levin <sashal@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 Makefile                                       | 5 ++++-
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 ++--
 drivers/usb/core/hcd.c                         | 4 ++--
 drivers/usb/gadget/udc/aspeed-vhub/hub.c       | 4 ++--
 include/linux/usb/composite.h                  | 4 ++--
 kernel/sys.c                                   | 2 +-
 6 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/Makefile b/Makefile
index 12607d389148..1fdd44fe1659 100644
--- a/Makefile
+++ b/Makefile
@@ -1255,7 +1255,10 @@ define filechk_version.h
 		expr $(VERSION) \* 65536 + 0$(PATCHLEVEL) \* 256 + $(SUBLEVEL)); \
 	fi;                                                              \
 	echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) +  \
-	((c) > 255 ? 255 : (c)))'
+	((c) > 255 ? 255 : (c)))';                                       \
+	echo \#define LINUX_VERSION_MAJOR $(VERSION);                    \
+	echo \#define LINUX_VERSION_PATCHLEVEL $(PATCHLEVEL);            \
+	echo \#define LINUX_VERSION_SUBLEVEL $(SUBLEVEL)
 endef
 
 $(version_h): FORCE
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index ca6f2fc39ea0..29f886263dc5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -235,8 +235,8 @@ static void mlx5_set_driver_version(struct mlx5_core_dev *dev)
 	remaining_size = max_t(int, 0, driver_ver_sz - strlen(string));
 
 	snprintf(string + strlen(string), remaining_size, "%u.%u.%u",
-		 (u8)((LINUX_VERSION_CODE >> 16) & 0xff), (u8)((LINUX_VERSION_CODE >> 8) & 0xff),
-		 (u16)(LINUX_VERSION_CODE & 0xffff));
+		LINUX_VERSION_MAJOR, LINUX_VERSION_PATCHLEVEL,
+		LINUX_VERSION_SUBLEVEL);
 
 	/*Send the command*/
 	MLX5_SET(set_driver_version_in, in, opcode,
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index ad5a0f405a75..3f0381344221 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -111,8 +111,8 @@ DECLARE_WAIT_QUEUE_HEAD(usb_kill_urb_queue);
  */
 
 /*-------------------------------------------------------------------------*/
-#define KERNEL_REL	bin2bcd(((LINUX_VERSION_CODE >> 16) & 0x0ff))
-#define KERNEL_VER	bin2bcd(((LINUX_VERSION_CODE >> 8) & 0x0ff))
+#define KERNEL_REL	bin2bcd(LINUX_VERSION_MAJOR)
+#define KERNEL_VER	bin2bcd(LINUX_VERSION_PATCHLEVEL)
 
 /* usb 3.1 root hub device descriptor */
 static const u8 usb31_rh_dev_descriptor[18] = {
diff --git a/drivers/usb/gadget/udc/aspeed-vhub/hub.c b/drivers/usb/gadget/udc/aspeed-vhub/hub.c
index bfd8e77788e2..5c7dea5e0ff1 100644
--- a/drivers/usb/gadget/udc/aspeed-vhub/hub.c
+++ b/drivers/usb/gadget/udc/aspeed-vhub/hub.c
@@ -46,8 +46,8 @@
  *    - Make vid/did overridable
  *    - make it look like usb1 if usb1 mode forced
  */
-#define KERNEL_REL	bin2bcd(((LINUX_VERSION_CODE >> 16) & 0x0ff))
-#define KERNEL_VER	bin2bcd(((LINUX_VERSION_CODE >> 8) & 0x0ff))
+#define KERNEL_REL	bin2bcd(LINUX_VERSION_MAJOR)
+#define KERNEL_VER	bin2bcd(LINUX_VERSION_PATCHLEVEL)
 
 enum {
 	AST_VHUB_STR_INDEX_MAX = 4,
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index a2d229ab63ba..7531ce723374 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -573,8 +573,8 @@ static inline u16 get_default_bcdDevice(void)
 {
 	u16 bcdDevice;
 
-	bcdDevice = bin2bcd((LINUX_VERSION_CODE >> 16 & 0xff)) << 8;
-	bcdDevice |= bin2bcd((LINUX_VERSION_CODE >> 8 & 0xff));
+	bcdDevice = bin2bcd(LINUX_VERSION_MAJOR) << 8;
+	bcdDevice |= bin2bcd(LINUX_VERSION_PATCHLEVEL);
 	return bcdDevice;
 }
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 51f00fe20e4d..c2225bd405d5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1243,7 +1243,7 @@ static int override_release(char __user *release, size_t len)
 				break;
 			rest++;
 		}
-		v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60;
+		v = LINUX_VERSION_PATCHLEVEL + 60;
 		copy = clamp_t(size_t, len, 1, sizeof(buf));
 		copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
 		ret = copy_to_user(release, buf, copy + 1);
-- 
cgit v1.2.3


From 75cfb200cd081d23eb7eaa68deba9e0ab9320070 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 12 Feb 2021 16:49:47 -0500
Subject: NFS: 'flags' field should be unsigned in struct nfs_server

The mount flags are all unsigned integers, so we should not be storing
them in a signed field.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/nfs_fs_sb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 38e60ec742df..962e8313f007 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -142,7 +142,7 @@ struct nfs_server {
 	struct nlm_host		*nlm_host;	/* NLM client handle */
 	struct nfs_iostats __percpu *io_stats;	/* I/O statistics */
 	atomic_long_t		writeback;	/* number of writeback pages */
-	int			flags;		/* various flags */
+	unsigned int		flags;		/* various flags */
 
 /* The following are for internal use only. Also see uapi/linux/nfs_mount.h */
 #define NFS_MOUNT_LOOKUP_CACHE_NONEG	0x10000
-- 
cgit v1.2.3


From ed7bcdb374d20fab9e9dc36853a6735c047ad1b1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 12 Feb 2021 16:49:48 -0500
Subject: NFS: Add support for eager writes

Support eager writing to the server, meaning that we write the data to
cache on the server, and wait for that to complete. This ensures that we
see ENOSPC errors immediately.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/file.c             | 19 +++++++++++++++++--
 fs/nfs/write.c            | 17 ++++++++++++-----
 include/linux/nfs_fs_sb.h |  2 ++
 3 files changed, 31 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 03fd1dcc96bd..16ad5050e046 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -606,8 +606,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
-	unsigned long written = 0;
-	ssize_t result;
+	unsigned int mntflags = NFS_SERVER(inode)->flags;
+	ssize_t result, written;
 	errseq_t since;
 	int error;
 
@@ -648,6 +648,21 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 
 	written = result;
 	iocb->ki_pos += written;
+
+	if (mntflags & NFS_MOUNT_WRITE_EAGER) {
+		result = filemap_fdatawrite_range(file->f_mapping,
+						  iocb->ki_pos - written,
+						  iocb->ki_pos - 1);
+		if (result < 0)
+			goto out;
+	}
+	if (mntflags & NFS_MOUNT_WRITE_WAIT) {
+		result = filemap_fdatawait_range(file->f_mapping,
+						 iocb->ki_pos - written,
+						 iocb->ki_pos - 1);
+		if (result < 0)
+			goto out;
+	}
 	result = generic_write_sync(iocb, written);
 	if (result < 0)
 		goto out;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 6193350356a8..82bdcb982186 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -712,16 +712,23 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
 	struct nfs_pageio_descriptor pgio;
-	struct nfs_io_completion *ioc;
+	struct nfs_io_completion *ioc = NULL;
+	unsigned int mntflags = NFS_SERVER(inode)->flags;
+	int priority = 0;
 	int err;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
-	ioc = nfs_io_completion_alloc(GFP_KERNEL);
-	if (ioc)
-		nfs_io_completion_init(ioc, nfs_io_completion_commit, inode);
+	if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate ||
+	    wbc->for_background || wbc->for_sync || wbc->for_reclaim) {
+		ioc = nfs_io_completion_alloc(GFP_KERNEL);
+		if (ioc)
+			nfs_io_completion_init(ioc, nfs_io_completion_commit,
+					       inode);
+		priority = wb_priority(wbc);
+	}
 
-	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
+	nfs_pageio_init_write(&pgio, inode, priority, false,
 				&nfs_async_write_completion_ops);
 	pgio.pg_io_completion = ioc;
 	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 962e8313f007..6f76b32a0238 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -153,6 +153,8 @@ struct nfs_server {
 #define NFS_MOUNT_LOCAL_FCNTL		0x200000
 #define NFS_MOUNT_SOFTERR		0x400000
 #define NFS_MOUNT_SOFTREVAL		0x800000
+#define NFS_MOUNT_WRITE_EAGER		0x01000000
+#define NFS_MOUNT_WRITE_WAIT		0x02000000
 
 	unsigned int		caps;		/* server capabilities */
 	unsigned int		rsize;		/* read size */
-- 
cgit v1.2.3


From 1f975074634a63f014e2b7e76852ee6d6005a91d Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 12 Feb 2021 18:10:43 +0100
Subject: libnvdimm: Make remove callback return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All drivers return 0 in their remove callback and the driver core ignores
the return value of nvdimm_bus_remove() anyhow. So simplify by changing
the driver remove callback to return void and return 0 unconditionally
to the upper layer.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20210212171043.2136580-2-u.kleine-koenig@pengutronix.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/pmem/compat.c |  3 +--
 drivers/nvdimm/blk.c      |  3 +--
 drivers/nvdimm/bus.c      | 13 +++++--------
 drivers/nvdimm/dimm.c     |  4 +---
 drivers/nvdimm/pmem.c     |  4 +---
 drivers/nvdimm/region.c   |  4 +---
 include/linux/nd.h        |  2 +-
 7 files changed, 11 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/dax/pmem/compat.c b/drivers/dax/pmem/compat.c
index 863c114fd88c..d81dc35fd65d 100644
--- a/drivers/dax/pmem/compat.c
+++ b/drivers/dax/pmem/compat.c
@@ -41,10 +41,9 @@ static int dax_pmem_compat_release(struct device *dev, void *data)
 	return 0;
 }
 
-static int dax_pmem_compat_remove(struct device *dev)
+static void dax_pmem_compat_remove(struct device *dev)
 {
 	device_for_each_child(dev, NULL, dax_pmem_compat_release);
-	return 0;
 }
 
 static struct nd_device_driver dax_pmem_compat_driver = {
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 22e5617b2cea..8a53728e13e6 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -310,11 +310,10 @@ static int nd_blk_probe(struct device *dev)
 		return nsblk_attach_disk(nsblk);
 }
 
-static int nd_blk_remove(struct device *dev)
+static void nd_blk_remove(struct device *dev)
 {
 	if (is_nd_btt(dev))
 		nvdimm_namespace_detach_btt(to_nd_btt(dev));
-	return 0;
 }
 
 static struct nd_device_driver nd_blk_driver = {
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 2304c6183822..48f0985ca8a0 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -113,18 +113,17 @@ static int nvdimm_bus_remove(struct device *dev)
 	struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
 	struct module *provider = to_bus_provider(dev);
 	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
-	int rc = 0;
 
 	if (nd_drv->remove) {
 		debug_nvdimm_lock(dev);
-		rc = nd_drv->remove(dev);
+		nd_drv->remove(dev);
 		debug_nvdimm_unlock(dev);
 	}
 
-	dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name,
-			dev_name(dev), rc);
+	dev_dbg(&nvdimm_bus->dev, "%s.remove(%s)\n", dev->driver->name,
+			dev_name(dev));
 	module_put(provider);
-	return rc;
+	return 0;
 }
 
 static void nvdimm_bus_shutdown(struct device *dev)
@@ -427,7 +426,7 @@ static void free_badrange_list(struct list_head *badrange_list)
 	list_del_init(badrange_list);
 }
 
-static int nd_bus_remove(struct device *dev)
+static void nd_bus_remove(struct device *dev)
 {
 	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
 
@@ -446,8 +445,6 @@ static int nd_bus_remove(struct device *dev)
 	spin_unlock(&nvdimm_bus->badrange.lock);
 
 	nvdimm_bus_destroy_ndctl(nvdimm_bus);
-
-	return 0;
 }
 
 static int nd_bus_probe(struct device *dev)
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
index 94be3ae1d29f..91d9163ee303 100644
--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -113,7 +113,7 @@ static int nvdimm_probe(struct device *dev)
 	return rc;
 }
 
-static int nvdimm_remove(struct device *dev)
+static void nvdimm_remove(struct device *dev)
 {
 	struct nvdimm_drvdata *ndd = dev_get_drvdata(dev);
 
@@ -121,8 +121,6 @@ static int nvdimm_remove(struct device *dev)
 	dev_set_drvdata(dev, NULL);
 	nvdimm_bus_unlock(dev);
 	put_ndd(ndd);
-
-	return 0;
 }
 
 static struct nd_device_driver nvdimm_driver = {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 875076b0ea6c..062f0f22bac9 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -564,7 +564,7 @@ static int nd_pmem_probe(struct device *dev)
 	return pmem_attach_disk(dev, ndns);
 }
 
-static int nd_pmem_remove(struct device *dev)
+static void nd_pmem_remove(struct device *dev)
 {
 	struct pmem_device *pmem = dev_get_drvdata(dev);
 
@@ -579,8 +579,6 @@ static int nd_pmem_remove(struct device *dev)
 		pmem->bb_state = NULL;
 	}
 	nvdimm_flush(to_nd_region(dev->parent), NULL);
-
-	return 0;
 }
 
 static void nd_pmem_shutdown(struct device *dev)
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index bfce87ed72ab..e0c34120df37 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -87,7 +87,7 @@ static int child_unregister(struct device *dev, void *data)
 	return 0;
 }
 
-static int nd_region_remove(struct device *dev)
+static void nd_region_remove(struct device *dev)
 {
 	struct nd_region *nd_region = to_nd_region(dev);
 
@@ -108,8 +108,6 @@ static int nd_region_remove(struct device *dev)
 	 */
 	sysfs_put(nd_region->bb_state);
 	nd_region->bb_state = NULL;
-
-	return 0;
 }
 
 static int child_notify(struct device *dev, void *data)
diff --git a/include/linux/nd.h b/include/linux/nd.h
index 55c735997805..cec526c8043d 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -26,7 +26,7 @@ struct nd_device_driver {
 	struct device_driver drv;
 	unsigned long type;
 	int (*probe)(struct device *dev);
-	int (*remove)(struct device *dev);
+	void (*remove)(struct device *dev);
 	void (*shutdown)(struct device *dev);
 	void (*notify)(struct device *dev, enum nvdimm_event event);
 };
-- 
cgit v1.2.3


From 4cdadfd5e0a70017fec735b7b6d7f2f731842dc6 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 16 Feb 2021 20:09:50 -0800
Subject: cxl/mem: Introduce a driver for CXL-2.0-Type-3 endpoints

The CXL.mem protocol allows a device to act as a provider of "System
RAM" and/or "Persistent Memory" that is fully coherent as if the memory
was attached to the typical CPU memory controller.

With the CXL-2.0 specification a PCI endpoint can implement a "Type-3"
device interface and give the operating system control over "Host
Managed Device Memory". See section 2.3 Type 3 CXL Device.

The memory range exported by the device may optionally be described by
the platform firmware memory map, or by infrastructure like LIBNVDIMM to
provision persistent memory capacity from one, or more, CXL.mem devices.

A pre-requisite for Linux-managed memory-capacity provisioning is this
cxl_mem driver that can speak the mailbox protocol defined in section
8.2.8.4 Mailbox Registers.

For now just land the initial driver boiler-plate and Documentation/
infrastructure.

Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: David Rientjes <rientjes@google.com> (v1)
Cc: Jonathan Corbet <corbet@lwn.net>
Link: https://www.computeexpresslink.org/download-the-specification
Link: https://lore.kernel.org/r/20210217040958.1354670-2-ben.widawsky@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/driver-api/cxl/index.rst          | 12 +++++
 Documentation/driver-api/cxl/memory-devices.rst | 15 ++++++
 Documentation/driver-api/index.rst              |  1 +
 drivers/Kconfig                                 |  1 +
 drivers/Makefile                                |  1 +
 drivers/cxl/Kconfig                             | 35 ++++++++++++++
 drivers/cxl/Makefile                            |  4 ++
 drivers/cxl/mem.c                               | 62 +++++++++++++++++++++++++
 drivers/cxl/pci.h                               | 17 +++++++
 include/linux/pci_ids.h                         |  1 +
 10 files changed, 149 insertions(+)
 create mode 100644 Documentation/driver-api/cxl/index.rst
 create mode 100644 Documentation/driver-api/cxl/memory-devices.rst
 create mode 100644 drivers/cxl/Kconfig
 create mode 100644 drivers/cxl/Makefile
 create mode 100644 drivers/cxl/mem.c
 create mode 100644 drivers/cxl/pci.h

(limited to 'include')

diff --git a/Documentation/driver-api/cxl/index.rst b/Documentation/driver-api/cxl/index.rst
new file mode 100644
index 000000000000..036e49553542
--- /dev/null
+++ b/Documentation/driver-api/cxl/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+Compute Express Link
+====================
+
+.. toctree::
+   :maxdepth: 1
+
+   memory-devices
+
+.. only::  subproject and html
diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/memory-devices.rst
new file mode 100644
index 000000000000..71617e9002f1
--- /dev/null
+++ b/Documentation/driver-api/cxl/memory-devices.rst
@@ -0,0 +1,15 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+===================================
+Compute Express Link Memory Devices
+===================================
+
+A Compute Express Link Memory Device is a CXL component that implements the
+CXL.mem protocol. It contains some amount of volatile memory, persistent memory,
+or both. It is enumerated as a PCI device for configuration and passing
+messages over an MMIO mailbox. Its contribution to the System Physical
+Address space is handled via HDM (Host Managed Device Memory) decoders
+that optionally define a device's contribution to an interleaved address
+range across multiple devices underneath a host-bridge or interleaved
+across host-bridges.
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 2456d0a97ed8..d246a18fd78f 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -35,6 +35,7 @@ available subsections can be seen below.
    usb/index
    firewire
    pci/index
+   cxl/index
    spi
    i2c
    ipmb
diff --git a/drivers/Kconfig b/drivers/Kconfig
index dcecc9f6e33f..62c753a73651 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -6,6 +6,7 @@ menu "Device Drivers"
 source "drivers/amba/Kconfig"
 source "drivers/eisa/Kconfig"
 source "drivers/pci/Kconfig"
+source "drivers/cxl/Kconfig"
 source "drivers/pcmcia/Kconfig"
 source "drivers/rapidio/Kconfig"
 
diff --git a/drivers/Makefile b/drivers/Makefile
index fd11b9ac4cc3..678ea810410f 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -73,6 +73,7 @@ obj-$(CONFIG_NVM)		+= lightnvm/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM)		+= nvdimm/
 obj-$(CONFIG_DAX)		+= dax/
+obj-$(CONFIG_CXL_BUS)		+= cxl/
 obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
 obj-$(CONFIG_NUBUS)		+= nubus/
 obj-y				+= macintosh/
diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
new file mode 100644
index 000000000000..9e80b311e928
--- /dev/null
+++ b/drivers/cxl/Kconfig
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig CXL_BUS
+	tristate "CXL (Compute Express Link) Devices Support"
+	depends on PCI
+	help
+	  CXL is a bus that is electrically compatible with PCI Express, but
+	  layers three protocols on that signalling (CXL.io, CXL.cache, and
+	  CXL.mem). The CXL.cache protocol allows devices to hold cachelines
+	  locally, the CXL.mem protocol allows devices to be fully coherent
+	  memory targets, the CXL.io protocol is equivalent to PCI Express.
+	  Say 'y' to enable support for the configuration and management of
+	  devices supporting these protocols.
+
+if CXL_BUS
+
+config CXL_MEM
+	tristate "CXL.mem: Memory Devices"
+	help
+	  The CXL.mem protocol allows a device to act as a provider of
+	  "System RAM" and/or "Persistent Memory" that is fully coherent
+	  as if the memory was attached to the typical CPU memory
+	  controller.
+
+	  Say 'y/m' to enable a driver (named "cxl_mem.ko" when built as
+	  a module) that will attach to CXL.mem devices for
+	  configuration, provisioning, and health monitoring. This
+	  driver is required for dynamic provisioning of CXL.mem
+	  attached memory which is a prerequisite for persistent memory
+	  support. Typically volatile memory is mapped by platform
+	  firmware and included in the platform memory map, but in some
+	  cases the OS is responsible for mapping that memory. See
+	  Chapter 2.3 Type 3 CXL Device in the CXL 2.0 specification.
+
+	  If unsure say 'm'.
+endif
diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile
new file mode 100644
index 000000000000..4a30f7c3fc4a
--- /dev/null
+++ b/drivers/cxl/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_CXL_MEM) += cxl_mem.o
+
+cxl_mem-y := mem.o
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
new file mode 100644
index 000000000000..ce33c5ee77c9
--- /dev/null
+++ b/drivers/cxl/mem.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include "pci.h"
+
+static int cxl_mem_dvsec(struct pci_dev *pdev, int dvsec)
+{
+	int pos;
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DVSEC);
+	if (!pos)
+		return 0;
+
+	while (pos) {
+		u16 vendor, id;
+
+		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vendor);
+		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, &id);
+		if (vendor == PCI_DVSEC_VENDOR_ID_CXL && dvsec == id)
+			return pos;
+
+		pos = pci_find_next_ext_capability(pdev, pos,
+						   PCI_EXT_CAP_ID_DVSEC);
+	}
+
+	return 0;
+}
+
+static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct device *dev = &pdev->dev;
+	int regloc;
+
+	regloc = cxl_mem_dvsec(pdev, PCI_DVSEC_ID_CXL_REGLOC_OFFSET);
+	if (!regloc) {
+		dev_err(dev, "register location dvsec not found\n");
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+static const struct pci_device_id cxl_mem_pci_tbl[] = {
+	/* PCI class code for CXL.mem Type-3 Devices */
+	{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
+	{ /* terminate list */ },
+};
+MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);
+
+static struct pci_driver cxl_mem_driver = {
+	.name			= KBUILD_MODNAME,
+	.id_table		= cxl_mem_pci_tbl,
+	.probe			= cxl_mem_probe,
+	.driver	= {
+		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
+	},
+};
+
+MODULE_LICENSE("GPL v2");
+module_pci_driver(cxl_mem_driver);
diff --git a/drivers/cxl/pci.h b/drivers/cxl/pci.h
new file mode 100644
index 000000000000..e464bea3f4d3
--- /dev/null
+++ b/drivers/cxl/pci.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#ifndef __CXL_PCI_H__
+#define __CXL_PCI_H__
+
+#define CXL_MEMORY_PROGIF	0x10
+
+/*
+ * See section 8.1 Configuration Space Registers in the CXL 2.0
+ * Specification
+ */
+#define PCI_DVSEC_VENDOR_ID_CXL		0x1E98
+#define PCI_DVSEC_ID_CXL		0x0
+
+#define PCI_DVSEC_ID_CXL_REGLOC_OFFSET		0x8
+
+#endif /* __CXL_PCI_H__ */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index d8156a5dbee8..766260a9b247 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -51,6 +51,7 @@
 #define PCI_BASE_CLASS_MEMORY		0x05
 #define PCI_CLASS_MEMORY_RAM		0x0500
 #define PCI_CLASS_MEMORY_FLASH		0x0501
+#define PCI_CLASS_MEMORY_CXL		0x0502
 #define PCI_CLASS_MEMORY_OTHER		0x0580
 
 #define PCI_BASE_CLASS_BRIDGE		0x06
-- 
cgit v1.2.3


From 583fa5e71caeb79e04e477e9837e2f7fa53b71e4 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <ben.widawsky@intel.com>
Date: Tue, 16 Feb 2021 20:09:53 -0800
Subject: cxl/mem: Add basic IOCTL interface

Add a straightforward IOCTL that provides a mechanism for userspace to
query the supported memory device commands. CXL commands as they appear
to userspace are described as part of the UAPI kerneldoc. The command
list returned via this IOCTL will contain the full set of commands that
the driver supports, however, some of those commands may not be
available for use by userspace.

Memory device commands first appear in the CXL 2.0 specification. They
are submitted through a mailbox mechanism specified in the CXL 2.0
specification.

The send command allows userspace to issue mailbox commands directly to
the hardware. The list of available commands to send are the output of
the query command. The driver verifies basic properties of the command
and possibly inspect the input (or output) payload to determine whether
or not the command is allowed (or might taint the kernel).

Reported-by: kernel test robot <lkp@intel.com> # bug in earlier revision
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com> (v2)
Cc: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lore.kernel.org/r/20210217040958.1354670-5-ben.widawsky@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 .clang-format                                      |   1 +
 Documentation/driver-api/cxl/memory-devices.rst    |  12 +
 Documentation/userspace-api/ioctl/ioctl-number.rst |   1 +
 drivers/cxl/mem.c                                  | 283 ++++++++++++++++++++-
 include/uapi/linux/cxl_mem.h                       | 156 ++++++++++++
 5 files changed, 452 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/cxl_mem.h

(limited to 'include')

diff --git a/.clang-format b/.clang-format
index 10dc5a9a61b3..3f11c8901b43 100644
--- a/.clang-format
+++ b/.clang-format
@@ -109,6 +109,7 @@ ForEachMacros:
   - 'css_for_each_child'
   - 'css_for_each_descendant_post'
   - 'css_for_each_descendant_pre'
+  - 'cxl_for_each_cmd'
   - 'device_for_each_child_node'
   - 'dma_fence_chain_for_each'
   - 'do_for_each_ftrace_op'
diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/memory-devices.rst
index 1fef2c0a167d..1bad466f9167 100644
--- a/Documentation/driver-api/cxl/memory-devices.rst
+++ b/Documentation/driver-api/cxl/memory-devices.rst
@@ -32,3 +32,15 @@ CXL Bus
 -------
 .. kernel-doc:: drivers/cxl/bus.c
    :doc: cxl bus
+
+External Interfaces
+===================
+
+CXL IOCTL Interface
+-------------------
+
+.. kernel-doc:: include/uapi/linux/cxl_mem.h
+   :doc: UAPI
+
+.. kernel-doc:: include/uapi/linux/cxl_mem.h
+   :internal:
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index a4c75a28c839..6eb8e634664d 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -352,6 +352,7 @@ Code  Seq#    Include File                                           Comments
                                                                      <mailto:michael.klein@puffin.lb.shuttle.de>
 0xCC  00-0F  drivers/misc/ibmvmc.h                                   pseries VMC driver
 0xCD  01     linux/reiserfs_fs.h
+0xCE  01-02  uapi/linux/cxl_mem.h                                    Compute Express Link Memory Devices
 0xCF  02     fs/cifs/ioctl.c
 0xDB  00-0F  drivers/char/mwave/mwavepub.h
 0xDD  00-3F                                                          ZFCP device driver see drivers/s390/scsi/
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index 1c0195b07063..aa8f843fcca1 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#include <uapi/linux/cxl_mem.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/cdev.h>
@@ -40,6 +41,7 @@
 #define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)
 
 enum opcode {
+	CXL_MBOX_OP_INVALID		= 0x0000,
 	CXL_MBOX_OP_IDENTIFY		= 0x4000,
 	CXL_MBOX_OP_MAX			= 0x10000
 };
@@ -95,6 +97,49 @@ struct cxl_memdev {
 static int cxl_mem_major;
 static DEFINE_IDA(cxl_memdev_ida);
 
+/**
+ * struct cxl_mem_command - Driver representation of a memory device command
+ * @info: Command information as it exists for the UAPI
+ * @opcode: The actual bits used for the mailbox protocol
+ *
+ * The cxl_mem_command is the driver's internal representation of commands that
+ * are supported by the driver. Some of these commands may not be supported by
+ * the hardware. The driver will use @info to validate the fields passed in by
+ * the user then submit the @opcode to the hardware.
+ *
+ * See struct cxl_command_info.
+ */
+struct cxl_mem_command {
+	struct cxl_command_info info;
+	enum opcode opcode;
+};
+
+#define CXL_CMD(_id, sin, sout)                                                \
+	[CXL_MEM_COMMAND_ID_##_id] = {                                         \
+	.info =	{                                                              \
+			.id = CXL_MEM_COMMAND_ID_##_id,                        \
+			.size_in = sin,                                        \
+			.size_out = sout,                                      \
+		},                                                             \
+	.opcode = CXL_MBOX_OP_##_id,                                           \
+	}
+
+/*
+ * This table defines the supported mailbox commands for the driver. This table
+ * is made up of a UAPI structure. Non-negative values as parameters in the
+ * table will be validated against the user's input. For example, if size_in is
+ * 0, and the user passed in 1, it is an error.
+ */
+static struct cxl_mem_command mem_commands[] = {
+	CXL_CMD(IDENTIFY, 0, 0x43),
+};
+
+#define cxl_for_each_cmd(cmd)                                                  \
+	for ((cmd) = &mem_commands[0];                                         \
+	     ((cmd) - mem_commands) < ARRAY_SIZE(mem_commands); (cmd)++)
+
+#define cxl_cmd_count ARRAY_SIZE(mem_commands)
+
 static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
 {
 	const unsigned long start = jiffies;
@@ -324,6 +369,242 @@ static void cxl_mem_mbox_put(struct cxl_mem *cxlm)
 	mutex_unlock(&cxlm->mbox_mutex);
 }
 
+/**
+ * handle_mailbox_cmd_from_user() - Dispatch a mailbox command for userspace.
+ * @cxlm: The CXL memory device to communicate with.
+ * @cmd: The validated command.
+ * @in_payload: Pointer to userspace's input payload.
+ * @out_payload: Pointer to userspace's output payload.
+ * @size_out: (Input) Max payload size to copy out.
+ *            (Output) Payload size hardware generated.
+ * @retval: Hardware generated return code from the operation.
+ *
+ * Return:
+ *  * %0	- Mailbox transaction succeeded. This implies the mailbox
+ *		  protocol completed successfully not that the operation itself
+ *		  was successful.
+ *  * %-ENOMEM  - Couldn't allocate a bounce buffer.
+ *  * %-EFAULT	- Something happened with copy_to/from_user.
+ *  * %-EINTR	- Mailbox acquisition interrupted.
+ *  * %-EXXX	- Transaction level failures.
+ *
+ * Creates the appropriate mailbox command and dispatches it on behalf of a
+ * userspace request. The input and output payloads are copied between
+ * userspace.
+ *
+ * See cxl_send_cmd().
+ */
+static int handle_mailbox_cmd_from_user(struct cxl_mem *cxlm,
+					const struct cxl_mem_command *cmd,
+					u64 in_payload, u64 out_payload,
+					s32 *size_out, u32 *retval)
+{
+	struct device *dev = &cxlm->pdev->dev;
+	struct mbox_cmd mbox_cmd = {
+		.opcode = cmd->opcode,
+		.size_in = cmd->info.size_in,
+		.size_out = cmd->info.size_out,
+	};
+	int rc;
+
+	if (cmd->info.size_out) {
+		mbox_cmd.payload_out = kvzalloc(cmd->info.size_out, GFP_KERNEL);
+		if (!mbox_cmd.payload_out)
+			return -ENOMEM;
+	}
+
+	if (cmd->info.size_in) {
+		mbox_cmd.payload_in = vmemdup_user(u64_to_user_ptr(in_payload),
+						   cmd->info.size_in);
+		if (IS_ERR(mbox_cmd.payload_in))
+			return PTR_ERR(mbox_cmd.payload_in);
+	}
+
+	rc = cxl_mem_mbox_get(cxlm);
+	if (rc)
+		goto out;
+
+	dev_dbg(dev,
+		"Submitting %s command for user\n"
+		"\topcode: %x\n"
+		"\tsize: %ub\n",
+		cxl_command_names[cmd->info.id].name, mbox_cmd.opcode,
+		cmd->info.size_in);
+
+	rc = __cxl_mem_mbox_send_cmd(cxlm, &mbox_cmd);
+	cxl_mem_mbox_put(cxlm);
+	if (rc)
+		goto out;
+
+	/*
+	 * @size_out contains the max size that's allowed to be written back out
+	 * to userspace. While the payload may have written more output than
+	 * this it will have to be ignored.
+	 */
+	if (mbox_cmd.size_out) {
+		dev_WARN_ONCE(dev, mbox_cmd.size_out > *size_out,
+			      "Invalid return size\n");
+		if (copy_to_user(u64_to_user_ptr(out_payload),
+				 mbox_cmd.payload_out, mbox_cmd.size_out)) {
+			rc = -EFAULT;
+			goto out;
+		}
+	}
+
+	*size_out = mbox_cmd.size_out;
+	*retval = mbox_cmd.return_code;
+
+out:
+	kvfree(mbox_cmd.payload_in);
+	kvfree(mbox_cmd.payload_out);
+	return rc;
+}
+
+/**
+ * cxl_validate_cmd_from_user() - Check fields for CXL_MEM_SEND_COMMAND.
+ * @cxlm: &struct cxl_mem device whose mailbox will be used.
+ * @send_cmd: &struct cxl_send_command copied in from userspace.
+ * @out_cmd: Sanitized and populated &struct cxl_mem_command.
+ *
+ * Return:
+ *  * %0	- @out_cmd is ready to send.
+ *  * %-ENOTTY	- Invalid command specified.
+ *  * %-EINVAL	- Reserved fields or invalid values were used.
+ *  * %-ENOMEM	- Input or output buffer wasn't sized properly.
+ *
+ * The result of this command is a fully validated command in @out_cmd that is
+ * safe to send to the hardware.
+ *
+ * See handle_mailbox_cmd_from_user()
+ */
+static int cxl_validate_cmd_from_user(struct cxl_mem *cxlm,
+				      const struct cxl_send_command *send_cmd,
+				      struct cxl_mem_command *out_cmd)
+{
+	const struct cxl_command_info *info;
+	struct cxl_mem_command *c;
+
+	if (send_cmd->id == 0 || send_cmd->id >= CXL_MEM_COMMAND_ID_MAX)
+		return -ENOTTY;
+
+	/*
+	 * The user can never specify an input payload larger than what hardware
+	 * supports, but output can be arbitrarily large (simply write out as
+	 * much data as the hardware provides).
+	 */
+	if (send_cmd->in.size > cxlm->payload_size)
+		return -EINVAL;
+
+	if (send_cmd->flags & ~CXL_MEM_COMMAND_FLAG_MASK)
+		return -EINVAL;
+
+	if (send_cmd->rsvd)
+		return -EINVAL;
+
+	if (send_cmd->in.rsvd || send_cmd->out.rsvd)
+		return -EINVAL;
+
+	/* Convert user's command into the internal representation */
+	c = &mem_commands[send_cmd->id];
+	info = &c->info;
+
+	/* Check the input buffer is the expected size */
+	if (info->size_in >= 0 && info->size_in != send_cmd->in.size)
+		return -ENOMEM;
+
+	/* Check the output buffer is at least large enough */
+	if (info->size_out >= 0 && send_cmd->out.size < info->size_out)
+		return -ENOMEM;
+
+	memcpy(out_cmd, c, sizeof(*c));
+	out_cmd->info.size_in = send_cmd->in.size;
+	/*
+	 * XXX: out_cmd->info.size_out will be controlled by the driver, and the
+	 * specified number of bytes @send_cmd->out.size will be copied back out
+	 * to userspace.
+	 */
+
+	return 0;
+}
+
+static int cxl_query_cmd(struct cxl_memdev *cxlmd,
+			 struct cxl_mem_query_commands __user *q)
+{
+	struct device *dev = &cxlmd->dev;
+	struct cxl_mem_command *cmd;
+	u32 n_commands;
+	int j = 0;
+
+	dev_dbg(dev, "Query IOCTL\n");
+
+	if (get_user(n_commands, &q->n_commands))
+		return -EFAULT;
+
+	/* returns the total number if 0 elements are requested. */
+	if (n_commands == 0)
+		return put_user(cxl_cmd_count, &q->n_commands);
+
+	/*
+	 * otherwise, return max(n_commands, total commands) cxl_command_info
+	 * structures.
+	 */
+	cxl_for_each_cmd(cmd) {
+		const struct cxl_command_info *info = &cmd->info;
+
+		if (copy_to_user(&q->commands[j++], info, sizeof(*info)))
+			return -EFAULT;
+
+		if (j == n_commands)
+			break;
+	}
+
+	return 0;
+}
+
+static int cxl_send_cmd(struct cxl_memdev *cxlmd,
+			struct cxl_send_command __user *s)
+{
+	struct cxl_mem *cxlm = cxlmd->cxlm;
+	struct device *dev = &cxlmd->dev;
+	struct cxl_send_command send;
+	struct cxl_mem_command c;
+	int rc;
+
+	dev_dbg(dev, "Send IOCTL\n");
+
+	if (copy_from_user(&send, s, sizeof(send)))
+		return -EFAULT;
+
+	rc = cxl_validate_cmd_from_user(cxlmd->cxlm, &send, &c);
+	if (rc)
+		return rc;
+
+	/* Prepare to handle a full payload for variable sized output */
+	if (c.info.size_out < 0)
+		c.info.size_out = cxlm->payload_size;
+
+	rc = handle_mailbox_cmd_from_user(cxlm, &c, send.in.payload,
+					  send.out.payload, &send.out.size,
+					  &send.retval);
+	if (rc)
+		return rc;
+
+	return copy_to_user(s, &send, sizeof(send));
+}
+
+static long __cxl_memdev_ioctl(struct cxl_memdev *cxlmd, unsigned int cmd,
+			       unsigned long arg)
+{
+	switch (cmd) {
+	case CXL_MEM_QUERY_COMMANDS:
+		return cxl_query_cmd(cxlmd, (void __user *)arg);
+	case CXL_MEM_SEND_COMMAND:
+		return cxl_send_cmd(cxlmd, (void __user *)arg);
+	default:
+		return -ENOTTY;
+	}
+}
+
 static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
 			     unsigned long arg)
 {
@@ -337,7 +618,7 @@ static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
 	if (!percpu_ref_tryget_live(&cxlmd->ops_active))
 		return -ENXIO;
 
-	/* TODO: ioctl body */
+	rc = __cxl_memdev_ioctl(cxlmd, cmd, arg);
 
 	percpu_ref_put(&cxlmd->ops_active);
 
diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
new file mode 100644
index 000000000000..887781dc3b6c
--- /dev/null
+++ b/include/uapi/linux/cxl_mem.h
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * CXL IOCTLs for Memory Devices
+ */
+
+#ifndef _UAPI_CXL_MEM_H_
+#define _UAPI_CXL_MEM_H_
+
+#include <linux/types.h>
+
+/**
+ * DOC: UAPI
+ *
+ * Not all of all commands that the driver supports are always available for use
+ * by userspace. Userspace must check the results from the QUERY command in
+ * order to determine the live set of commands.
+ */
+
+#define CXL_MEM_QUERY_COMMANDS _IOR(0xCE, 1, struct cxl_mem_query_commands)
+#define CXL_MEM_SEND_COMMAND _IOWR(0xCE, 2, struct cxl_send_command)
+
+#define CXL_CMDS                                                          \
+	___C(INVALID, "Invalid Command"),                                 \
+	___C(IDENTIFY, "Identify Command"),                               \
+	___C(MAX, "invalid / last command")
+
+#define ___C(a, b) CXL_MEM_COMMAND_ID_##a
+enum { CXL_CMDS };
+
+#undef ___C
+#define ___C(a, b) { b }
+static const struct {
+	const char *name;
+} cxl_command_names[] = { CXL_CMDS };
+
+/*
+ * Here's how this actually breaks out:
+ * cxl_command_names[] = {
+ *	[CXL_MEM_COMMAND_ID_INVALID] = { "Invalid Command" },
+ *	[CXL_MEM_COMMAND_ID_IDENTIFY] = { "Identify Command" },
+ *	...
+ *	[CXL_MEM_COMMAND_ID_MAX] = { "invalid / last command" },
+ * };
+ */
+
+#undef ___C
+
+/**
+ * struct cxl_command_info - Command information returned from a query.
+ * @id: ID number for the command.
+ * @flags: Flags that specify command behavior.
+ * @size_in: Expected input size, or -1 if variable length.
+ * @size_out: Expected output size, or -1 if variable length.
+ *
+ * Represents a single command that is supported by both the driver and the
+ * hardware. This is returned as part of an array from the query ioctl. The
+ * following would be a command that takes a variable length input and returns 0
+ * bytes of output.
+ *
+ *  - @id = 10
+ *  - @flags = 0
+ *  - @size_in = -1
+ *  - @size_out = 0
+ *
+ * See struct cxl_mem_query_commands.
+ */
+struct cxl_command_info {
+	__u32 id;
+
+	__u32 flags;
+#define CXL_MEM_COMMAND_FLAG_MASK GENMASK(0, 0)
+
+	__s32 size_in;
+	__s32 size_out;
+};
+
+/**
+ * struct cxl_mem_query_commands - Query supported commands.
+ * @n_commands: In/out parameter. When @n_commands is > 0, the driver will
+ *		return min(num_support_commands, n_commands). When @n_commands
+ *		is 0, driver will return the number of total supported commands.
+ * @rsvd: Reserved for future use.
+ * @commands: Output array of supported commands. This array must be allocated
+ *            by userspace to be at least min(num_support_commands, @n_commands)
+ *
+ * Allow userspace to query the available commands supported by both the driver,
+ * and the hardware. Commands that aren't supported by either the driver, or the
+ * hardware are not returned in the query.
+ *
+ * Examples:
+ *
+ *  - { .n_commands = 0 } // Get number of supported commands
+ *  - { .n_commands = 15, .commands = buf } // Return first 15 (or less)
+ *    supported commands
+ *
+ *  See struct cxl_command_info.
+ */
+struct cxl_mem_query_commands {
+	/*
+	 * Input: Number of commands to return (space allocated by user)
+	 * Output: Number of commands supported by the driver/hardware
+	 *
+	 * If n_commands is 0, kernel will only return number of commands and
+	 * not try to populate commands[], thus allowing userspace to know how
+	 * much space to allocate
+	 */
+	__u32 n_commands;
+	__u32 rsvd;
+
+	struct cxl_command_info __user commands[]; /* out: supported commands */
+};
+
+/**
+ * struct cxl_send_command - Send a command to a memory device.
+ * @id: The command to send to the memory device. This must be one of the
+ *	commands returned by the query command.
+ * @flags: Flags for the command (input).
+ * @rsvd: Must be zero.
+ * @retval: Return value from the memory device (output).
+ * @in: Parameters associated with input payload.
+ * @in.size: Size of the payload to provide to the device (input).
+ * @in.rsvd: Must be zero.
+ * @in.payload: Pointer to memory for payload input, payload is little endian.
+ * @out: Parameters associated with output payload.
+ * @out.size: Size of the payload received from the device (input/output). This
+ *	      field is filled in by userspace to let the driver know how much
+ *	      space was allocated for output. It is populated by the driver to
+ *	      let userspace know how large the output payload actually was.
+ * @out.rsvd: Must be zero.
+ * @out.payload: Pointer to memory for payload output, payload is little endian.
+ *
+ * Mechanism for userspace to send a command to the hardware for processing. The
+ * driver will do basic validation on the command sizes. In some cases even the
+ * payload may be introspected. Userspace is required to allocate large enough
+ * buffers for size_out which can be variable length in certain situations.
+ */
+struct cxl_send_command {
+	__u32 id;
+	__u32 flags;
+	__u32 rsvd;
+	__u32 retval;
+
+	struct {
+		__s32 size;
+		__u32 rsvd;
+		__u64 payload;
+	} in;
+
+	struct {
+		__s32 size;
+		__u32 rsvd;
+		__u64 payload;
+	} out;
+};
+
+#endif
-- 
cgit v1.2.3


From 13237183c735f5cba4ae26bc782c613ae0d4e4d3 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <ben.widawsky@intel.com>
Date: Tue, 16 Feb 2021 20:09:54 -0800
Subject: cxl/mem: Add a "RAW" send command

The CXL memory device send interface will have a number of supported
commands. The raw command is not such a command. Raw commands allow
userspace to send a specified opcode to the underlying hardware and
bypass all driver checks on the command. The primary use for this
command is to [begrudgingly] allow undocumented vendor specific hardware
commands.

While not the main motivation, it also allows prototyping new hardware
commands without a driver patch and rebuild.

While this all sounds very powerful it comes with a couple of caveats:
1. Bug reports using raw commands will not get the same level of
   attention as bug reports using supported commands (via taint).
2. Supported commands will be rejected by the RAW command.

With this comes new debugfs knob to allow full access to your toes with
your weapon of choice.

Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com> (v2)
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Ariel Sibley <Ariel.Sibley@microchip.com>
Link: https://lore.kernel.org/r/20210217040958.1354670-6-ben.widawsky@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/Kconfig          |  18 ++++++
 drivers/cxl/mem.c            | 132 +++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/cxl_mem.h |  12 +++-
 3 files changed, 161 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 9e80b311e928..97dc4d751651 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -32,4 +32,22 @@ config CXL_MEM
 	  Chapter 2.3 Type 3 CXL Device in the CXL 2.0 specification.
 
 	  If unsure say 'm'.
+
+config CXL_MEM_RAW_COMMANDS
+	bool "RAW Command Interface for Memory Devices"
+	depends on CXL_MEM
+	help
+	  Enable CXL RAW command interface.
+
+	  The CXL driver ioctl interface may assign a kernel ioctl command
+	  number for each specification defined opcode. At any given point in
+	  time the number of opcodes that the specification defines and a device
+	  may implement may exceed the kernel's set of associated ioctl function
+	  numbers. The mismatch is either by omission, specification is too new,
+	  or by design. When prototyping new hardware, or developing / debugging
+	  the driver it is useful to be able to submit any possible command to
+	  the hardware, even commands that may crash the kernel due to their
+	  potential impact to memory currently in use by the kernel.
+
+	  If developing CXL hardware or the driver say Y, otherwise say N.
 endif
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index aa8f843fcca1..5319412e245c 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright(c) 2020 Intel Corporation. All rights reserved. */
 #include <uapi/linux/cxl_mem.h>
+#include <linux/security.h>
+#include <linux/debugfs.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/cdev.h>
@@ -42,7 +44,14 @@
 
 enum opcode {
 	CXL_MBOX_OP_INVALID		= 0x0000,
+	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
+	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
 	CXL_MBOX_OP_IDENTIFY		= 0x4000,
+	CXL_MBOX_OP_SET_PARTITION_INFO	= 0x4101,
+	CXL_MBOX_OP_SET_LSA		= 0x4103,
+	CXL_MBOX_OP_SET_SHUTDOWN_STATE	= 0x4204,
+	CXL_MBOX_OP_SCAN_MEDIA		= 0x4304,
+	CXL_MBOX_OP_GET_SCAN_MEDIA	= 0x4305,
 	CXL_MBOX_OP_MAX			= 0x10000
 };
 
@@ -96,6 +105,8 @@ struct cxl_memdev {
 
 static int cxl_mem_major;
 static DEFINE_IDA(cxl_memdev_ida);
+static struct dentry *cxl_debugfs;
+static bool cxl_raw_allow_all;
 
 /**
  * struct cxl_mem_command - Driver representation of a memory device command
@@ -132,6 +143,49 @@ struct cxl_mem_command {
  */
 static struct cxl_mem_command mem_commands[] = {
 	CXL_CMD(IDENTIFY, 0, 0x43),
+#ifdef CONFIG_CXL_MEM_RAW_COMMANDS
+	CXL_CMD(RAW, ~0, ~0),
+#endif
+};
+
+/*
+ * Commands that RAW doesn't permit. The rationale for each:
+ *
+ * CXL_MBOX_OP_ACTIVATE_FW: Firmware activation requires adjustment /
+ * coordination of transaction timeout values at the root bridge level.
+ *
+ * CXL_MBOX_OP_SET_PARTITION_INFO: The device memory map may change live
+ * and needs to be coordinated with HDM updates.
+ *
+ * CXL_MBOX_OP_SET_LSA: The label storage area may be cached by the
+ * driver and any writes from userspace invalidates those contents.
+ *
+ * CXL_MBOX_OP_SET_SHUTDOWN_STATE: Set shutdown state assumes no writes
+ * to the device after it is marked clean, userspace can not make that
+ * assertion.
+ *
+ * CXL_MBOX_OP_[GET_]SCAN_MEDIA: The kernel provides a native error list that
+ * is kept up to date with patrol notifications and error management.
+ */
+static u16 cxl_disabled_raw_commands[] = {
+	CXL_MBOX_OP_ACTIVATE_FW,
+	CXL_MBOX_OP_SET_PARTITION_INFO,
+	CXL_MBOX_OP_SET_LSA,
+	CXL_MBOX_OP_SET_SHUTDOWN_STATE,
+	CXL_MBOX_OP_SCAN_MEDIA,
+	CXL_MBOX_OP_GET_SCAN_MEDIA,
+};
+
+/*
+ * Command sets that RAW doesn't permit. All opcodes in this set are
+ * disabled because they pass plain text security payloads over the
+ * user/kernel boundary. This functionality is intended to be wrapped
+ * behind the keys ABI which allows for encrypted payloads in the UAPI
+ */
+static u8 security_command_sets[] = {
+	0x44, /* Sanitize */
+	0x45, /* Persistent Memory Data-at-rest Security */
+	0x46, /* Security Passthrough */
 };
 
 #define cxl_for_each_cmd(cmd)                                                  \
@@ -162,6 +216,16 @@ static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
 	return 0;
 }
 
+static bool cxl_is_security_command(u16 opcode)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(security_command_sets); i++)
+		if (security_command_sets[i] == (opcode >> 8))
+			return true;
+	return false;
+}
+
 static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
 				 struct mbox_cmd *mbox_cmd)
 {
@@ -431,6 +495,9 @@ static int handle_mailbox_cmd_from_user(struct cxl_mem *cxlm,
 		cxl_command_names[cmd->info.id].name, mbox_cmd.opcode,
 		cmd->info.size_in);
 
+	dev_WARN_ONCE(dev, cmd->info.id == CXL_MEM_COMMAND_ID_RAW,
+		      "raw command path used\n");
+
 	rc = __cxl_mem_mbox_send_cmd(cxlm, &mbox_cmd);
 	cxl_mem_mbox_put(cxlm);
 	if (rc)
@@ -460,6 +527,29 @@ out:
 	return rc;
 }
 
+static bool cxl_mem_raw_command_allowed(u16 opcode)
+{
+	int i;
+
+	if (!IS_ENABLED(CONFIG_CXL_MEM_RAW_COMMANDS))
+		return false;
+
+	if (security_locked_down(LOCKDOWN_NONE))
+		return false;
+
+	if (cxl_raw_allow_all)
+		return true;
+
+	if (cxl_is_security_command(opcode))
+		return false;
+
+	for (i = 0; i < ARRAY_SIZE(cxl_disabled_raw_commands); i++)
+		if (cxl_disabled_raw_commands[i] == opcode)
+			return false;
+
+	return true;
+}
+
 /**
  * cxl_validate_cmd_from_user() - Check fields for CXL_MEM_SEND_COMMAND.
  * @cxlm: &struct cxl_mem device whose mailbox will be used.
@@ -471,6 +561,7 @@ out:
  *  * %-ENOTTY	- Invalid command specified.
  *  * %-EINVAL	- Reserved fields or invalid values were used.
  *  * %-ENOMEM	- Input or output buffer wasn't sized properly.
+ *  * %-EPERM	- Attempted to use a protected command.
  *
  * The result of this command is a fully validated command in @out_cmd that is
  * safe to send to the hardware.
@@ -495,6 +586,40 @@ static int cxl_validate_cmd_from_user(struct cxl_mem *cxlm,
 	if (send_cmd->in.size > cxlm->payload_size)
 		return -EINVAL;
 
+	/*
+	 * Checks are bypassed for raw commands but a WARN/taint will occur
+	 * later in the callchain
+	 */
+	if (send_cmd->id == CXL_MEM_COMMAND_ID_RAW) {
+		const struct cxl_mem_command temp = {
+			.info = {
+				.id = CXL_MEM_COMMAND_ID_RAW,
+				.flags = 0,
+				.size_in = send_cmd->in.size,
+				.size_out = send_cmd->out.size,
+			},
+			.opcode = send_cmd->raw.opcode
+		};
+
+		if (send_cmd->raw.rsvd)
+			return -EINVAL;
+
+		/*
+		 * Unlike supported commands, the output size of RAW commands
+		 * gets passed along without further checking, so it must be
+		 * validated here.
+		 */
+		if (send_cmd->out.size > cxlm->payload_size)
+			return -EINVAL;
+
+		if (!cxl_mem_raw_command_allowed(send_cmd->raw.opcode))
+			return -EPERM;
+
+		memcpy(out_cmd, &temp, sizeof(temp));
+
+		return 0;
+	}
+
 	if (send_cmd->flags & ~CXL_MEM_COMMAND_FLAG_MASK)
 		return -EINVAL;
 
@@ -1166,6 +1291,7 @@ static struct pci_driver cxl_mem_driver = {
 
 static __init int cxl_mem_init(void)
 {
+	struct dentry *mbox_debugfs;
 	dev_t devt;
 	int rc;
 
@@ -1182,11 +1308,17 @@ static __init int cxl_mem_init(void)
 		return rc;
 	}
 
+	cxl_debugfs = debugfs_create_dir("cxl", NULL);
+	mbox_debugfs = debugfs_create_dir("mbox", cxl_debugfs);
+	debugfs_create_bool("raw_allow_all", 0600, mbox_debugfs,
+			    &cxl_raw_allow_all);
+
 	return 0;
 }
 
 static __exit void cxl_mem_exit(void)
 {
+	debugfs_remove_recursive(cxl_debugfs);
 	pci_unregister_driver(&cxl_mem_driver);
 	unregister_chrdev_region(MKDEV(cxl_mem_major, 0), CXL_MEM_MAX_DEVS);
 }
diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
index 887781dc3b6c..c316028730e7 100644
--- a/include/uapi/linux/cxl_mem.h
+++ b/include/uapi/linux/cxl_mem.h
@@ -22,6 +22,7 @@
 #define CXL_CMDS                                                          \
 	___C(INVALID, "Invalid Command"),                                 \
 	___C(IDENTIFY, "Identify Command"),                               \
+	___C(RAW, "Raw device command"),                                  \
 	___C(MAX, "invalid / last command")
 
 #define ___C(a, b) CXL_MEM_COMMAND_ID_##a
@@ -115,6 +116,9 @@ struct cxl_mem_query_commands {
  * @id: The command to send to the memory device. This must be one of the
  *	commands returned by the query command.
  * @flags: Flags for the command (input).
+ * @raw: Special fields for raw commands
+ * @raw.opcode: Opcode passed to hardware when using the RAW command.
+ * @raw.rsvd: Must be zero.
  * @rsvd: Must be zero.
  * @retval: Return value from the memory device (output).
  * @in: Parameters associated with input payload.
@@ -137,7 +141,13 @@ struct cxl_mem_query_commands {
 struct cxl_send_command {
 	__u32 id;
 	__u32 flags;
-	__u32 rsvd;
+	union {
+		struct {
+			__u16 opcode;
+			__u16 rsvd;
+		} raw;
+		__u32 rsvd;
+	};
 	__u32 retval;
 
 	struct {
-- 
cgit v1.2.3


From 472b1ce6e9d6396ab3f11fc5101c6b63b934a018 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <ben.widawsky@intel.com>
Date: Tue, 16 Feb 2021 20:09:55 -0800
Subject: cxl/mem: Enable commands via CEL

CXL devices identified by the memory-device class code must implement
the Device Command Interface (described in 8.2.9 of the CXL 2.0 spec).
While the driver already maintains a list of commands it supports, there
is still a need to be able to distinguish between commands that the
driver knows about from commands that are optionally supported by the
hardware.

The Command Effects Log (CEL) is specified in the CXL 2.0 specification.
The CEL is one of two types of logs, the other being vendor specific.
They are distinguished in hardware/spec via UUID. The CEL is useful for
2 things:
1. Determine which optional commands are supported by the CXL device.
2. Enumerate any vendor specific commands

The CEL is used by the driver to determine which commands are available
in the hardware and therefore which commands userspace is allowed to
execute. The set of enabled commands might be a subset of commands which
are advertised in UAPI via CXL_MEM_SEND_COMMAND IOCTL.

With the CEL enabling comes a internal flag to indicate a base set of
commands that are enabled regardless of CEL. Such commands are required
for basic interaction with the hardware and thus can be useful in debug
cases, for example if the CEL is corrupted.

The implementation leaves the statically defined table of commands and
supplements it with a bitmap to determine commands that are enabled.
This organization was chosen for the following reasons:
- Smaller memory footprint. Doesn't need a table per device.
- Reduce memory allocation complexity.
- Fixed command IDs to opcode mapping for all devices makes development
  and debugging easier.
- Certain helpers are easily achievable, like cxl_for_each_cmd().

Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com> (v2)
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> (v3)
Link: https://lore.kernel.org/r/20210217040958.1354670-7-ben.widawsky@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/cxl.h            |   2 +
 drivers/cxl/mem.c            | 223 +++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/cxl_mem.h |   1 +
 3 files changed, 219 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 8fd4a177fe25..6f14838c2d25 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -69,6 +69,7 @@ struct cxl_memdev;
  *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
  * @mbox_mutex: Mutex to synchronize mailbox access.
  * @firmware_version: Firmware version for the memory device.
+ * @enabled_commands: Hardware commands found enabled in CEL.
  * @pmem_range: Persistent memory capacity information.
  * @ram_range: Volatile memory capacity information.
  */
@@ -84,6 +85,7 @@ struct cxl_mem {
 	size_t payload_size;
 	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
 	char firmware_version[0x10];
+	unsigned long *enabled_cmds;
 
 	struct range pmem_range;
 	struct range ram_range;
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index 5319412e245c..e31b3045e231 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -46,6 +46,8 @@ enum opcode {
 	CXL_MBOX_OP_INVALID		= 0x0000,
 	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
 	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
+	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
+	CXL_MBOX_OP_GET_LOG		= 0x0401,
 	CXL_MBOX_OP_IDENTIFY		= 0x4000,
 	CXL_MBOX_OP_SET_PARTITION_INFO	= 0x4101,
 	CXL_MBOX_OP_SET_LSA		= 0x4103,
@@ -108,10 +110,28 @@ static DEFINE_IDA(cxl_memdev_ida);
 static struct dentry *cxl_debugfs;
 static bool cxl_raw_allow_all;
 
+enum {
+	CEL_UUID,
+	VENDOR_DEBUG_UUID,
+};
+
+/* See CXL 2.0 Table 170. Get Log Input Payload */
+static const uuid_t log_uuid[] = {
+	[CEL_UUID] = UUID_INIT(0xda9c0b5, 0xbf41, 0x4b78, 0x8f, 0x79, 0x96,
+			       0xb1, 0x62, 0x3b, 0x3f, 0x17),
+	[VENDOR_DEBUG_UUID] = UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f,
+					0xd6, 0x07, 0x19, 0x40, 0x3d, 0x86),
+};
+
 /**
  * struct cxl_mem_command - Driver representation of a memory device command
  * @info: Command information as it exists for the UAPI
  * @opcode: The actual bits used for the mailbox protocol
+ * @flags: Set of flags effecting driver behavior.
+ *
+ *  * %CXL_CMD_FLAG_FORCE_ENABLE: In cases of error, commands with this flag
+ *    will be enabled by the driver regardless of what hardware may have
+ *    advertised.
  *
  * The cxl_mem_command is the driver's internal representation of commands that
  * are supported by the driver. Some of these commands may not be supported by
@@ -123,9 +143,12 @@ static bool cxl_raw_allow_all;
 struct cxl_mem_command {
 	struct cxl_command_info info;
 	enum opcode opcode;
+	u32 flags;
+#define CXL_CMD_FLAG_NONE 0
+#define CXL_CMD_FLAG_FORCE_ENABLE BIT(0)
 };
 
-#define CXL_CMD(_id, sin, sout)                                                \
+#define CXL_CMD(_id, sin, sout, _flags)                                        \
 	[CXL_MEM_COMMAND_ID_##_id] = {                                         \
 	.info =	{                                                              \
 			.id = CXL_MEM_COMMAND_ID_##_id,                        \
@@ -133,6 +156,7 @@ struct cxl_mem_command {
 			.size_out = sout,                                      \
 		},                                                             \
 	.opcode = CXL_MBOX_OP_##_id,                                           \
+	.flags = _flags,                                                       \
 	}
 
 /*
@@ -142,10 +166,11 @@ struct cxl_mem_command {
  * 0, and the user passed in 1, it is an error.
  */
 static struct cxl_mem_command mem_commands[] = {
-	CXL_CMD(IDENTIFY, 0, 0x43),
+	CXL_CMD(IDENTIFY, 0, 0x43, CXL_CMD_FLAG_FORCE_ENABLE),
 #ifdef CONFIG_CXL_MEM_RAW_COMMANDS
-	CXL_CMD(RAW, ~0, ~0),
+	CXL_CMD(RAW, ~0, ~0, 0),
 #endif
+	CXL_CMD(GET_SUPPORTED_LOGS, 0, ~0, CXL_CMD_FLAG_FORCE_ENABLE),
 };
 
 /*
@@ -633,6 +658,10 @@ static int cxl_validate_cmd_from_user(struct cxl_mem *cxlm,
 	c = &mem_commands[send_cmd->id];
 	info = &c->info;
 
+	/* Check that the command is enabled for hardware */
+	if (!test_bit(info->id, cxlm->enabled_cmds))
+		return -ENOTTY;
+
 	/* Check the input buffer is the expected size */
 	if (info->size_in >= 0 && info->size_in != send_cmd->in.size)
 		return -ENOMEM;
@@ -757,6 +786,17 @@ static const struct file_operations cxl_memdev_fops = {
 	.llseek = noop_llseek,
 };
 
+static inline struct cxl_mem_command *cxl_mem_find_command(u16 opcode)
+{
+	struct cxl_mem_command *c;
+
+	cxl_for_each_cmd(c)
+		if (c->opcode == opcode)
+			return c;
+
+	return NULL;
+}
+
 /**
  * cxl_mem_mbox_send_cmd() - Send a mailbox command to a memory device.
  * @cxlm: The CXL memory device to communicate with.
@@ -777,9 +817,7 @@ static const struct file_operations cxl_memdev_fops = {
  *
  * Mailbox commands may execute successfully yet the device itself reported an
  * error. While this distinction can be useful for commands from userspace, the
- * kernel will only be able to use results when both are successful. It's
- * expected that all callers of this function know exactly the size of the data
- * they will consume from the hardware.
+ * kernel will only be able to use results when both are successful.
  *
  * See __cxl_mem_mbox_send_cmd()
  */
@@ -787,6 +825,7 @@ static int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm, u16 opcode,
 				 void *in, size_t in_size,
 				 void *out, size_t out_size)
 {
+	const struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
 	struct mbox_cmd mbox_cmd = {
 		.opcode = opcode,
 		.payload_in = in,
@@ -812,7 +851,11 @@ static int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm, u16 opcode,
 	if (mbox_cmd.return_code != CXL_MBOX_SUCCESS)
 		return -ENXIO;
 
-	if (mbox_cmd.size_out != out_size)
+	/*
+	 * Variable sized commands can't be validated and so it's up to the
+	 * caller to do that if they wish.
+	 */
+	if (cmd->info.size_out >= 0 && mbox_cmd.size_out != out_size)
 		return -EIO;
 
 	return 0;
@@ -947,6 +990,14 @@ static struct cxl_mem *cxl_mem_create(struct pci_dev *pdev, u32 reg_lo,
 	mutex_init(&cxlm->mbox_mutex);
 	cxlm->pdev = pdev;
 	cxlm->regs = regs + offset;
+	cxlm->enabled_cmds =
+		devm_kmalloc_array(dev, BITS_TO_LONGS(cxl_cmd_count),
+				   sizeof(unsigned long),
+				   GFP_KERNEL | __GFP_ZERO);
+	if (!cxlm->enabled_cmds) {
+		dev_err(dev, "No memory available for bitmap\n");
+		return NULL;
+	}
 
 	dev_dbg(dev, "Mapped CXL Memory Device resource\n");
 	return cxlm;
@@ -1166,6 +1217,160 @@ err_ref:
 	return rc;
 }
 
+static int cxl_xfer_log(struct cxl_mem *cxlm, uuid_t *uuid, u32 size, u8 *out)
+{
+	u32 remaining = size;
+	u32 offset = 0;
+
+	while (remaining) {
+		u32 xfer_size = min_t(u32, remaining, cxlm->payload_size);
+		struct cxl_mbox_get_log {
+			uuid_t uuid;
+			__le32 offset;
+			__le32 length;
+		} __packed log = {
+			.uuid = *uuid,
+			.offset = cpu_to_le32(offset),
+			.length = cpu_to_le32(xfer_size)
+		};
+		int rc;
+
+		rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_GET_LOG, &log,
+					   sizeof(log), out, xfer_size);
+		if (rc < 0)
+			return rc;
+
+		out += xfer_size;
+		remaining -= xfer_size;
+		offset += xfer_size;
+	}
+
+	return 0;
+}
+
+/**
+ * cxl_walk_cel() - Walk through the Command Effects Log.
+ * @cxlm: Device.
+ * @size: Length of the Command Effects Log.
+ * @cel: CEL
+ *
+ * Iterate over each entry in the CEL and determine if the driver supports the
+ * command. If so, the command is enabled for the device and can be used later.
+ */
+static void cxl_walk_cel(struct cxl_mem *cxlm, size_t size, u8 *cel)
+{
+	struct cel_entry {
+		__le16 opcode;
+		__le16 effect;
+	} __packed * cel_entry;
+	const int cel_entries = size / sizeof(*cel_entry);
+	int i;
+
+	cel_entry = (struct cel_entry *)cel;
+
+	for (i = 0; i < cel_entries; i++) {
+		u16 opcode = le16_to_cpu(cel_entry[i].opcode);
+		struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
+
+		if (!cmd) {
+			dev_dbg(&cxlm->pdev->dev,
+				"Opcode 0x%04x unsupported by driver", opcode);
+			continue;
+		}
+
+		set_bit(cmd->info.id, cxlm->enabled_cmds);
+	}
+}
+
+struct cxl_mbox_get_supported_logs {
+	__le16 entries;
+	u8 rsvd[6];
+	struct gsl_entry {
+		uuid_t uuid;
+		__le32 size;
+	} __packed entry[];
+} __packed;
+
+static struct cxl_mbox_get_supported_logs *cxl_get_gsl(struct cxl_mem *cxlm)
+{
+	struct cxl_mbox_get_supported_logs *ret;
+	int rc;
+
+	ret = kvmalloc(cxlm->payload_size, GFP_KERNEL);
+	if (!ret)
+		return ERR_PTR(-ENOMEM);
+
+	rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_GET_SUPPORTED_LOGS, NULL,
+				   0, ret, cxlm->payload_size);
+	if (rc < 0) {
+		kvfree(ret);
+		return ERR_PTR(rc);
+	}
+
+	return ret;
+}
+
+/**
+ * cxl_mem_enumerate_cmds() - Enumerate commands for a device.
+ * @cxlm: The device.
+ *
+ * Returns 0 if enumerate completed successfully.
+ *
+ * CXL devices have optional support for certain commands. This function will
+ * determine the set of supported commands for the hardware and update the
+ * enabled_cmds bitmap in the @cxlm.
+ */
+static int cxl_mem_enumerate_cmds(struct cxl_mem *cxlm)
+{
+	struct cxl_mbox_get_supported_logs *gsl;
+	struct device *dev = &cxlm->pdev->dev;
+	struct cxl_mem_command *cmd;
+	int i, rc;
+
+	gsl = cxl_get_gsl(cxlm);
+	if (IS_ERR(gsl))
+		return PTR_ERR(gsl);
+
+	rc = -ENOENT;
+	for (i = 0; i < le16_to_cpu(gsl->entries); i++) {
+		u32 size = le32_to_cpu(gsl->entry[i].size);
+		uuid_t uuid = gsl->entry[i].uuid;
+		u8 *log;
+
+		dev_dbg(dev, "Found LOG type %pU of size %d", &uuid, size);
+
+		if (!uuid_equal(&uuid, &log_uuid[CEL_UUID]))
+			continue;
+
+		log = kvmalloc(size, GFP_KERNEL);
+		if (!log) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		rc = cxl_xfer_log(cxlm, &uuid, size, log);
+		if (rc) {
+			kvfree(log);
+			goto out;
+		}
+
+		cxl_walk_cel(cxlm, size, log);
+		kvfree(log);
+
+		/* In case CEL was bogus, enable some default commands. */
+		cxl_for_each_cmd(cmd)
+			if (cmd->flags & CXL_CMD_FLAG_FORCE_ENABLE)
+				set_bit(cmd->info.id, cxlm->enabled_cmds);
+
+		/* Found the required CEL */
+		rc = 0;
+	}
+
+out:
+	kvfree(gsl);
+	return rc;
+}
+
 /**
  * cxl_mem_identify() - Send the IDENTIFY command to the device.
  * @cxlm: The device to identify.
@@ -1266,6 +1471,10 @@ static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
+	rc = cxl_mem_enumerate_cmds(cxlm);
+	if (rc)
+		return rc;
+
 	rc = cxl_mem_identify(cxlm);
 	if (rc)
 		return rc;
diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
index c316028730e7..59227f82a4c1 100644
--- a/include/uapi/linux/cxl_mem.h
+++ b/include/uapi/linux/cxl_mem.h
@@ -23,6 +23,7 @@
 	___C(INVALID, "Invalid Command"),                                 \
 	___C(IDENTIFY, "Identify Command"),                               \
 	___C(RAW, "Raw device command"),                                  \
+	___C(GET_SUPPORTED_LOGS, "Get Supported Logs"),                   \
 	___C(MAX, "invalid / last command")
 
 #define ___C(a, b) CXL_MEM_COMMAND_ID_##a
-- 
cgit v1.2.3


From 57ee605b976c30a86613648935d255bbe704aeab Mon Sep 17 00:00:00 2001
From: Ben Widawsky <ben.widawsky@intel.com>
Date: Tue, 16 Feb 2021 20:09:56 -0800
Subject: cxl/mem: Add set of informational commands

Add initial set of formal commands beyond basic identify and command
enumeration.

Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> (v2)
Link: https://lore.kernel.org/r/20210217040958.1354670-8-ben.widawsky@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/mem.c            | 9 +++++++++
 include/uapi/linux/cxl_mem.h | 5 +++++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index e31b3045e231..6d7d3870b5da 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -45,12 +45,16 @@
 enum opcode {
 	CXL_MBOX_OP_INVALID		= 0x0000,
 	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
+	CXL_MBOX_OP_GET_FW_INFO		= 0x0200,
 	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
 	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
 	CXL_MBOX_OP_GET_LOG		= 0x0401,
 	CXL_MBOX_OP_IDENTIFY		= 0x4000,
+	CXL_MBOX_OP_GET_PARTITION_INFO	= 0x4100,
 	CXL_MBOX_OP_SET_PARTITION_INFO	= 0x4101,
+	CXL_MBOX_OP_GET_LSA		= 0x4102,
 	CXL_MBOX_OP_SET_LSA		= 0x4103,
+	CXL_MBOX_OP_GET_HEALTH_INFO	= 0x4200,
 	CXL_MBOX_OP_SET_SHUTDOWN_STATE	= 0x4204,
 	CXL_MBOX_OP_SCAN_MEDIA		= 0x4304,
 	CXL_MBOX_OP_GET_SCAN_MEDIA	= 0x4305,
@@ -171,6 +175,11 @@ static struct cxl_mem_command mem_commands[] = {
 	CXL_CMD(RAW, ~0, ~0, 0),
 #endif
 	CXL_CMD(GET_SUPPORTED_LOGS, 0, ~0, CXL_CMD_FLAG_FORCE_ENABLE),
+	CXL_CMD(GET_FW_INFO, 0, 0x50, 0),
+	CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0),
+	CXL_CMD(GET_LSA, 0x8, ~0, 0),
+	CXL_CMD(GET_HEALTH_INFO, 0, 0x12, 0),
+	CXL_CMD(GET_LOG, 0x18, ~0, CXL_CMD_FLAG_FORCE_ENABLE),
 };
 
 /*
diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
index 59227f82a4c1..3155382dfc9b 100644
--- a/include/uapi/linux/cxl_mem.h
+++ b/include/uapi/linux/cxl_mem.h
@@ -24,6 +24,11 @@
 	___C(IDENTIFY, "Identify Command"),                               \
 	___C(RAW, "Raw device command"),                                  \
 	___C(GET_SUPPORTED_LOGS, "Get Supported Logs"),                   \
+	___C(GET_FW_INFO, "Get FW Info"),                                 \
+	___C(GET_PARTITION_INFO, "Get Partition Information"),            \
+	___C(GET_LSA, "Get Label Storage Area"),                          \
+	___C(GET_HEALTH_INFO, "Get Health Info"),                         \
+	___C(GET_LOG, "Get Log"),                                         \
 	___C(MAX, "invalid / last command")
 
 #define ___C(a, b) CXL_MEM_COMMAND_ID_##a
-- 
cgit v1.2.3


From fade5cad9339a627c5ad029e3577582b6292df03 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Fri, 15 Jan 2021 13:46:03 +0800
Subject: initrd: Add the preprocessor guard in initrd.h

Add the preprocessor guard in initrd.h to prevent possible
build error from the multiple inclusion of same header file
multiple time.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 include/linux/initrd.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/initrd.h b/include/linux/initrd.h
index 8db6f8c8030b..fc30ac30e10e 100644
--- a/include/linux/initrd.h
+++ b/include/linux/initrd.h
@@ -1,5 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+#ifndef __LINUX_INITRD_H
+#define __LINUX_INITRD_H
+
 #define INITRD_MINOR 250 /* shouldn't collide with /dev/ram* too soon ... */
 
 /* starting block # of image */
@@ -24,3 +27,5 @@ extern char __initramfs_start[];
 extern unsigned long __initramfs_size;
 
 void console_on_rootfs(void);
+
+#endif /* __LINUX_INITRD_H */
-- 
cgit v1.2.3


From c72160fe05fb978ad859ba053c4462c2bb960b13 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Fri, 15 Jan 2021 13:46:04 +0800
Subject: initramfs: Provide a common initrd reserve function

Some architectures(eg, ARM and riscv) have similar logic to
check and reserve the memory of initrd, let's provide a common
function reserve_initrd_mem() to reduce duplicated code.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 include/linux/initrd.h |  6 ++++++
 init/initramfs.c       | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

(limited to 'include')

diff --git a/include/linux/initrd.h b/include/linux/initrd.h
index fc30ac30e10e..85c15717af34 100644
--- a/include/linux/initrd.h
+++ b/include/linux/initrd.h
@@ -18,6 +18,12 @@ extern int initrd_below_start_ok;
 extern unsigned long initrd_start, initrd_end;
 extern void free_initrd_mem(unsigned long, unsigned long);
 
+#ifdef CONFIG_BLK_DEV_INITRD
+extern void __init reserve_initrd_mem(void);
+#else
+static inline void __init reserve_initrd_mem(void) {}
+#endif
+
 extern phys_addr_t phys_initrd_start;
 extern unsigned long phys_initrd_size;
 
diff --git a/init/initramfs.c b/init/initramfs.c
index 55b74d7e5260..f75c89e9d602 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -535,6 +535,51 @@ extern unsigned long __initramfs_size;
 #include <linux/initrd.h>
 #include <linux/kexec.h>
 
+void __init reserve_initrd_mem(void)
+{
+	phys_addr_t start;
+	unsigned long size;
+
+	/* Ignore the virtul address computed during device tree parsing */
+	initrd_start = initrd_end = 0;
+
+	if (!phys_initrd_size)
+		return;
+	/*
+	 * Round the memory region to page boundaries as per free_initrd_mem()
+	 * This allows us to detect whether the pages overlapping the initrd
+	 * are in use, but more importantly, reserves the entire set of pages
+	 * as we don't want these pages allocated for other purposes.
+	 */
+	start = round_down(phys_initrd_start, PAGE_SIZE);
+	size = phys_initrd_size + (phys_initrd_start - start);
+	size = round_up(size, PAGE_SIZE);
+
+	if (!memblock_is_region_memory(start, size)) {
+		pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region",
+		       (u64)start, size);
+		goto disable;
+	}
+
+	if (memblock_is_region_reserved(start, size)) {
+		pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region\n",
+		       (u64)start, size);
+		goto disable;
+	}
+
+	memblock_reserve(start, size);
+	/* Now convert initrd to virtual addresses */
+	initrd_start = (unsigned long)__va(phys_initrd_start);
+	initrd_end = initrd_start + phys_initrd_size;
+	initrd_below_start_ok = 1;
+
+	return;
+disable:
+	pr_cont(" - disabling initrd\n");
+	initrd_start = 0;
+	initrd_end = 0;
+}
+
 void __weak __init free_initrd_mem(unsigned long start, unsigned long end)
 {
 #ifdef CONFIG_ARCH_KEEP_MEMBLOCK
-- 
cgit v1.2.3


From af0bfab907a011e146304d20d81dddce4e4d62d0 Mon Sep 17 00:00:00 2001
From: Abanoub Sameh <abanoubsameh8@gmail.com>
Date: Fri, 11 Dec 2020 22:42:08 +0200
Subject: leds: led-core: Get rid of enum led_brightness

This gets rid of enum led_brightness in the main led files,
because it is deprecated, and an unsigned int can be used instead.

We can get rid of led_brightness completely and
patches can also be supplied for the other drivers' files.

Signed-off-by: Abanoub Sameh <abanoubsameh@protonmail.com>
Signed-off-by: Pavel Machek <pavel@ucw.cz>
---
 drivers/leds/led-class.c |  3 +--
 drivers/leds/led-core.c  | 20 +++++++-------------
 drivers/leds/leds.h      |  6 ++----
 include/linux/leds.h     | 12 +++++-------
 4 files changed, 15 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 131ca83f5fb3..2e495ff67856 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -145,8 +145,7 @@ static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev)
 	device_remove_file(led_cdev->dev, &dev_attr_brightness_hw_changed);
 }
 
-void led_classdev_notify_brightness_hw_changed(struct led_classdev *led_cdev,
-					       enum led_brightness brightness)
+void led_classdev_notify_brightness_hw_changed(struct led_classdev *led_cdev, unsigned int brightness)
 {
 	if (WARN_ON(!led_cdev->brightness_hw_changed_kn))
 		return;
diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c
index c4e780bdb385..8eb8054ef9c6 100644
--- a/drivers/leds/led-core.c
+++ b/drivers/leds/led-core.c
@@ -39,8 +39,7 @@ const char * const led_colors[LED_COLOR_ID_MAX] = {
 };
 EXPORT_SYMBOL_GPL(led_colors);
 
-static int __led_set_brightness(struct led_classdev *led_cdev,
-				enum led_brightness value)
+static int __led_set_brightness(struct led_classdev *led_cdev, unsigned int value)
 {
 	if (!led_cdev->brightness_set)
 		return -ENOTSUPP;
@@ -50,8 +49,7 @@ static int __led_set_brightness(struct led_classdev *led_cdev,
 	return 0;
 }
 
-static int __led_set_brightness_blocking(struct led_classdev *led_cdev,
-					 enum led_brightness value)
+static int __led_set_brightness_blocking(struct led_classdev *led_cdev, unsigned int value)
 {
 	if (!led_cdev->brightness_set_blocking)
 		return -ENOTSUPP;
@@ -240,8 +238,7 @@ void led_stop_software_blink(struct led_classdev *led_cdev)
 }
 EXPORT_SYMBOL_GPL(led_stop_software_blink);
 
-void led_set_brightness(struct led_classdev *led_cdev,
-			enum led_brightness brightness)
+void led_set_brightness(struct led_classdev *led_cdev, unsigned int brightness)
 {
 	/*
 	 * If software blink is active, delay brightness setting
@@ -253,7 +250,7 @@ void led_set_brightness(struct led_classdev *led_cdev,
 		 * work queue task to avoid problems in case we are called
 		 * from hard irq context.
 		 */
-		if (brightness == LED_OFF) {
+		if (!brightness) {
 			set_bit(LED_BLINK_DISABLE, &led_cdev->work_flags);
 			schedule_work(&led_cdev->set_brightness_work);
 		} else {
@@ -268,8 +265,7 @@ void led_set_brightness(struct led_classdev *led_cdev,
 }
 EXPORT_SYMBOL_GPL(led_set_brightness);
 
-void led_set_brightness_nopm(struct led_classdev *led_cdev,
-			      enum led_brightness value)
+void led_set_brightness_nopm(struct led_classdev *led_cdev, unsigned int value)
 {
 	/* Use brightness_set op if available, it is guaranteed not to sleep */
 	if (!__led_set_brightness(led_cdev, value))
@@ -281,8 +277,7 @@ void led_set_brightness_nopm(struct led_classdev *led_cdev,
 }
 EXPORT_SYMBOL_GPL(led_set_brightness_nopm);
 
-void led_set_brightness_nosleep(struct led_classdev *led_cdev,
-				enum led_brightness value)
+void led_set_brightness_nosleep(struct led_classdev *led_cdev, unsigned int value)
 {
 	led_cdev->brightness = min(value, led_cdev->max_brightness);
 
@@ -293,8 +288,7 @@ void led_set_brightness_nosleep(struct led_classdev *led_cdev,
 }
 EXPORT_SYMBOL_GPL(led_set_brightness_nosleep);
 
-int led_set_brightness_sync(struct led_classdev *led_cdev,
-			    enum led_brightness value)
+int led_set_brightness_sync(struct led_classdev *led_cdev, unsigned int value)
 {
 	if (led_cdev->blink_delay_on || led_cdev->blink_delay_off)
 		return -EBUSY;
diff --git a/drivers/leds/leds.h b/drivers/leds/leds.h
index 2d9eb48bbed9..345062ccabda 100644
--- a/drivers/leds/leds.h
+++ b/drivers/leds/leds.h
@@ -19,10 +19,8 @@ static inline int led_get_brightness(struct led_classdev *led_cdev)
 
 void led_init_core(struct led_classdev *led_cdev);
 void led_stop_software_blink(struct led_classdev *led_cdev);
-void led_set_brightness_nopm(struct led_classdev *led_cdev,
-				enum led_brightness value);
-void led_set_brightness_nosleep(struct led_classdev *led_cdev,
-				enum led_brightness value);
+void led_set_brightness_nopm(struct led_classdev *led_cdev, unsigned int value);
+void led_set_brightness_nosleep(struct led_classdev *led_cdev, unsigned int value);
 ssize_t led_trigger_read(struct file *filp, struct kobject *kobj,
 			struct bin_attribute *attr, char *buf,
 			loff_t pos, size_t count);
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 6a8d6409c993..329fd914cf24 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -63,8 +63,8 @@ struct led_hw_trigger_type {
 
 struct led_classdev {
 	const char		*name;
-	enum led_brightness	 brightness;
-	enum led_brightness	 max_brightness;
+	unsigned int brightness;
+	unsigned int max_brightness;
 	int			 flags;
 
 	/* Lower 16 bits reflect status */
@@ -253,8 +253,7 @@ void led_blink_set_oneshot(struct led_classdev *led_cdev,
  * software blink timer that implements blinking when the
  * hardware doesn't. This function is guaranteed not to sleep.
  */
-void led_set_brightness(struct led_classdev *led_cdev,
-			enum led_brightness brightness);
+void led_set_brightness(struct led_classdev *led_cdev, unsigned int brightness);
 
 /**
  * led_set_brightness_sync - set LED brightness synchronously
@@ -267,8 +266,7 @@ void led_set_brightness(struct led_classdev *led_cdev,
  *
  * Returns: 0 on success or negative error value on failure
  */
-int led_set_brightness_sync(struct led_classdev *led_cdev,
-			    enum led_brightness value);
+int led_set_brightness_sync(struct led_classdev *led_cdev, unsigned int value);
 
 /**
  * led_update_brightness - update LED brightness
@@ -565,7 +563,7 @@ static inline void ledtrig_cpu(enum cpu_led_event evt)
 
 #ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED
 void led_classdev_notify_brightness_hw_changed(
-	struct led_classdev *led_cdev, enum led_brightness brightness);
+	struct led_classdev *led_cdev, unsigned int brightness);
 #else
 static inline void led_classdev_notify_brightness_hw_changed(
 	struct led_classdev *led_cdev, enum led_brightness brightness) { }
-- 
cgit v1.2.3


From 8e5c38a33c84935d66cfcf23c96960b6c4b484ef Mon Sep 17 00:00:00 2001
From: Gene Chen <gene_chen@richtek.com>
Date: Mon, 21 Dec 2020 18:45:50 +0800
Subject: leds: flash: Add flash registration with undefined
 CONFIG_LEDS_CLASS_FLASH

Add flash registration with undefined CONFIG_LEDS_CLASS_FLASH,
and move the same registration functions outside of #ifdef block.

Signed-off-by: Gene Chen <gene_chen@richtek.com>
Acked-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
Signed-off-by: Pavel Machek <pavel@ucw.cz>
---
 include/linux/led-class-flash.h | 42 ++++++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/led-class-flash.h b/include/linux/led-class-flash.h
index 21a3358a1731..612b4cab3819 100644
--- a/include/linux/led-class-flash.h
+++ b/include/linux/led-class-flash.h
@@ -85,6 +85,7 @@ static inline struct led_classdev_flash *lcdev_to_flcdev(
 	return container_of(lcdev, struct led_classdev_flash, led_cdev);
 }
 
+#if IS_ENABLED(CONFIG_LEDS_CLASS_FLASH)
 /**
  * led_classdev_flash_register_ext - register a new object of LED class with
  *				     init data and with support for flash LEDs
@@ -98,12 +99,6 @@ int led_classdev_flash_register_ext(struct device *parent,
 				    struct led_classdev_flash *fled_cdev,
 				    struct led_init_data *init_data);
 
-static inline int led_classdev_flash_register(struct device *parent,
-					   struct led_classdev_flash *fled_cdev)
-{
-	return led_classdev_flash_register_ext(parent, fled_cdev, NULL);
-}
-
 /**
  * led_classdev_flash_unregister - unregisters an object of led_classdev class
  *				   with support for flash LEDs
@@ -118,15 +113,44 @@ int devm_led_classdev_flash_register_ext(struct device *parent,
 				     struct led_init_data *init_data);
 
 
+void devm_led_classdev_flash_unregister(struct device *parent,
+					struct led_classdev_flash *fled_cdev);
+
+#else
+
+static inline int led_classdev_flash_register_ext(struct device *parent,
+				    struct led_classdev_flash *fled_cdev,
+				    struct led_init_data *init_data)
+{
+	return 0;
+}
+
+static inline void led_classdev_flash_unregister(struct led_classdev_flash *fled_cdev) {};
+static inline int devm_led_classdev_flash_register_ext(struct device *parent,
+				     struct led_classdev_flash *fled_cdev,
+				     struct led_init_data *init_data)
+{
+	return 0;
+}
+
+static inline void devm_led_classdev_flash_unregister(struct device *parent,
+					struct led_classdev_flash *fled_cdev)
+{};
+
+#endif  /* IS_ENABLED(CONFIG_LEDS_CLASS_FLASH) */
+
+static inline int led_classdev_flash_register(struct device *parent,
+					   struct led_classdev_flash *fled_cdev)
+{
+	return led_classdev_flash_register_ext(parent, fled_cdev, NULL);
+}
+
 static inline int devm_led_classdev_flash_register(struct device *parent,
 				     struct led_classdev_flash *fled_cdev)
 {
 	return devm_led_classdev_flash_register_ext(parent, fled_cdev, NULL);
 }
 
-void devm_led_classdev_flash_unregister(struct device *parent,
-					struct led_classdev_flash *fled_cdev);
-
 /**
  * led_set_flash_strobe - setup flash strobe
  * @fled_cdev: the flash LED to set strobe on
-- 
cgit v1.2.3


From 6039b7e87be0b350a5f8fc135adfb5d1f4ba66ad Mon Sep 17 00:00:00 2001
From: Gene Chen <gene_chen@richtek.com>
Date: Mon, 21 Dec 2020 18:45:51 +0800
Subject: leds: flash: Fix multicolor no-ops registration by return 0

Fix multicolor no-ops registration by return 0,
and move the same registration functions outside of #ifdef block.

Signed-off-by: Gene Chen <gene_chen@richtek.com>
Acked-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
Signed-off-by: Pavel Machek <pavel@ucw.cz>
---
 include/linux/led-class-multicolor.h | 42 +++++++++++++-----------------------
 1 file changed, 15 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/led-class-multicolor.h b/include/linux/led-class-multicolor.h
index 5116f9a866cc..210d57bcd767 100644
--- a/include/linux/led-class-multicolor.h
+++ b/include/linux/led-class-multicolor.h
@@ -44,12 +44,6 @@ int led_classdev_multicolor_register_ext(struct device *parent,
 					    struct led_classdev_mc *mcled_cdev,
 					    struct led_init_data *init_data);
 
-static inline int led_classdev_multicolor_register(struct device *parent,
-					    struct led_classdev_mc *mcled_cdev)
-{
-	return led_classdev_multicolor_register_ext(parent, mcled_cdev, NULL);
-}
-
 /**
  * led_classdev_multicolor_unregister - unregisters an object of led_classdev
  *					class with support for multicolor LEDs
@@ -68,13 +62,6 @@ int devm_led_classdev_multicolor_register_ext(struct device *parent,
 					  struct led_classdev_mc *mcled_cdev,
 					  struct led_init_data *init_data);
 
-static inline int devm_led_classdev_multicolor_register(struct device *parent,
-				     struct led_classdev_mc *mcled_cdev)
-{
-	return devm_led_classdev_multicolor_register_ext(parent, mcled_cdev,
-							 NULL);
-}
-
 void devm_led_classdev_multicolor_unregister(struct device *parent,
 					    struct led_classdev_mc *mcled_cdev);
 #else
@@ -83,27 +70,33 @@ static inline int led_classdev_multicolor_register_ext(struct device *parent,
 					    struct led_classdev_mc *mcled_cdev,
 					    struct led_init_data *init_data)
 {
-	return -EINVAL;
-}
-
-static inline int led_classdev_multicolor_register(struct device *parent,
-					    struct led_classdev_mc *mcled_cdev)
-{
-	return led_classdev_multicolor_register_ext(parent, mcled_cdev, NULL);
+	return 0;
 }
 
 static inline void led_classdev_multicolor_unregister(struct led_classdev_mc *mcled_cdev) {};
 static inline int led_mc_calc_color_components(struct led_classdev_mc *mcled_cdev,
 					       enum led_brightness brightness)
 {
-	return -EINVAL;
+	return 0;
 }
 
 static inline int devm_led_classdev_multicolor_register_ext(struct device *parent,
 					  struct led_classdev_mc *mcled_cdev,
 					  struct led_init_data *init_data)
 {
-	return -EINVAL;
+	return 0;
+}
+
+static inline void devm_led_classdev_multicolor_unregister(struct device *parent,
+					    struct led_classdev_mc *mcled_cdev)
+{};
+
+#endif  /* IS_ENABLED(CONFIG_LEDS_CLASS_MULTICOLOR) */
+
+static inline int led_classdev_multicolor_register(struct device *parent,
+					    struct led_classdev_mc *mcled_cdev)
+{
+	return led_classdev_multicolor_register_ext(parent, mcled_cdev, NULL);
 }
 
 static inline int devm_led_classdev_multicolor_register(struct device *parent,
@@ -113,9 +106,4 @@ static inline int devm_led_classdev_multicolor_register(struct device *parent,
 							 NULL);
 }
 
-static inline void devm_led_classdev_multicolor_unregister(struct device *parent,
-					    struct led_classdev_mc *mcled_cdev)
-{};
-
-#endif  /* IS_ENABLED(CONFIG_LEDS_CLASS_MULTICOLOR) */
 #endif	/* _LINUX_MULTICOLOR_LEDS_H_INCLUDED */
-- 
cgit v1.2.3


From 36950f2da1ea4cb683be174f6f581e25b2d33e71 Mon Sep 17 00:00:00 2001
From: Jianxiong Gao <jxgao@google.com>
Date: Mon, 1 Feb 2021 10:30:15 -0800
Subject: driver core: add a min_align_mask field to struct
 device_dma_parameters

Some devices rely on the address offset in a page to function
correctly (NVMe driver as an example). These devices may use
a different page size than the Linux kernel. The address offset
has to be preserved upon mapping, and in order to do so, we
need to record the page_offset_mask first.

Signed-off-by: Jianxiong Gao <jxgao@google.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 include/linux/device.h      |  1 +
 include/linux/dma-mapping.h | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index 1779f90eeb4c..7960bf516dd7 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -291,6 +291,7 @@ struct device_dma_parameters {
 	 * sg limitations.
 	 */
 	unsigned int max_segment_size;
+	unsigned int min_align_mask;
 	unsigned long segment_boundary_mask;
 };
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 2e49996a8f39..9c26225754e7 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -500,6 +500,22 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask)
 	return -EIO;
 }
 
+static inline unsigned int dma_get_min_align_mask(struct device *dev)
+{
+	if (dev->dma_parms)
+		return dev->dma_parms->min_align_mask;
+	return 0;
+}
+
+static inline int dma_set_min_align_mask(struct device *dev,
+		unsigned int min_align_mask)
+{
+	if (WARN_ON_ONCE(!dev->dma_parms))
+		return -EIO;
+	dev->dma_parms->min_align_mask = min_align_mask;
+	return 0;
+}
+
 static inline int dma_get_cache_alignment(void)
 {
 #ifdef ARCH_DMA_MINALIGN
-- 
cgit v1.2.3


From b5d7ccb7aac3895c2138fe0980a109116ce15eff Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Feb 2021 11:18:40 +0100
Subject: swiotlb: add a IO_TLB_SIZE define

Add a new IO_TLB_SIZE define instead open coding it using
IO_TLB_SHIFT all over.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jianxiong Gao <jxgao@google.com>
Tested-by: Jianxiong Gao <jxgao@google.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 include/linux/swiotlb.h |  1 +
 kernel/dma/swiotlb.c    | 14 +++++++-------
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index d9c9fc9ca5d2..5857a937c637 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -29,6 +29,7 @@ enum swiotlb_force {
  * controllable.
  */
 #define IO_TLB_SHIFT 11
+#define IO_TLB_SIZE (1 << IO_TLB_SHIFT)
 
 /* default to 64MB */
 #define IO_TLB_DEFAULT_SIZE (64UL<<20)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 7c42df6e6100..768187d0e17e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -171,7 +171,7 @@ void __init swiotlb_adjust_size(unsigned long new_size)
 	 * adjust/expand SWIOTLB size for their use.
 	 */
 	if (!io_tlb_nslabs) {
-		size = ALIGN(new_size, 1 << IO_TLB_SHIFT);
+		size = ALIGN(new_size, IO_TLB_SIZE);
 		io_tlb_nslabs = size >> IO_TLB_SHIFT;
 		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
 
@@ -491,20 +491,20 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
 
 	tbl_dma_addr &= mask;
 
-	offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+	offset_slots = ALIGN(tbl_dma_addr, IO_TLB_SIZE) >> IO_TLB_SHIFT;
 
 	/*
 	 * Carefully handle integer overflow which can occur when mask == ~0UL.
 	 */
 	max_slots = mask + 1
-		    ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
+		    ? ALIGN(mask + 1, IO_TLB_SIZE) >> IO_TLB_SHIFT
 		    : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
 
 	/*
 	 * For mappings greater than or equal to a page, we limit the stride
 	 * (and hence alignment) to a page size.
 	 */
-	nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+	nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT;
 	if (alloc_size >= PAGE_SIZE)
 		stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
 	else
@@ -598,7 +598,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 			      enum dma_data_direction dir, unsigned long attrs)
 {
 	unsigned long flags;
-	int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+	int i, count, nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT;
 	int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
 	phys_addr_t orig_addr = io_tlb_orig_addr[index];
 
@@ -649,7 +649,7 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
 
 	if (orig_addr == INVALID_PHYS_ADDR)
 		return;
-	orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1);
+	orig_addr += (unsigned long)tlb_addr & (IO_TLB_SIZE - 1);
 
 	switch (target) {
 	case SYNC_FOR_CPU:
@@ -707,7 +707,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
 
 size_t swiotlb_max_mapping_size(struct device *dev)
 {
-	return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE;
+	return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
 }
 
 bool is_swiotlb_active(void)
-- 
cgit v1.2.3


From 5aa75ed5b93f086c455a3c67239b0471ff5a1526 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 16 Feb 2021 12:56:50 -0700
Subject: io_uring: tie async worker side to the task context

Move it outside of the io_ring_ctx, and tie it to the io_uring task
context.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c            | 84 ++++++++++++++++++++----------------------------
 include/linux/io_uring.h |  1 +
 2 files changed, 35 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d6c2ff6124fd..31402a19fca6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -366,9 +366,6 @@ struct io_ring_ctx {
 
 	struct io_rings	*rings;
 
-	/* IO offload */
-	struct io_wq		*io_wq;
-
 	/*
 	 * For SQPOLL usage - we hold a reference to the parent task, so we
 	 * have access to the ->files
@@ -1634,10 +1631,11 @@ static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_kiocb *link = io_prep_linked_timeout(req);
+	struct io_uring_task *tctx = req->task->io_uring;
 
 	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
 					&req->work, req->flags);
-	io_wq_enqueue(ctx->io_wq, &req->work);
+	io_wq_enqueue(tctx->io_wq, &req->work);
 	return link;
 }
 
@@ -5960,12 +5958,15 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data)
 	return req->user_data == (unsigned long) data;
 }
 
-static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
+static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
 {
 	enum io_wq_cancel cancel_ret;
 	int ret = 0;
 
-	cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
+	if (!tctx->io_wq)
+		return -ENOENT;
+
+	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
 	switch (cancel_ret) {
 	case IO_WQ_CANCEL_OK:
 		ret = 0;
@@ -5988,7 +5989,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
 	unsigned long flags;
 	int ret;
 
-	ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
+	ret = io_async_cancel_one(req->task->io_uring,
+					(void *) (unsigned long) sqe_addr);
 	if (ret != -ENOENT) {
 		spin_lock_irqsave(&ctx->completion_lock, flags);
 		goto done;
@@ -7537,16 +7539,6 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 	}
 }
 
-static void io_finish_async(struct io_ring_ctx *ctx)
-{
-	io_sq_thread_stop(ctx);
-
-	if (ctx->io_wq) {
-		io_wq_destroy(ctx->io_wq);
-		ctx->io_wq = NULL;
-	}
-}
-
 #if defined(CONFIG_UNIX)
 /*
  * Ensure the UNIX gc is aware of our file set, so we are certain that
@@ -8105,11 +8097,10 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work)
 	return req ? &req->work : NULL;
 }
 
-static int io_init_wq_offload(struct io_ring_ctx *ctx)
+static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
 {
 	struct io_wq_data data;
 	unsigned int concurrency;
-	int ret = 0;
 
 	data.user = ctx->user;
 	data.free_work = io_free_work;
@@ -8118,16 +8109,11 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx)
 	/* Do QD, or 4 * CPUS, whatever is smallest */
 	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
 
-	ctx->io_wq = io_wq_create(concurrency, &data);
-	if (IS_ERR(ctx->io_wq)) {
-		ret = PTR_ERR(ctx->io_wq);
-		ctx->io_wq = NULL;
-	}
-
-	return ret;
+	return io_wq_create(concurrency, &data);
 }
 
-static int io_uring_alloc_task_context(struct task_struct *task)
+static int io_uring_alloc_task_context(struct task_struct *task,
+				       struct io_ring_ctx *ctx)
 {
 	struct io_uring_task *tctx;
 	int ret;
@@ -8142,6 +8128,14 @@ static int io_uring_alloc_task_context(struct task_struct *task)
 		return ret;
 	}
 
+	tctx->io_wq = io_init_wq_offload(ctx);
+	if (IS_ERR(tctx->io_wq)) {
+		ret = PTR_ERR(tctx->io_wq);
+		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx);
+		return ret;
+	}
+
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
 	tctx->last = NULL;
@@ -8214,7 +8208,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 			ctx->sq_thread_idle = HZ;
 
 		if (sqd->thread)
-			goto done;
+			return 0;
 
 		if (p->flags & IORING_SETUP_SQ_AFF) {
 			int cpu = p->sq_thread_cpu;
@@ -8236,7 +8230,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 			sqd->thread = NULL;
 			goto err;
 		}
-		ret = io_uring_alloc_task_context(sqd->thread);
+		ret = io_uring_alloc_task_context(sqd->thread, ctx);
 		if (ret)
 			goto err;
 	} else if (p->flags & IORING_SETUP_SQ_AFF) {
@@ -8245,14 +8239,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 		goto err;
 	}
 
-done:
-	ret = io_init_wq_offload(ctx);
-	if (ret)
-		goto err;
-
 	return 0;
 err:
-	io_finish_async(ctx);
+	io_sq_thread_stop(ctx);
 	return ret;
 }
 
@@ -8727,7 +8716,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	mutex_lock(&ctx->uring_lock);
 	mutex_unlock(&ctx->uring_lock);
 
-	io_finish_async(ctx);
+	io_sq_thread_stop(ctx);
 	io_sqe_buffers_unregister(ctx);
 
 	if (ctx->sqo_task) {
@@ -8870,13 +8859,6 @@ static void io_ring_exit_work(struct work_struct *work)
 	io_ring_ctx_free(ctx);
 }
 
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
-{
-	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
-	return req->ctx == data;
-}
-
 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 {
 	mutex_lock(&ctx->uring_lock);
@@ -8895,9 +8877,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	io_kill_timeouts(ctx, NULL, NULL);
 	io_poll_remove_all(ctx, NULL, NULL);
 
-	if (ctx->io_wq)
-		io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true);
-
 	/* if we failed setting up the ctx, we might not have any rings */
 	io_iopoll_try_reap_events(ctx);
 
@@ -8976,13 +8955,14 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 struct files_struct *files)
 {
 	struct io_task_cancel cancel = { .task = task, .files = files, };
+	struct io_uring_task *tctx = current->io_uring;
 
 	while (1) {
 		enum io_wq_cancel cret;
 		bool ret = false;
 
-		if (ctx->io_wq) {
-			cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
+		if (tctx && tctx->io_wq) {
+			cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
 					       &cancel, true);
 			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
 		}
@@ -9094,7 +9074,7 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
 	int ret;
 
 	if (unlikely(!tctx)) {
-		ret = io_uring_alloc_task_context(current);
+		ret = io_uring_alloc_task_context(current, ctx);
 		if (unlikely(ret))
 			return ret;
 		tctx = current->io_uring;
@@ -9164,8 +9144,12 @@ void __io_uring_files_cancel(struct files_struct *files)
 		io_uring_cancel_task_requests(file->private_data, files);
 	atomic_dec(&tctx->in_idle);
 
-	if (files)
+	if (files) {
 		io_uring_remove_task_files(tctx);
+	} else if (tctx->io_wq && current->flags & PF_EXITING) {
+		io_wq_destroy(tctx->io_wq);
+		tctx->io_wq = NULL;
+	}
 }
 
 static s64 tctx_inflight(struct io_uring_task *tctx)
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 2eb6d19de336..0e95398998b6 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -36,6 +36,7 @@ struct io_uring_task {
 	struct xarray		xa;
 	struct wait_queue_head	wait;
 	struct file		*last;
+	void			*io_wq;
 	struct percpu_counter	inflight;
 	struct io_identity	__identity;
 	struct io_identity	*identity;
-- 
cgit v1.2.3


From 3bfe6106693b6b4ba175ad1f929c4660b8f59ca8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 16 Feb 2021 14:15:30 -0700
Subject: io-wq: fork worker threads from original task

Instead of using regular kthread kernel threads, create kernel threads
that are like a real thread that the task would create. This ensures that
we get all the context that we need, without having to carry that state
around. This greatly reduces the code complexity, and the risk of missing
state for a given request type.

With the move away from kthread, we can also dump everything related to
assigned state to the new threads.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c            | 301 +++++++++++++++++---------------------------------
 fs/io-wq.h            |   3 +-
 fs/io_uring.c         |   7 ++
 include/linux/sched.h |   3 +
 4 files changed, 116 insertions(+), 198 deletions(-)

(limited to 'include')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index ec7f1106b659..b53f569b5b4e 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -13,12 +13,9 @@
 #include <linux/sched/mm.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
-#include <linux/kthread.h>
 #include <linux/rculist_nulls.h>
-#include <linux/fs_struct.h>
-#include <linux/blk-cgroup.h>
-#include <linux/audit.h>
 #include <linux/cpu.h>
+#include <linux/tracehook.h>
 
 #include "../kernel/sched/sched.h"
 #include "io-wq.h"
@@ -57,13 +54,6 @@ struct io_worker {
 	spinlock_t lock;
 
 	struct rcu_head rcu;
-	struct mm_struct *mm;
-#ifdef CONFIG_BLK_CGROUP
-	struct cgroup_subsys_state *blkcg_css;
-#endif
-	const struct cred *cur_creds;
-	const struct cred *saved_creds;
-	struct nsproxy *restore_nsproxy;
 };
 
 #if BITS_PER_LONG == 64
@@ -122,6 +112,8 @@ struct io_wq {
 	struct completion done;
 
 	struct hlist_node cpuhp_node;
+
+	pid_t task_pid;
 };
 
 static enum cpuhp_state io_wq_online;
@@ -137,61 +129,6 @@ static void io_worker_release(struct io_worker *worker)
 		wake_up_process(worker->task);
 }
 
-/*
- * Note: drops the wqe->lock if returning true! The caller must re-acquire
- * the lock in that case. Some callers need to restart handling if this
- * happens, so we can't just re-acquire the lock on behalf of the caller.
- */
-static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
-{
-	bool dropped_lock = false;
-
-	if (worker->saved_creds) {
-		revert_creds(worker->saved_creds);
-		worker->cur_creds = worker->saved_creds = NULL;
-	}
-
-	if (current->files) {
-		__acquire(&wqe->lock);
-		raw_spin_unlock_irq(&wqe->lock);
-		dropped_lock = true;
-
-		task_lock(current);
-		current->files = NULL;
-		current->nsproxy = worker->restore_nsproxy;
-		task_unlock(current);
-	}
-
-	if (current->fs)
-		current->fs = NULL;
-
-	/*
-	 * If we have an active mm, we need to drop the wq lock before unusing
-	 * it. If we do, return true and let the caller retry the idle loop.
-	 */
-	if (worker->mm) {
-		if (!dropped_lock) {
-			__acquire(&wqe->lock);
-			raw_spin_unlock_irq(&wqe->lock);
-			dropped_lock = true;
-		}
-		__set_current_state(TASK_RUNNING);
-		kthread_unuse_mm(worker->mm);
-		mmput(worker->mm);
-		worker->mm = NULL;
-	}
-
-#ifdef CONFIG_BLK_CGROUP
-	if (worker->blkcg_css) {
-		kthread_associate_blkcg(NULL);
-		worker->blkcg_css = NULL;
-	}
-#endif
-	if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
-		current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-	return dropped_lock;
-}
-
 static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
 						   struct io_wq_work *work)
 {
@@ -237,10 +174,6 @@ static void io_worker_exit(struct io_worker *worker)
 	raw_spin_lock_irq(&wqe->lock);
 	hlist_nulls_del_rcu(&worker->nulls_node);
 	list_del_rcu(&worker->all_list);
-	if (__io_worker_unuse(wqe, worker)) {
-		__release(&wqe->lock);
-		raw_spin_lock_irq(&wqe->lock);
-	}
 	acct->nr_workers--;
 	raw_spin_unlock_irq(&wqe->lock);
 
@@ -323,14 +256,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
 
 static void io_worker_start(struct io_worker *worker)
 {
-	allow_kernel_signal(SIGINT);
-
-	current->flags |= PF_IO_WORKER;
-	current->fs = NULL;
-	current->files = NULL;
-
 	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
-	worker->restore_nsproxy = current->nsproxy;
 	io_wqe_inc_running(worker);
 }
 
@@ -387,7 +313,7 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
 		hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 	}
 
-	return __io_worker_unuse(wqe, worker);
+	return false;
 }
 
 static inline unsigned int io_get_work_hash(struct io_wq_work *work)
@@ -426,96 +352,23 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 	return NULL;
 }
 
-static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
+static void io_flush_signals(void)
 {
-	if (worker->mm) {
-		kthread_unuse_mm(worker->mm);
-		mmput(worker->mm);
-		worker->mm = NULL;
+	if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) {
+		if (current->task_works)
+			task_work_run();
+		clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL);
 	}
-
-	if (mmget_not_zero(work->identity->mm)) {
-		kthread_use_mm(work->identity->mm);
-		worker->mm = work->identity->mm;
-		return;
-	}
-
-	/* failed grabbing mm, ensure work gets cancelled */
-	work->flags |= IO_WQ_WORK_CANCEL;
-}
-
-static inline void io_wq_switch_blkcg(struct io_worker *worker,
-				      struct io_wq_work *work)
-{
-#ifdef CONFIG_BLK_CGROUP
-	if (!(work->flags & IO_WQ_WORK_BLKCG))
-		return;
-	if (work->identity->blkcg_css != worker->blkcg_css) {
-		kthread_associate_blkcg(work->identity->blkcg_css);
-		worker->blkcg_css = work->identity->blkcg_css;
-	}
-#endif
-}
-
-static void io_wq_switch_creds(struct io_worker *worker,
-			       struct io_wq_work *work)
-{
-	const struct cred *old_creds = override_creds(work->identity->creds);
-
-	worker->cur_creds = work->identity->creds;
-	if (worker->saved_creds)
-		put_cred(old_creds); /* creds set by previous switch */
-	else
-		worker->saved_creds = old_creds;
-}
-
-static void io_impersonate_work(struct io_worker *worker,
-				struct io_wq_work *work)
-{
-	if ((work->flags & IO_WQ_WORK_FILES) &&
-	    current->files != work->identity->files) {
-		task_lock(current);
-		current->files = work->identity->files;
-		current->nsproxy = work->identity->nsproxy;
-		task_unlock(current);
-		if (!work->identity->files) {
-			/* failed grabbing files, ensure work gets cancelled */
-			work->flags |= IO_WQ_WORK_CANCEL;
-		}
-	}
-	if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
-		current->fs = work->identity->fs;
-	if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm)
-		io_wq_switch_mm(worker, work);
-	if ((work->flags & IO_WQ_WORK_CREDS) &&
-	    worker->cur_creds != work->identity->creds)
-		io_wq_switch_creds(worker, work);
-	if (work->flags & IO_WQ_WORK_FSIZE)
-		current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
-	else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
-		current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-	io_wq_switch_blkcg(worker, work);
-#ifdef CONFIG_AUDIT
-	current->loginuid = work->identity->loginuid;
-	current->sessionid = work->identity->sessionid;
-#endif
 }
 
 static void io_assign_current_work(struct io_worker *worker,
 				   struct io_wq_work *work)
 {
 	if (work) {
-		/* flush pending signals before assigning new work */
-		if (signal_pending(current))
-			flush_signals(current);
+		io_flush_signals();
 		cond_resched();
 	}
 
-#ifdef CONFIG_AUDIT
-	current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET);
-	current->sessionid = AUDIT_SID_UNSET;
-#endif
-
 	spin_lock_irq(&worker->lock);
 	worker->cur_work = work;
 	spin_unlock_irq(&worker->lock);
@@ -556,7 +409,6 @@ get_next:
 			unsigned int hash = io_get_work_hash(work);
 
 			next_hashed = wq_next_work(work);
-			io_impersonate_work(worker, work);
 			wq->do_work(work);
 			io_assign_current_work(worker, NULL);
 
@@ -608,10 +460,11 @@ loop:
 			goto loop;
 		}
 		raw_spin_unlock_irq(&wqe->lock);
-		if (signal_pending(current))
-			flush_signals(current);
+		io_flush_signals();
 		if (schedule_timeout(WORKER_IDLE_TIMEOUT))
 			continue;
+		if (fatal_signal_pending(current))
+			break;
 		/* timed out, exit unless we're the fixed worker */
 		if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
 		    !(worker->flags & IO_WORKER_F_FIXED))
@@ -635,8 +488,10 @@ loop:
  */
 void io_wq_worker_running(struct task_struct *tsk)
 {
-	struct io_worker *worker = kthread_data(tsk);
+	struct io_worker *worker = tsk->pf_io_worker;
 
+	if (!worker)
+		return;
 	if (!(worker->flags & IO_WORKER_F_UP))
 		return;
 	if (worker->flags & IO_WORKER_F_RUNNING)
@@ -652,9 +507,10 @@ void io_wq_worker_running(struct task_struct *tsk)
  */
 void io_wq_worker_sleeping(struct task_struct *tsk)
 {
-	struct io_worker *worker = kthread_data(tsk);
-	struct io_wqe *wqe = worker->wqe;
+	struct io_worker *worker = tsk->pf_io_worker;
 
+	if (!worker)
+		return;
 	if (!(worker->flags & IO_WORKER_F_UP))
 		return;
 	if (!(worker->flags & IO_WORKER_F_RUNNING))
@@ -662,32 +518,27 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
 
 	worker->flags &= ~IO_WORKER_F_RUNNING;
 
-	raw_spin_lock_irq(&wqe->lock);
+	raw_spin_lock_irq(&worker->wqe->lock);
 	io_wqe_dec_running(worker);
-	raw_spin_unlock_irq(&wqe->lock);
+	raw_spin_unlock_irq(&worker->wqe->lock);
 }
 
-static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+static int task_thread(void *data, int index)
 {
+	struct io_worker *worker = data;
+	struct io_wqe *wqe = worker->wqe;
 	struct io_wqe_acct *acct = &wqe->acct[index];
-	struct io_worker *worker;
+	struct io_wq *wq = wqe->wq;
+	char buf[TASK_COMM_LEN];
 
-	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
-	if (!worker)
-		return false;
+	sprintf(buf, "iou-wrk-%d", wq->task_pid);
+	set_task_comm(current, buf);
 
-	refcount_set(&worker->ref, 1);
-	worker->nulls_node.pprev = NULL;
-	worker->wqe = wqe;
-	spin_lock_init(&worker->lock);
+	current->pf_io_worker = worker;
+	worker->task = current;
 
-	worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
-				"io_wqe_worker-%d/%d", index, wqe->node);
-	if (IS_ERR(worker->task)) {
-		kfree(worker);
-		return false;
-	}
-	kthread_bind_mask(worker->task, cpumask_of_node(wqe->node));
+	set_cpus_allowed_ptr(current, cpumask_of_node(wqe->node));
+	current->flags |= PF_NO_SETAFFINITY;
 
 	raw_spin_lock_irq(&wqe->lock);
 	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
@@ -703,8 +554,58 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 	if (index == IO_WQ_ACCT_UNBOUND)
 		atomic_inc(&wq->user->processes);
 
+	io_wqe_worker(data);
+	do_exit(0);
+}
+
+static int task_thread_bound(void *data)
+{
+	return task_thread(data, IO_WQ_ACCT_BOUND);
+}
+
+static int task_thread_unbound(void *data)
+{
+	return task_thread(data, IO_WQ_ACCT_UNBOUND);
+}
+
+static pid_t fork_thread(int (*fn)(void *), void *arg)
+{
+	unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+				CLONE_IO|SIGCHLD;
+	struct kernel_clone_args args = {
+		.flags		= ((lower_32_bits(flags) | CLONE_VM |
+				    CLONE_UNTRACED) & ~CSIGNAL),
+		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
+		.stack		= (unsigned long)fn,
+		.stack_size	= (unsigned long)arg,
+	};
+
+	return kernel_clone(&args);
+}
+
+static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+{
+	struct io_worker *worker;
+	pid_t pid;
+
+	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
+	if (!worker)
+		return false;
+
+	refcount_set(&worker->ref, 1);
+	worker->nulls_node.pprev = NULL;
+	worker->wqe = wqe;
+	spin_lock_init(&worker->lock);
+
+	if (index == IO_WQ_ACCT_BOUND)
+		pid = fork_thread(task_thread_bound, worker);
+	else
+		pid = fork_thread(task_thread_unbound, worker);
+	if (pid < 0) {
+		kfree(worker);
+		return false;
+	}
 	refcount_inc(&wq->refs);
-	wake_up_process(worker->task);
 	return true;
 }
 
@@ -756,12 +657,17 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 static int io_wq_manager(void *data)
 {
 	struct io_wq *wq = data;
+	char buf[TASK_COMM_LEN];
 	int node;
 
-	refcount_set(&wq->refs, 1);
+	sprintf(buf, "iou-mgr-%d", wq->task_pid);
+	set_task_comm(current, buf);
+	current->flags |= PF_IO_WORKER;
+	wq->manager = current;
+
 	complete(&wq->done);
 
-	while (!kthread_should_stop()) {
+	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		for_each_node(node) {
 			struct io_wqe *wqe = wq->wqes[node];
 			bool fork_worker[2] = { false, false };
@@ -782,11 +688,13 @@ static int io_wq_manager(void *data)
 		}
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(HZ);
+		if (fatal_signal_pending(current))
+			set_bit(IO_WQ_BIT_EXIT, &wq->state);
 	}
 
 	if (refcount_dec_and_test(&wq->refs)) {
 		complete(&wq->done);
-		return 0;
+		do_exit(0);
 	}
 	/* if ERROR is set and we get here, we have workers to wake */
 	if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
@@ -795,7 +703,7 @@ static int io_wq_manager(void *data)
 			io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
 		rcu_read_unlock();
 	}
-	return 0;
+	do_exit(0);
 }
 
 static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
@@ -919,7 +827,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 	spin_lock_irqsave(&worker->lock, flags);
 	if (worker->cur_work &&
 	    match->fn(worker->cur_work, match->data)) {
-		send_sig(SIGINT, worker->task, 1);
+		set_notify_signal(worker->task);
 		match->nr_running++;
 	}
 	spin_unlock_irqrestore(&worker->lock, flags);
@@ -1075,22 +983,21 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 		INIT_LIST_HEAD(&wqe->all_list);
 	}
 
+	wq->task_pid = current->pid;
 	init_completion(&wq->done);
+	refcount_set(&wq->refs, 1);
 
-	wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
-	if (!IS_ERR(wq->manager)) {
-		wake_up_process(wq->manager);
+	current->flags |= PF_IO_WORKER;
+	ret = fork_thread(io_wq_manager, wq);
+	current->flags &= ~PF_IO_WORKER;
+	if (ret >= 0) {
 		wait_for_completion(&wq->done);
-		if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
-			ret = -ENOMEM;
-			goto err;
-		}
 		reinit_completion(&wq->done);
 		return wq;
 	}
 
-	ret = PTR_ERR(wq->manager);
-	complete(&wq->done);
+	if (refcount_dec_and_test(&wq->refs))
+		complete(&wq->done);
 err:
 	cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
 	for_each_node(node)
@@ -1110,7 +1017,7 @@ void io_wq_destroy(struct io_wq *wq)
 
 	set_bit(IO_WQ_BIT_EXIT, &wq->state);
 	if (wq->manager)
-		kthread_stop(wq->manager);
+		wake_up_process(wq->manager);
 
 	rcu_read_lock();
 	for_each_node(node)
diff --git a/fs/io-wq.h b/fs/io-wq.h
index d2cf284b4641..83d56adabd16 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -137,6 +137,7 @@ static inline void io_wq_worker_running(struct task_struct *tsk)
 
 static inline bool io_wq_current_is_worker(void)
 {
-	return in_task() && (current->flags & PF_IO_WORKER);
+	return in_task() && (current->flags & PF_IO_WORKER) &&
+		current->pf_io_worker;
 }
 #endif
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 31402a19fca6..9d22ec9d9406 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1633,6 +1633,9 @@ static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
 	struct io_kiocb *link = io_prep_linked_timeout(req);
 	struct io_uring_task *tctx = req->task->io_uring;
 
+	BUG_ON(!tctx);
+	BUG_ON(!tctx->io_wq);
+
 	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
 					&req->work, req->flags);
 	io_wq_enqueue(tctx->io_wq, &req->work);
@@ -9240,6 +9243,10 @@ static int io_uring_flush(struct file *file, void *data)
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_ring_ctx *ctx = file->private_data;
 
+	/* Ignore helper thread files exit */
+	if (current->flags & PF_IO_WORKER)
+		return 0;
+
 	if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
 		io_uring_cancel_task_requests(ctx, NULL);
 		io_req_caches_free(ctx, current);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26f499810dfa..ef00bb22164c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -895,6 +895,9 @@ struct task_struct {
 	/* CLONE_CHILD_CLEARTID: */
 	int __user			*clear_child_tid;
 
+	/* PF_IO_WORKER */
+	void				*pf_io_worker;
+
 	u64				utime;
 	u64				stime;
 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
-- 
cgit v1.2.3


From 4379bf8bd70b5de6bba7d53015b0c36c57a634ee Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 15 Feb 2021 13:40:22 -0700
Subject: io_uring: remove io_identity

We are no longer grabbing state, so no need to maintain an IO identity
that we COW if there are changes.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c               |  26 ++++++++++++
 fs/io-wq.h               |   2 +-
 fs/io_uring.c            | 104 ++++++++++++-----------------------------------
 include/linux/io_uring.h |  19 ---------
 4 files changed, 52 insertions(+), 99 deletions(-)

(limited to 'include')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 41042119bf0f..acc67ed3a52c 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -53,6 +53,9 @@ struct io_worker {
 	struct io_wq_work *cur_work;
 	spinlock_t lock;
 
+	const struct cred *cur_creds;
+	const struct cred *saved_creds;
+
 	struct rcu_head rcu;
 };
 
@@ -171,6 +174,11 @@ static void io_worker_exit(struct io_worker *worker)
 	worker->flags = 0;
 	preempt_enable();
 
+	if (worker->saved_creds) {
+		revert_creds(worker->saved_creds);
+		worker->cur_creds = worker->saved_creds = NULL;
+	}
+
 	raw_spin_lock_irq(&wqe->lock);
 	hlist_nulls_del_rcu(&worker->nulls_node);
 	list_del_rcu(&worker->all_list);
@@ -312,6 +320,10 @@ static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
 		worker->flags |= IO_WORKER_F_FREE;
 		hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 	}
+	if (worker->saved_creds) {
+		revert_creds(worker->saved_creds);
+		worker->cur_creds = worker->saved_creds = NULL;
+	}
 }
 
 static inline unsigned int io_get_work_hash(struct io_wq_work *work)
@@ -359,6 +371,18 @@ static void io_flush_signals(void)
 	}
 }
 
+static void io_wq_switch_creds(struct io_worker *worker,
+			       struct io_wq_work *work)
+{
+	const struct cred *old_creds = override_creds(work->creds);
+
+	worker->cur_creds = work->creds;
+	if (worker->saved_creds)
+		put_cred(old_creds); /* creds set by previous switch */
+	else
+		worker->saved_creds = old_creds;
+}
+
 static void io_assign_current_work(struct io_worker *worker,
 				   struct io_wq_work *work)
 {
@@ -407,6 +431,8 @@ get_next:
 			unsigned int hash = io_get_work_hash(work);
 
 			next_hashed = wq_next_work(work);
+			if (work->creds && worker->cur_creds != work->creds)
+				io_wq_switch_creds(worker, work);
 			wq->do_work(work);
 			io_assign_current_work(worker, NULL);
 
diff --git a/fs/io-wq.h b/fs/io-wq.h
index bbe05dd54716..584f0bd5a83d 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -78,7 +78,7 @@ static inline void wq_list_del(struct io_wq_work_list *list,
 
 struct io_wq_work {
 	struct io_wq_work_node list;
-	struct io_identity *identity;
+	const struct cred *creds;
 	unsigned flags;
 };
 
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6e88295758b5..6d851033e48d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1094,7 +1094,7 @@ static bool io_match_task(struct io_kiocb *head,
 			continue;
 		if (req->file && req->file->f_op == &io_uring_fops)
 			return true;
-		if (req->work.identity->files == files)
+		if (req->task->files == files)
 			return true;
 	}
 	return false;
@@ -1218,31 +1218,6 @@ static inline void req_set_fail_links(struct io_kiocb *req)
 		req->flags |= REQ_F_FAIL_LINK;
 }
 
-/*
- * None of these are dereferenced, they are simply used to check if any of
- * them have changed. If we're under current and check they are still the
- * same, we're fine to grab references to them for actual out-of-line use.
- */
-static void io_init_identity(struct io_identity *id)
-{
-	id->files = current->files;
-	id->mm = current->mm;
-#ifdef CONFIG_BLK_CGROUP
-	rcu_read_lock();
-	id->blkcg_css = blkcg_css();
-	rcu_read_unlock();
-#endif
-	id->creds = current_cred();
-	id->nsproxy = current->nsproxy;
-	id->fs = current->fs;
-	id->fsize = rlimit(RLIMIT_FSIZE);
-#ifdef CONFIG_AUDIT
-	id->loginuid = current->loginuid;
-	id->sessionid = current->sessionid;
-#endif
-	refcount_set(&id->count, 1);
-}
-
 static inline void __io_req_init_async(struct io_kiocb *req)
 {
 	memset(&req->work, 0, sizeof(req->work));
@@ -1255,17 +1230,10 @@ static inline void __io_req_init_async(struct io_kiocb *req)
  */
 static inline void io_req_init_async(struct io_kiocb *req)
 {
-	struct io_uring_task *tctx = current->io_uring;
-
 	if (req->flags & REQ_F_WORK_INITIALIZED)
 		return;
 
 	__io_req_init_async(req);
-
-	/* Grab a ref if this isn't our static identity */
-	req->work.identity = tctx->identity;
-	if (tctx->identity != &tctx->__identity)
-		refcount_inc(&req->work.identity->count);
 }
 
 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@ -1350,19 +1318,15 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
 	return false;
 }
 
-static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
-{
-	if (req->work.identity == &tctx->__identity)
-		return;
-	if (refcount_dec_and_test(&req->work.identity->count))
-		kfree(req->work.identity);
-}
-
 static void io_req_clean_work(struct io_kiocb *req)
 {
 	if (!(req->flags & REQ_F_WORK_INITIALIZED))
 		return;
 
+	if (req->work.creds) {
+		put_cred(req->work.creds);
+		req->work.creds = NULL;
+	}
 	if (req->flags & REQ_F_INFLIGHT) {
 		struct io_ring_ctx *ctx = req->ctx;
 		struct io_uring_task *tctx = req->task->io_uring;
@@ -1377,7 +1341,6 @@ static void io_req_clean_work(struct io_kiocb *req)
 	}
 
 	req->flags &= ~REQ_F_WORK_INITIALIZED;
-	io_put_identity(req->task->io_uring, req);
 }
 
 static void io_req_track_inflight(struct io_kiocb *req)
@@ -1411,6 +1374,8 @@ static void io_prep_async_work(struct io_kiocb *req)
 		if (def->unbound_nonreg_file)
 			req->work.flags |= IO_WQ_WORK_UNBOUND;
 	}
+	if (!req->work.creds)
+		req->work.creds = get_current_cred();
 }
 
 static void io_prep_async_link(struct io_kiocb *req)
@@ -6376,9 +6341,9 @@ static void __io_queue_sqe(struct io_kiocb *req)
 	const struct cred *old_creds = NULL;
 	int ret;
 
-	if ((req->flags & REQ_F_WORK_INITIALIZED) &&
-	    req->work.identity->creds != current_cred())
-		old_creds = override_creds(req->work.identity->creds);
+	if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
+	    req->work.creds != current_cred())
+		old_creds = override_creds(req->work.creds);
 
 	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
 
@@ -6508,16 +6473,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	id = READ_ONCE(sqe->personality);
 	if (id) {
-		struct io_identity *iod;
-
-		iod = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!iod))
-			return -EINVAL;
-		refcount_inc(&iod->count);
-
 		__io_req_init_async(req);
-		get_cred(iod->creds);
-		req->work.identity = iod;
+		req->work.creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->work.creds))
+			return -EINVAL;
+		get_cred(req->work.creds);
 	}
 
 	state = &ctx->submit_state;
@@ -7936,8 +7896,6 @@ static int io_uring_alloc_task_context(struct task_struct *task,
 	tctx->last = NULL;
 	atomic_set(&tctx->in_idle, 0);
 	tctx->sqpoll = false;
-	io_init_identity(&tctx->__identity);
-	tctx->identity = &tctx->__identity;
 	task->io_uring = tctx;
 	spin_lock_init(&tctx->task_lock);
 	INIT_WQ_LIST(&tctx->task_list);
@@ -7951,9 +7909,6 @@ void __io_uring_free(struct task_struct *tsk)
 	struct io_uring_task *tctx = tsk->io_uring;
 
 	WARN_ON_ONCE(!xa_empty(&tctx->xa));
-	WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
-	if (tctx->identity != &tctx->__identity)
-		kfree(tctx->identity);
 	percpu_counter_destroy(&tctx->inflight);
 	kfree(tctx);
 	tsk->io_uring = NULL;
@@ -8593,13 +8548,11 @@ static int io_uring_fasync(int fd, struct file *file, int on)
 
 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
 {
-	struct io_identity *iod;
+	const struct cred *creds;
 
-	iod = idr_remove(&ctx->personality_idr, id);
-	if (iod) {
-		put_cred(iod->creds);
-		if (refcount_dec_and_test(&iod->count))
-			kfree(iod);
+	creds = idr_remove(&ctx->personality_idr, id);
+	if (creds) {
+		put_cred(creds);
 		return 0;
 	}
 
@@ -9300,8 +9253,7 @@ out_fput:
 #ifdef CONFIG_PROC_FS
 static int io_uring_show_cred(int id, void *p, void *data)
 {
-	struct io_identity *iod = p;
-	const struct cred *cred = iod->creds;
+	const struct cred *cred = p;
 	struct seq_file *m = data;
 	struct user_namespace *uns = seq_user_ns(m);
 	struct group_info *gi;
@@ -9732,21 +9684,15 @@ out:
 
 static int io_register_personality(struct io_ring_ctx *ctx)
 {
-	struct io_identity *id;
+	const struct cred *creds;
 	int ret;
 
-	id = kmalloc(sizeof(*id), GFP_KERNEL);
-	if (unlikely(!id))
-		return -ENOMEM;
-
-	io_init_identity(id);
-	id->creds = get_current_cred();
+	creds = get_current_cred();
 
-	ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
-	if (ret < 0) {
-		put_cred(id->creds);
-		kfree(id);
-	}
+	ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
+				USHRT_MAX, GFP_KERNEL);
+	if (ret < 0)
+		put_cred(creds);
 	return ret;
 }
 
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 0e95398998b6..c48fcbdc2ea8 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -5,23 +5,6 @@
 #include <linux/sched.h>
 #include <linux/xarray.h>
 
-struct io_identity {
-	struct files_struct		*files;
-	struct mm_struct		*mm;
-#ifdef CONFIG_BLK_CGROUP
-	struct cgroup_subsys_state	*blkcg_css;
-#endif
-	const struct cred		*creds;
-	struct nsproxy			*nsproxy;
-	struct fs_struct		*fs;
-	unsigned long			fsize;
-#ifdef CONFIG_AUDIT
-	kuid_t				loginuid;
-	unsigned int			sessionid;
-#endif
-	refcount_t			count;
-};
-
 struct io_wq_work_node {
 	struct io_wq_work_node *next;
 };
@@ -38,8 +21,6 @@ struct io_uring_task {
 	struct file		*last;
 	void			*io_wq;
 	struct percpu_counter	inflight;
-	struct io_identity	__identity;
-	struct io_identity	*identity;
 	atomic_t		in_idle;
 	bool			sqpoll;
 
-- 
cgit v1.2.3


From 2596b6ae412be3d29632efc63976a2132032e620 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@soleen.com>
Date: Fri, 19 Feb 2021 14:51:42 -0500
Subject: kexec: move machine_kexec_post_load() to public interface

The kernel test robot reports the following compiler warning:

  | arch/arm64/kernel/machine_kexec.c:62:5: warning: no previous prototype for
  | function 'machine_kexec_post_load' [-Wmissing-prototypes]
  |    int machine_kexec_post_load(struct kimage *kimage)

Fix it by moving the declaration of machine_kexec_post_load() from
kexec_internal.h to the public header instead.

Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/linux-arm-kernel/202102030727.gqTokACH-lkp@intel.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Link: https://lore.kernel.org/r/20210219195142.13571-1-pasha.tatashin@soleen.com
Fixes: 4c3c31230c91 ("arm64: kexec: move relocation function setup")
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/kexec.h   | 2 ++
 kernel/kexec_internal.h | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 9e93bef52968..3671b845cf28 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -309,6 +309,8 @@ extern void machine_kexec_cleanup(struct kimage *image);
 extern int kernel_kexec(void);
 extern struct page *kimage_alloc_control_pages(struct kimage *image,
 						unsigned int order);
+int machine_kexec_post_load(struct kimage *image);
+
 extern void __crash_kexec(struct pt_regs *);
 extern void crash_kexec(struct pt_regs *);
 int kexec_should_crash(struct task_struct *);
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 39d30ccf8d87..48aaf2ac0d0d 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -13,8 +13,6 @@ void kimage_terminate(struct kimage *image);
 int kimage_is_destination_range(struct kimage *image,
 				unsigned long start, unsigned long end);
 
-int machine_kexec_post_load(struct kimage *image);
-
 extern struct mutex kexec_mutex;
 
 #ifdef CONFIG_KEXEC_FILE
-- 
cgit v1.2.3


From 9fb407179c6fd910005040bebb040094ef959b6c Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Sun, 21 Feb 2021 18:28:05 -0800
Subject: block: Remove unused blk_pm_*() function definitions

Commit a1ce35fa4985 ("block: remove dead elevator code") removed the last
callers of blk_pm_requeue_request(), blk_pm_add_request() and
blk_pm_put_request(). Hence remove the definitions of these functions.
Removing these functions removes all users of the struct request nr_pending
member. Hence also remove 'nr_pending'. Note: 'nr_pending' is no longer
used since commit 7cedffec8e75 ("block: Make blk_get_request() block for
non-PM requests while suspended").

Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-pm.h         | 38 --------------------------------------
 include/linux/blkdev.h |  1 -
 2 files changed, 39 deletions(-)

(limited to 'include')

diff --git a/block/blk-pm.h b/block/blk-pm.h
index a2283cc9f716..8a5a0d4b357f 100644
--- a/block/blk-pm.h
+++ b/block/blk-pm.h
@@ -21,31 +21,6 @@ static inline void blk_pm_mark_last_busy(struct request *rq)
 	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
 		pm_runtime_mark_last_busy(rq->q->dev);
 }
-
-static inline void blk_pm_requeue_request(struct request *rq)
-{
-	lockdep_assert_held(&rq->q->queue_lock);
-
-	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
-		rq->q->nr_pending--;
-}
-
-static inline void blk_pm_add_request(struct request_queue *q,
-				      struct request *rq)
-{
-	lockdep_assert_held(&q->queue_lock);
-
-	if (q->dev && !(rq->rq_flags & RQF_PM))
-		q->nr_pending++;
-}
-
-static inline void blk_pm_put_request(struct request *rq)
-{
-	lockdep_assert_held(&rq->q->queue_lock);
-
-	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
-		--rq->q->nr_pending;
-}
 #else
 static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
 {
@@ -55,19 +30,6 @@ static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
 static inline void blk_pm_mark_last_busy(struct request *rq)
 {
 }
-
-static inline void blk_pm_requeue_request(struct request *rq)
-{
-}
-
-static inline void blk_pm_add_request(struct request_queue *q,
-				      struct request *rq)
-{
-}
-
-static inline void blk_pm_put_request(struct request *rq)
-{
-}
 #endif
 
 #endif /* _BLOCK_BLK_PM_H_ */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9149f4a5adb3..f2e0697258b8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -462,7 +462,6 @@ struct request_queue {
 #ifdef CONFIG_PM
 	struct device		*dev;
 	enum rpm_status		rpm_status;
-	unsigned int		nr_pending;
 #endif
 
 	/*
-- 
cgit v1.2.3


From 179d1600723670dc0d6ae8ce572e0e2c44b64763 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Sun, 21 Feb 2021 21:29:55 -0800
Subject: block: remove superfluous param in blk_fill_rwbs()

The last parameter for the function blk_fill_rwbs() was added in
5782138e47 ("tracing/events: convert block trace points to
TRACE_EVENT()") in order to signal read request and use of that parameter
was replaced with using switch case REQ_OP_READ with
1b9a9ab78b0 ("blktrace: use op accessors"), but the parameter was never
removed.

Remove the unused parameter and adjust the respective call sites.

Fixes: 1b9a9ab78b0 ("blktrace: use op accessors")
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blktrace_api.h  |  2 +-
 include/trace/events/bcache.h | 10 +++++-----
 include/trace/events/block.h  | 16 ++++++++--------
 kernel/trace/blktrace.c       |  2 +-
 4 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 05556573b896..11484f1d19a1 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -119,7 +119,7 @@ struct compat_blk_user_trace_setup {
 
 #endif
 
-extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes);
+extern void blk_fill_rwbs(char *rwbs, unsigned int op);
 
 static inline sector_t blk_rq_trace_sector(struct request *rq)
 {
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index e41c611d6d3b..899fdacf57b9 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(bcache_request,
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->orig_sector	= bio->bi_iter.bi_sector - 16;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 	),
 
 	TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)",
@@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(bcache_bio,
 		__entry->dev		= bio_dev(bio);
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 	),
 
 	TP_printk("%d,%d  %s %llu + %u",
@@ -137,7 +137,7 @@ TRACE_EVENT(bcache_read,
 		__entry->dev		= bio_dev(bio);
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 		__entry->cache_hit = hit;
 		__entry->bypass = bypass;
 	),
@@ -168,7 +168,7 @@ TRACE_EVENT(bcache_write,
 		__entry->inode		= inode;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 		__entry->writeback = writeback;
 		__entry->bypass = bypass;
 	),
@@ -238,7 +238,7 @@ TRACE_EVENT(bcache_journal_write,
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
 		__entry->nr_keys	= keys;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 	),
 
 	TP_printk("%d,%d  %s %llu + %u keys %u",
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 0d782663a005..879cba8bdfca 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -89,7 +89,7 @@ TRACE_EVENT(block_rq_requeue,
 		__entry->sector    = blk_rq_trace_sector(rq);
 		__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
 
-		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
 		__get_str(cmd)[0] = '\0';
 	),
 
@@ -133,7 +133,7 @@ TRACE_EVENT(block_rq_complete,
 		__entry->nr_sector = nr_bytes >> 9;
 		__entry->error     = error;
 
-		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
 		__get_str(cmd)[0] = '\0';
 	),
 
@@ -166,7 +166,7 @@ DECLARE_EVENT_CLASS(block_rq,
 		__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
 		__entry->bytes     = blk_rq_bytes(rq);
 
-		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
 		__get_str(cmd)[0] = '\0';
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
@@ -249,7 +249,7 @@ TRACE_EVENT(block_bio_complete,
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
 		__entry->error		= blk_status_to_errno(bio->bi_status);
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 	),
 
 	TP_printk("%d,%d %s %llu + %u [%d]",
@@ -276,7 +276,7 @@ DECLARE_EVENT_CLASS(block_bio,
 		__entry->dev		= bio_dev(bio);
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -433,7 +433,7 @@ TRACE_EVENT(block_split,
 		__entry->dev		= bio_dev(bio);
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->new_sector	= new_sector;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -474,7 +474,7 @@ TRACE_EVENT(block_bio_remap,
 		__entry->nr_sector	= bio_sectors(bio);
 		__entry->old_dev	= dev;
 		__entry->old_sector	= from;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 	),
 
 	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
@@ -518,7 +518,7 @@ TRACE_EVENT(block_rq_remap,
 		__entry->old_dev	= dev;
 		__entry->old_sector	= from;
 		__entry->nr_bios	= blk_rq_count_bios(rq);
-		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
 	),
 
 	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9e9ee4945043..8a2591c7aa41 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1867,7 +1867,7 @@ void blk_trace_remove_sysfs(struct device *dev)
 
 #ifdef CONFIG_EVENT_TRACING
 
-void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
+void blk_fill_rwbs(char *rwbs, unsigned int op)
 {
 	int i = 0;
 
-- 
cgit v1.2.3


From 1f83bb4b491472310ae7aeca505ed3725149906c Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Sun, 21 Feb 2021 21:29:56 -0800
Subject: blktrace: add blk_fill_rwbs documentation comment

blk_fill_rwbs() is an expoted function, add kernel style documentation
comment.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blktrace_api.h |  2 +-
 kernel/trace/blktrace.c      | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 11484f1d19a1..e17d04abf6a3 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -119,7 +119,7 @@ struct compat_blk_user_trace_setup {
 
 #endif
 
-extern void blk_fill_rwbs(char *rwbs, unsigned int op);
+void blk_fill_rwbs(char *rwbs, unsigned int op);
 
 static inline sector_t blk_rq_trace_sector(struct request *rq)
 {
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8a2591c7aa41..d7ebef83771c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1867,6 +1867,16 @@ void blk_trace_remove_sysfs(struct device *dev)
 
 #ifdef CONFIG_EVENT_TRACING
 
+/**
+ * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string.
+ * @rwbs	buffer to be filled
+ * @op:		REQ_OP_XXX for the tracepoint
+ *
+ * Description:
+ *     Maps the REQ_OP_XXX to character and fills the buffer provided by the
+ *     caller with resulting string.
+ *
+ **/
 void blk_fill_rwbs(char *rwbs, unsigned int op)
 {
 	int i = 0;
-- 
cgit v1.2.3


From c7ff651960a6ef11cef55479658aff504c34872f Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Sun, 21 Feb 2021 21:29:57 -0800
Subject: blktrace: fix blk_rq_issue documentation

The commit 881245dcff29 ("Add DocBook documentation for the block tracepoints.")
added the comment for blk_rq_issue() tracepoint. Remove the duplicate
word from the tracepoint documentation.

Fixes: 881245dcff29 ("Add DocBook documentation for the block tracepoints.")
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/block.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 879cba8bdfca..004cfe34ef37 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -196,7 +196,7 @@ DEFINE_EVENT(block_rq, block_rq_insert,
 
 /**
  * block_rq_issue - issue pending block IO request operation to device driver
- * @rq: block IO operation operation request
+ * @rq: block IO operation request
  *
  * Called when block operation request @rq from queue @q is sent to a
  * device driver for processing.
-- 
cgit v1.2.3


From b0719245098c27b36a9b52969af0300ae6219591 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Sun, 21 Feb 2021 21:29:58 -0800
Subject: blktrace: fix blk_rq_merge documentation

The commit f3bdc62fd82e ("blktrace: Provide event for request merging")
added the comment for blk_rq_merge() tracepoint. Remove the duplicate
word from the tracepoint documentation.

Fixes: f3bdc62fd82e ("blktrace: Provide event for request merging")
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/block.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 004cfe34ef37..cc5ab96a7471 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -210,7 +210,7 @@ DEFINE_EVENT(block_rq, block_rq_issue,
 
 /**
  * block_rq_merge - merge request with another one in the elevator
- * @rq: block IO operation operation request
+ * @rq: block IO operation request
  *
  * Called when block operation request @rq from queue @q is merged to another
  * request queued in the elevator.
-- 
cgit v1.2.3


From 4a42d848db9544e3108875390886dc490d9c101e Mon Sep 17 00:00:00 2001
From: David Stevens <stevensd@chromium.org>
Date: Mon, 22 Feb 2021 11:45:22 +0900
Subject: KVM: x86/mmu: Consider the hva in mmu_notifier retry

Track the range being invalidated by mmu_notifier and skip page fault
retries if the fault address is not affected by the in-progress
invalidation. Handle concurrent invalidations by finding the minimal
range which includes all ranges being invalidated. Although the combined
range may include unrelated addresses and cannot be shrunk as individual
invalidation operations complete, it is unlikely the marginal gains of
proper range tracking are worth the additional complexity.

The primary benefit of this change is the reduction in the likelihood of
extreme latency when handing a page fault due to another thread having
been preempted while modifying host virtual addresses.

Signed-off-by: David Stevens <stevensd@chromium.org>
Message-Id: <20210222024522.1751719-3-stevensd@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c    |  2 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c |  2 +-
 arch/x86/kvm/mmu/mmu.c                 | 23 +++++++++++++++++------
 arch/x86/kvm/mmu/paging_tmpl.h         | 14 +++++++++++---
 include/linux/kvm_host.h               | 25 ++++++++++++++++++++++++-
 virt/kvm/kvm_main.c                    | 29 +++++++++++++++++++++++++----
 6 files changed, 79 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 38ea396a23d6..8e06cd3f759c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -590,7 +590,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
 	} else {
 		/* Call KVM generic code to do the slow-path check */
 		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-					   writing, &write_ok);
+					   writing, &write_ok, NULL);
 		if (is_error_noslot_pfn(pfn))
 			return -EFAULT;
 		page = NULL;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index bb35490400e9..e603de7ade52 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -822,7 +822,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 
 		/* Call KVM generic code to do the slow-path check */
 		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-					   writing, upgrade_p);
+					   writing, upgrade_p, NULL);
 		if (is_error_noslot_pfn(pfn))
 			return -EFAULT;
 		page = NULL;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 752b4b7ab01b..d75524bc8423 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2734,6 +2734,13 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	if (sp->role.level > PG_LEVEL_4K)
 		return;
 
+	/*
+	 * If addresses are being invalidated, skip prefetching to avoid
+	 * accidentally prefetching those addresses.
+	 */
+	if (unlikely(vcpu->kvm->mmu_notifier_count))
+		return;
+
 	__direct_pte_prefetch(vcpu, sp, sptep);
 }
 
@@ -3640,8 +3647,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
-			 bool *writable)
+			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
+			 bool write, bool *writable)
 {
 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 	bool async;
@@ -3654,7 +3661,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 	}
 
 	async = false;
-	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
+	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
+				    write, writable, hva);
 	if (!async)
 		return false; /* *pfn has correct page already */
 
@@ -3668,7 +3676,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			return true;
 	}
 
-	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
+	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
+				    write, writable, hva);
 	return false;
 }
 
@@ -3681,6 +3690,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	kvm_pfn_t pfn;
+	hva_t hva;
 	int r;
 
 	if (page_fault_handle_page_track(vcpu, error_code, gfn))
@@ -3699,7 +3709,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
+			 write, &map_writable))
 		return RET_PF_RETRY;
 
 	if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
@@ -3712,7 +3723,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	else
 		write_lock(&vcpu->kvm->mmu_lock);
 
-	if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq))
+	if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
 		goto out_unlock;
 	r = make_mmu_pages_available(vcpu);
 	if (r)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 5844d3979bb8..55d7b473ac44 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -601,6 +601,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 	if (sp->role.level > PG_LEVEL_4K)
 		return;
 
+	/*
+	 * If addresses are being invalidated, skip prefetching to avoid
+	 * accidentally prefetching those addresses.
+	 */
+	if (unlikely(vcpu->kvm->mmu_notifier_count))
+		return;
+
 	if (sp->role.direct)
 		return __direct_pte_prefetch(vcpu, sp, sptep);
 
@@ -790,6 +797,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	struct guest_walker walker;
 	int r;
 	kvm_pfn_t pfn;
+	hva_t hva;
 	unsigned long mmu_seq;
 	bool map_writable, is_self_change_mapping;
 	int max_level;
@@ -840,8 +848,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
-			 &map_writable))
+	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
+			 write_fault, &map_writable))
 		return RET_PF_RETRY;
 
 	if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
@@ -869,7 +877,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 
 	r = RET_PF_RETRY;
 	write_lock(&vcpu->kvm->mmu_lock);
-	if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq))
+	if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
 		goto out_unlock;
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e126ebda36d0..1b65e7204344 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -11,6 +11,7 @@
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/bug.h>
+#include <linux/minmax.h>
 #include <linux/mm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/preempt.h>
@@ -506,6 +507,8 @@ struct kvm {
 	struct mmu_notifier mmu_notifier;
 	unsigned long mmu_notifier_seq;
 	long mmu_notifier_count;
+	unsigned long mmu_notifier_range_start;
+	unsigned long mmu_notifier_range_end;
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
@@ -733,7 +736,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool atomic, bool *async, bool write_fault,
-			       bool *writable);
+			       bool *writable, hva_t *hva);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn);
 void kvm_release_pfn_dirty(kvm_pfn_t pfn);
@@ -1207,6 +1210,26 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
 		return 1;
 	return 0;
 }
+
+static inline int mmu_notifier_retry_hva(struct kvm *kvm,
+					 unsigned long mmu_seq,
+					 unsigned long hva)
+{
+	lockdep_assert_held(&kvm->mmu_lock);
+	/*
+	 * If mmu_notifier_count is non-zero, then the range maintained by
+	 * kvm_mmu_notifier_invalidate_range_start contains all addresses that
+	 * might be being invalidated. Note that it may include some false
+	 * positives, due to shortcuts when handing concurrent invalidations.
+	 */
+	if (unlikely(kvm->mmu_notifier_count) &&
+	    hva >= kvm->mmu_notifier_range_start &&
+	    hva < kvm->mmu_notifier_range_end)
+		return 1;
+	if (kvm->mmu_notifier_seq != mmu_seq)
+		return 1;
+	return 0;
+}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 001b9de4e727..383df23514b9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -486,6 +486,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 * count is also read inside the mmu_lock critical section.
 	 */
 	kvm->mmu_notifier_count++;
+	if (likely(kvm->mmu_notifier_count == 1)) {
+		kvm->mmu_notifier_range_start = range->start;
+		kvm->mmu_notifier_range_end = range->end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has dimishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_notifier_range_start =
+			min(kvm->mmu_notifier_range_start, range->start);
+		kvm->mmu_notifier_range_end =
+			max(kvm->mmu_notifier_range_end, range->end);
+	}
 	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
 					     range->flags);
 	/* we've to flush the tlb before the pages can be freed */
@@ -2023,10 +2041,13 @@ exit:
 
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool atomic, bool *async, bool write_fault,
-			       bool *writable)
+			       bool *writable, hva_t *hva)
 {
 	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
 
+	if (hva)
+		*hva = addr;
+
 	if (addr == KVM_HVA_ERR_RO_BAD) {
 		if (writable)
 			*writable = false;
@@ -2054,19 +2075,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable)
 {
 	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
-				    write_fault, writable);
+				    write_fault, writable, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
 
-- 
cgit v1.2.3


From c6ca7616f7d5c2ce166280107ba74db1d528fcb7 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 10 Feb 2021 14:02:14 +0900
Subject: clk: Add RISC-V Canaan Kendryte K210 clock driver

Add a clock provider driver for the Canaan Kendryte K210 RISC-V SoC.
This new driver with the compatible string "canaan,k210-clk" implements
support for the full clock structure of the K210 SoC. Since it is
required for the correct operation of the SoC, this driver is
selected by default for compilation when the SOC_CANAAN option is
selected.

With this change, the k210-sysctl driver is turned into a simple
platform driver which enables its power bus clock and triggers
populating its child nodes. The sysctl driver retains the SOC early
initialization code, but the implementation now relies on the new
function k210_clk_early_init() provided by the new clk-k210 driver.

The clock structure implemented and many of the coding ideas for the
driver come from the work by Sean Anderson on the K210 support for the
U-Boot project.

Cc: Stephen Boyd <sboyd@kernel.org>
Cc: Michael Turquette <mturquette@baylibre.com>
Cc: linux-clk@vger.kernel.org
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 drivers/clk/Kconfig              |    7 +
 drivers/clk/Makefile             |    1 +
 drivers/clk/clk-k210.c           | 1007 ++++++++++++++++++++++++++++++++++++++
 drivers/soc/canaan/Kconfig       |   18 +-
 drivers/soc/canaan/Makefile      |    2 +-
 drivers/soc/canaan/k210-sysctl.c |  205 ++------
 include/soc/canaan/k210-sysctl.h |    2 +
 7 files changed, 1064 insertions(+), 178 deletions(-)
 create mode 100644 drivers/clk/clk-k210.c

(limited to 'include')

diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig
index 85856cff506c..78f6288d5226 100644
--- a/drivers/clk/Kconfig
+++ b/drivers/clk/Kconfig
@@ -368,6 +368,13 @@ config COMMON_CLK_FIXED_MMIO
 	help
 	  Support for Memory Mapped IO Fixed clocks
 
+config COMMON_CLK_K210
+	bool "Clock driver for the Canaan Kendryte K210 SoC"
+	depends on OF && RISCV && SOC_CANAAN
+	default SOC_CANAAN
+	help
+	  Support for the Canaan Kendryte K210 RISC-V SoC clocks.
+
 source "drivers/clk/actions/Kconfig"
 source "drivers/clk/analogbits/Kconfig"
 source "drivers/clk/baikal-t1/Kconfig"
diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile
index dbdc590e7de3..5f8f9f135df5 100644
--- a/drivers/clk/Makefile
+++ b/drivers/clk/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_COMMON_CLK_ASPEED)		+= clk-aspeed.o
 obj-$(CONFIG_MACH_ASPEED_G6)		+= clk-ast2600.o
 obj-$(CONFIG_ARCH_HIGHBANK)		+= clk-highbank.o
 obj-$(CONFIG_CLK_HSDK)			+= clk-hsdk-pll.o
+obj-$(CONFIG_COMMON_CLK_K210)		+= clk-k210.o
 obj-$(CONFIG_COMMON_CLK_LOCHNAGAR)	+= clk-lochnagar.o
 obj-$(CONFIG_COMMON_CLK_MAX77686)	+= clk-max77686.o
 obj-$(CONFIG_COMMON_CLK_MAX9485)	+= clk-max9485.o
diff --git a/drivers/clk/clk-k210.c b/drivers/clk/clk-k210.c
new file mode 100644
index 000000000000..6c84abf5b2e3
--- /dev/null
+++ b/drivers/clk/clk-k210.c
@@ -0,0 +1,1007 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019-20 Sean Anderson <seanga2@gmail.com>
+ * Copyright (c) 2019 Western Digital Corporation or its affiliates.
+ */
+#define pr_fmt(fmt)     "k210-clk: " fmt
+
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/platform_device.h>
+#include <linux/of.h>
+#include <linux/of_clk.h>
+#include <linux/of_platform.h>
+#include <linux/of_address.h>
+#include <linux/clk-provider.h>
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <soc/canaan/k210-sysctl.h>
+
+#include <dt-bindings/clock/k210-clk.h>
+
+struct k210_sysclk;
+
+struct k210_clk {
+	int id;
+	struct k210_sysclk *ksc;
+	struct clk_hw hw;
+};
+
+struct k210_clk_cfg {
+	const char *name;
+	u8 gate_reg;
+	u8 gate_bit;
+	u8 div_reg;
+	u8 div_shift;
+	u8 div_width;
+	u8 div_type;
+	u8 mux_reg;
+	u8 mux_bit;
+};
+
+enum k210_clk_div_type {
+	K210_DIV_NONE,
+	K210_DIV_ONE_BASED,
+	K210_DIV_DOUBLE_ONE_BASED,
+	K210_DIV_POWER_OF_TWO,
+};
+
+#define K210_GATE(_reg, _bit)	\
+	.gate_reg = (_reg),	\
+	.gate_bit = (_bit)
+
+#define K210_DIV(_reg, _shift, _width, _type)	\
+	.div_reg = (_reg),			\
+	.div_shift = (_shift),			\
+	.div_width = (_width),			\
+	.div_type = (_type)
+
+#define K210_MUX(_reg, _bit)	\
+	.mux_reg = (_reg),	\
+	.mux_bit = (_bit)
+
+static struct k210_clk_cfg k210_clk_cfgs[K210_NUM_CLKS] = {
+	/* Gated clocks, no mux, no divider */
+	[K210_CLK_CPU] = {
+		.name = "cpu",
+		K210_GATE(K210_SYSCTL_EN_CENT, 0)
+	},
+	[K210_CLK_DMA] = {
+		.name = "dma",
+		K210_GATE(K210_SYSCTL_EN_PERI, 1)
+	},
+	[K210_CLK_FFT] = {
+		.name = "fft",
+		K210_GATE(K210_SYSCTL_EN_PERI, 4)
+	},
+	[K210_CLK_GPIO] = {
+		.name = "gpio",
+		K210_GATE(K210_SYSCTL_EN_PERI, 5)
+	},
+	[K210_CLK_UART1] = {
+		.name = "uart1",
+		K210_GATE(K210_SYSCTL_EN_PERI, 16)
+	},
+	[K210_CLK_UART2] = {
+		.name = "uart2",
+		K210_GATE(K210_SYSCTL_EN_PERI, 17)
+	},
+	[K210_CLK_UART3] = {
+		.name = "uart3",
+		K210_GATE(K210_SYSCTL_EN_PERI, 18)
+	},
+	[K210_CLK_FPIOA] = {
+		.name = "fpioa",
+		K210_GATE(K210_SYSCTL_EN_PERI, 20)
+	},
+	[K210_CLK_SHA] = {
+		.name = "sha",
+		K210_GATE(K210_SYSCTL_EN_PERI, 26)
+	},
+	[K210_CLK_AES] = {
+		.name = "aes",
+		K210_GATE(K210_SYSCTL_EN_PERI, 19)
+	},
+	[K210_CLK_OTP] = {
+		.name = "otp",
+		K210_GATE(K210_SYSCTL_EN_PERI, 27)
+	},
+	[K210_CLK_RTC] = {
+		.name = "rtc",
+		K210_GATE(K210_SYSCTL_EN_PERI, 29)
+	},
+
+	/* Gated divider clocks */
+	[K210_CLK_SRAM0] = {
+		.name = "sram0",
+		K210_GATE(K210_SYSCTL_EN_CENT, 1),
+		K210_DIV(K210_SYSCTL_THR0, 0, 4, K210_DIV_ONE_BASED)
+	},
+	[K210_CLK_SRAM1] = {
+		.name = "sram1",
+		K210_GATE(K210_SYSCTL_EN_CENT, 2),
+		K210_DIV(K210_SYSCTL_THR0, 4, 4, K210_DIV_ONE_BASED)
+	},
+	[K210_CLK_ROM] = {
+		.name = "rom",
+		K210_GATE(K210_SYSCTL_EN_PERI, 0),
+		K210_DIV(K210_SYSCTL_THR0, 16, 4, K210_DIV_ONE_BASED)
+	},
+	[K210_CLK_DVP] = {
+		.name = "dvp",
+		K210_GATE(K210_SYSCTL_EN_PERI, 3),
+		K210_DIV(K210_SYSCTL_THR0, 12, 4, K210_DIV_ONE_BASED)
+	},
+	[K210_CLK_APB0] = {
+		.name = "apb0",
+		K210_GATE(K210_SYSCTL_EN_CENT, 3),
+		K210_DIV(K210_SYSCTL_SEL0, 3, 3, K210_DIV_ONE_BASED)
+	},
+	[K210_CLK_APB1] = {
+		.name = "apb1",
+		K210_GATE(K210_SYSCTL_EN_CENT, 4),
+		K210_DIV(K210_SYSCTL_SEL0, 6, 3, K210_DIV_ONE_BASED)
+	},
+	[K210_CLK_APB2] = {
+		.name = "apb2",
+		K210_GATE(K210_SYSCTL_EN_CENT, 5),
+		K210_DIV(K210_SYSCTL_SEL0, 9, 3, K210_DIV_ONE_BASED)
+	},
+	[K210_CLK_AI] = {
+		.name = "ai",
+		K210_GATE(K210_SYSCTL_EN_PERI, 2),
+		K210_DIV(K210_SYSCTL_THR0, 8, 4, K210_DIV_ONE_BASED)
+	},
+	[K210_CLK_SPI0] = {
+		.name = "spi0",
+		K210_GATE(K210_SYSCTL_EN_PERI, 6),
+		K210_DIV(K210_SYSCTL_THR1, 0, 8, K210_DIV_DOUBLE_ONE_BASED)
+	},
+	[K210_CLK_SPI1] = {
+		.name = "spi1",
+		K210_GATE(K210_SYSCTL_EN_PERI, 7),
+		K210_DIV(K210_SYSCTL_THR1, 8, 8, K210_DIV_DOUBLE_ONE_BASED)
+	},
+	[K210_CLK_SPI2] = {
+		.name = "spi2",
+		K210_GATE(K210_SYSCTL_EN_PERI, 8),
+		K210_DIV(K210_SYSCTL_THR1, 16, 8, K210_DIV_DOUBLE_ONE_BASED)
+	},
+	[K210_CLK_I2C0] = {
+		.name = "i2c0",
+		K210_GATE(K210_SYSCTL_EN_PERI, 13),
+		K210_DIV(K210_SYSCTL_THR5, 8, 8, K210_DIV_DOUBLE_ONE_BASED)
+	},
+	[K210_CLK_I2C1] = {
+		.name = "i2c1",
+		K210_GATE(K210_SYSCTL_EN_PERI, 14),
+		K210_DIV(K210_SYSCTL_THR5, 16, 8, K210_DIV_DOUBLE_ONE_BASED)
+	},
+	[K210_CLK_I2C2] = {
+		.name = "i2c2",
+		K210_GATE(K210_SYSCTL_EN_PERI, 15),
+		K210_DIV(K210_SYSCTL_THR5, 24, 8, K210_DIV_DOUBLE_ONE_BASED)
+	},
+	[K210_CLK_WDT0] = {
+		.name = "wdt0",
+		K210_GATE(K210_SYSCTL_EN_PERI, 24),
+		K210_DIV(K210_SYSCTL_THR6, 0, 8, K210_DIV_DOUBLE_ONE_BASED)
+	},
+	[K210_CLK_WDT1] = {
+		.name = "wdt1",
+		K210_GATE(K210_SYSCTL_EN_PERI, 25),
+		K210_DIV(K210_SYSCTL_THR6, 8, 8, K210_DIV_DOUBLE_ONE_BASED)
+	},
+	[K210_CLK_I2S0] = {
+		.name = "i2s0",
+		K210_GATE(K210_SYSCTL_EN_PERI, 10),
+		K210_DIV(K210_SYSCTL_THR3, 0, 16, K210_DIV_DOUBLE_ONE_BASED)
+	},
+	[K210_CLK_I2S1] = {
+		.name = "i2s1",
+		K210_GATE(K210_SYSCTL_EN_PERI, 11),
+		K210_DIV(K210_SYSCTL_THR3, 16, 16, K210_DIV_DOUBLE_ONE_BASED)
+	},
+	[K210_CLK_I2S2] = {
+		.name = "i2s2",
+		K210_GATE(K210_SYSCTL_EN_PERI, 12),
+		K210_DIV(K210_SYSCTL_THR4, 0, 16, K210_DIV_DOUBLE_ONE_BASED)
+	},
+
+	/* Divider clocks, no gate, no mux */
+	[K210_CLK_I2S0_M] = {
+		.name = "i2s0_m",
+		K210_DIV(K210_SYSCTL_THR4, 16, 8, K210_DIV_DOUBLE_ONE_BASED)
+	},
+	[K210_CLK_I2S1_M] = {
+		.name = "i2s1_m",
+		K210_DIV(K210_SYSCTL_THR4, 24, 8, K210_DIV_DOUBLE_ONE_BASED)
+	},
+	[K210_CLK_I2S2_M] = {
+		.name = "i2s2_m",
+		K210_DIV(K210_SYSCTL_THR4, 0, 8, K210_DIV_DOUBLE_ONE_BASED)
+	},
+
+	/* Muxed gated divider clocks */
+	[K210_CLK_SPI3] = {
+		.name = "spi3",
+		K210_GATE(K210_SYSCTL_EN_PERI, 9),
+		K210_DIV(K210_SYSCTL_THR1, 24, 8, K210_DIV_DOUBLE_ONE_BASED),
+		K210_MUX(K210_SYSCTL_SEL0, 12)
+	},
+	[K210_CLK_TIMER0] = {
+		.name = "timer0",
+		K210_GATE(K210_SYSCTL_EN_PERI, 21),
+		K210_DIV(K210_SYSCTL_THR2,  0, 8, K210_DIV_DOUBLE_ONE_BASED),
+		K210_MUX(K210_SYSCTL_SEL0, 13)
+	},
+	[K210_CLK_TIMER1] = {
+		.name = "timer1",
+		K210_GATE(K210_SYSCTL_EN_PERI, 22),
+		K210_DIV(K210_SYSCTL_THR2, 8, 8, K210_DIV_DOUBLE_ONE_BASED),
+		K210_MUX(K210_SYSCTL_SEL0, 14)
+	},
+	[K210_CLK_TIMER2] = {
+		.name = "timer2",
+		K210_GATE(K210_SYSCTL_EN_PERI, 23),
+		K210_DIV(K210_SYSCTL_THR2, 16, 8, K210_DIV_DOUBLE_ONE_BASED),
+		K210_MUX(K210_SYSCTL_SEL0, 15)
+	},
+};
+
+/*
+ * PLL control register bits.
+ */
+#define K210_PLL_CLKR		GENMASK(3, 0)
+#define K210_PLL_CLKF		GENMASK(9, 4)
+#define K210_PLL_CLKOD		GENMASK(13, 10)
+#define K210_PLL_BWADJ		GENMASK(19, 14)
+#define K210_PLL_RESET		(1 << 20)
+#define K210_PLL_PWRD		(1 << 21)
+#define K210_PLL_INTFB		(1 << 22)
+#define K210_PLL_BYPASS		(1 << 23)
+#define K210_PLL_TEST		(1 << 24)
+#define K210_PLL_EN		(1 << 25)
+#define K210_PLL_SEL		GENMASK(27, 26) /* PLL2 only */
+
+/*
+ * PLL lock register bits.
+ */
+#define K210_PLL_LOCK		0
+#define K210_PLL_CLEAR_SLIP	2
+#define K210_PLL_TEST_OUT	3
+
+/*
+ * Clock selector register bits.
+ */
+#define K210_ACLK_SEL		BIT(0)
+#define K210_ACLK_DIV		GENMASK(2, 1)
+
+/*
+ * PLLs.
+ */
+enum k210_pll_id {
+	K210_PLL0, K210_PLL1, K210_PLL2, K210_PLL_NUM
+};
+
+struct k210_pll {
+	enum k210_pll_id id;
+	struct k210_sysclk *ksc;
+	void __iomem *base;
+	void __iomem *reg;
+	void __iomem *lock;
+	u8 lock_shift;
+	u8 lock_width;
+	struct clk_hw hw;
+};
+#define to_k210_pll(_hw)	container_of(_hw, struct k210_pll, hw)
+
+/*
+ * PLLs configuration: by default PLL0 runs at 780 MHz and PLL1 at 299 MHz.
+ * The first 2 SRAM banks depend on ACLK/CPU clock which is by default PLL0
+ * rate divided by 2. Set PLL1 to 390 MHz so that the third SRAM bank has the
+ * same clock as the first 2.
+ */
+struct k210_pll_cfg {
+	u32 reg;
+	u8 lock_shift;
+	u8 lock_width;
+	u32 r;
+	u32 f;
+	u32 od;
+	u32 bwadj;
+};
+
+static struct k210_pll_cfg k210_plls_cfg[] = {
+	{ K210_SYSCTL_PLL0,  0, 2, 0, 59, 1, 59 }, /* 780 MHz */
+	{ K210_SYSCTL_PLL1,  8, 1, 0, 59, 3, 59 }, /* 390 MHz */
+	{ K210_SYSCTL_PLL2, 16, 1, 0, 22, 1, 22 }, /* 299 MHz */
+};
+
+/**
+ * struct k210_sysclk - sysclk driver data
+ * @regs: system controller registers start address
+ * @clk_lock: clock setting spinlock
+ * @plls: SoC PLLs descriptors
+ * @aclk: ACLK clock
+ * @clks: All other clocks
+ */
+struct k210_sysclk {
+	void __iomem			*regs;
+	spinlock_t			clk_lock;
+	struct k210_pll			plls[K210_PLL_NUM];
+	struct clk_hw			aclk;
+	struct k210_clk			clks[K210_NUM_CLKS];
+};
+
+#define to_k210_sysclk(_hw)	container_of(_hw, struct k210_sysclk, aclk)
+
+/*
+ * Set ACLK parent selector: 0 for IN0, 1 for PLL0.
+ */
+static void k210_aclk_set_selector(void __iomem *regs, u8 sel)
+{
+	u32 reg = readl(regs + K210_SYSCTL_SEL0);
+
+	if (sel)
+		reg |= K210_ACLK_SEL;
+	else
+		reg &= K210_ACLK_SEL;
+	writel(reg, regs + K210_SYSCTL_SEL0);
+}
+
+static void k210_init_pll(void __iomem *regs, enum k210_pll_id pllid,
+			  struct k210_pll *pll)
+{
+	pll->id = pllid;
+	pll->reg = regs + k210_plls_cfg[pllid].reg;
+	pll->lock = regs + K210_SYSCTL_PLL_LOCK;
+	pll->lock_shift = k210_plls_cfg[pllid].lock_shift;
+	pll->lock_width = k210_plls_cfg[pllid].lock_width;
+}
+
+static void k210_pll_wait_for_lock(struct k210_pll *pll)
+{
+	u32 reg, mask = GENMASK(pll->lock_shift + pll->lock_width - 1,
+				pll->lock_shift);
+
+	while (true) {
+		reg = readl(pll->lock);
+		if ((reg & mask) == mask)
+			break;
+
+		reg |= BIT(pll->lock_shift + K210_PLL_CLEAR_SLIP);
+		writel(reg, pll->lock);
+	}
+}
+
+static bool k210_pll_hw_is_enabled(struct k210_pll *pll)
+{
+	u32 reg = readl(pll->reg);
+	u32 mask = K210_PLL_PWRD | K210_PLL_EN;
+
+	if (reg & K210_PLL_RESET)
+		return false;
+
+	return (reg & mask) == mask;
+}
+
+static void k210_pll_enable_hw(void __iomem *regs, struct k210_pll *pll)
+{
+	struct k210_pll_cfg *pll_cfg = &k210_plls_cfg[pll->id];
+	u32 reg;
+
+	if (k210_pll_hw_is_enabled(pll))
+		return;
+
+	/*
+	 * For PLL0, we need to re-parent ACLK to IN0 to keep the CPU cores and
+	 * SRAM running.
+	 */
+	if (pll->id == K210_PLL0)
+		k210_aclk_set_selector(regs, 0);
+
+	/* Set PLL factors */
+	reg = readl(pll->reg);
+	reg &= ~GENMASK(19, 0);
+	reg |= FIELD_PREP(K210_PLL_CLKR, pll_cfg->r);
+	reg |= FIELD_PREP(K210_PLL_CLKF, pll_cfg->f);
+	reg |= FIELD_PREP(K210_PLL_CLKOD, pll_cfg->od);
+	reg |= FIELD_PREP(K210_PLL_BWADJ, pll_cfg->bwadj);
+	reg |= K210_PLL_PWRD;
+	writel(reg, pll->reg);
+
+	/*
+	 * Reset the PLL: ensure reset is low before asserting it.
+	 * The magic NOPs come from the Kendryte reference SDK.
+	 */
+	reg &= ~K210_PLL_RESET;
+	writel(reg, pll->reg);
+	reg |= K210_PLL_RESET;
+	writel(reg, pll->reg);
+	nop();
+	nop();
+	reg &= ~K210_PLL_RESET;
+	writel(reg, pll->reg);
+
+	k210_pll_wait_for_lock(pll);
+
+	reg &= ~K210_PLL_BYPASS;
+	reg |= K210_PLL_EN;
+	writel(reg, pll->reg);
+
+	if (pll->id == K210_PLL0)
+		k210_aclk_set_selector(regs, 1);
+}
+
+static int k210_pll_enable(struct clk_hw *hw)
+{
+	struct k210_pll *pll = to_k210_pll(hw);
+	struct k210_sysclk *ksc = pll->ksc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ksc->clk_lock, flags);
+
+	k210_pll_enable_hw(ksc->regs, pll);
+
+	spin_unlock_irqrestore(&ksc->clk_lock, flags);
+
+	return 0;
+}
+
+static void k210_pll_disable(struct clk_hw *hw)
+{
+	struct k210_pll *pll = to_k210_pll(hw);
+	struct k210_sysclk *ksc = pll->ksc;
+	unsigned long flags;
+	u32 reg;
+
+	/*
+	 * Bypassing before powering off is important so child clocks do not
+	 * stop working. This is especially important for pll0, the indirect
+	 * parent of the cpu clock.
+	 */
+	spin_lock_irqsave(&ksc->clk_lock, flags);
+	reg = readl(pll->reg);
+	reg |= K210_PLL_BYPASS;
+	writel(reg, pll->reg);
+
+	reg &= ~K210_PLL_PWRD;
+	reg &= ~K210_PLL_EN;
+	writel(reg, pll->reg);
+	spin_unlock_irqrestore(&ksc->clk_lock, flags);
+}
+
+static int k210_pll_is_enabled(struct clk_hw *hw)
+{
+	return k210_pll_hw_is_enabled(to_k210_pll(hw));
+}
+
+static unsigned long k210_pll_get_rate(struct clk_hw *hw,
+				       unsigned long parent_rate)
+{
+	struct k210_pll *pll = to_k210_pll(hw);
+	u32 reg = readl(pll->reg);
+	u32 r, f, od;
+
+	if (reg & K210_PLL_BYPASS)
+		return parent_rate;
+
+	if (!(reg & K210_PLL_PWRD))
+		return 0;
+
+	r = FIELD_GET(K210_PLL_CLKR, reg) + 1;
+	f = FIELD_GET(K210_PLL_CLKF, reg) + 1;
+	od = FIELD_GET(K210_PLL_CLKOD, reg) + 1;
+
+	return (u64)parent_rate * f / (r * od);
+}
+
+static const struct clk_ops k210_pll_ops = {
+	.enable		= k210_pll_enable,
+	.disable	= k210_pll_disable,
+	.is_enabled	= k210_pll_is_enabled,
+	.recalc_rate	= k210_pll_get_rate,
+};
+
+static int k210_pll2_set_parent(struct clk_hw *hw, u8 index)
+{
+	struct k210_pll *pll = to_k210_pll(hw);
+	struct k210_sysclk *ksc = pll->ksc;
+	unsigned long flags;
+	u32 reg;
+
+	spin_lock_irqsave(&ksc->clk_lock, flags);
+
+	reg = readl(pll->reg);
+	reg &= ~K210_PLL_SEL;
+	reg |= FIELD_PREP(K210_PLL_SEL, index);
+	writel(reg, pll->reg);
+
+	spin_unlock_irqrestore(&ksc->clk_lock, flags);
+
+	return 0;
+}
+
+static u8 k210_pll2_get_parent(struct clk_hw *hw)
+{
+	struct k210_pll *pll = to_k210_pll(hw);
+	u32 reg = readl(pll->reg);
+
+	return FIELD_GET(K210_PLL_SEL, reg);
+}
+
+static const struct clk_ops k210_pll2_ops = {
+	.enable		= k210_pll_enable,
+	.disable	= k210_pll_disable,
+	.is_enabled	= k210_pll_is_enabled,
+	.recalc_rate	= k210_pll_get_rate,
+	.set_parent	= k210_pll2_set_parent,
+	.get_parent	= k210_pll2_get_parent,
+};
+
+static int __init k210_register_pll(struct device_node *np,
+				    struct k210_sysclk *ksc,
+				    enum k210_pll_id pllid, const char *name,
+				    int num_parents, const struct clk_ops *ops)
+{
+	struct k210_pll *pll = &ksc->plls[pllid];
+	struct clk_init_data init = {};
+	const struct clk_parent_data parent_data[] = {
+		{ /* .index = 0 for in0 */ },
+		{ .hw = &ksc->plls[K210_PLL0].hw },
+		{ .hw = &ksc->plls[K210_PLL1].hw },
+	};
+
+	init.name = name;
+	init.parent_data = parent_data;
+	init.num_parents = num_parents;
+	init.ops = ops;
+
+	pll->hw.init = &init;
+	pll->ksc = ksc;
+
+	return of_clk_hw_register(np, &pll->hw);
+}
+
+static int __init k210_register_plls(struct device_node *np,
+				     struct k210_sysclk *ksc)
+{
+	int i, ret;
+
+	for (i = 0; i < K210_PLL_NUM; i++)
+		k210_init_pll(ksc->regs, i, &ksc->plls[i]);
+
+	/* PLL0 and PLL1 only have IN0 as parent */
+	ret = k210_register_pll(np, ksc, K210_PLL0, "pll0", 1, &k210_pll_ops);
+	if (ret) {
+		pr_err("%pOFP: register PLL0 failed\n", np);
+		return ret;
+	}
+	ret = k210_register_pll(np, ksc, K210_PLL1, "pll1", 1, &k210_pll_ops);
+	if (ret) {
+		pr_err("%pOFP: register PLL1 failed\n", np);
+		return ret;
+	}
+
+	/* PLL2 has IN0, PLL0 and PLL1 as parents */
+	ret = k210_register_pll(np, ksc, K210_PLL2, "pll2", 3, &k210_pll2_ops);
+	if (ret) {
+		pr_err("%pOFP: register PLL2 failed\n", np);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int k210_aclk_set_parent(struct clk_hw *hw, u8 index)
+{
+	struct k210_sysclk *ksc = to_k210_sysclk(hw);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ksc->clk_lock, flags);
+
+	k210_aclk_set_selector(ksc->regs, index);
+
+	spin_unlock_irqrestore(&ksc->clk_lock, flags);
+
+	return 0;
+}
+
+static u8 k210_aclk_get_parent(struct clk_hw *hw)
+{
+	struct k210_sysclk *ksc = to_k210_sysclk(hw);
+	u32 sel;
+
+	sel = readl(ksc->regs + K210_SYSCTL_SEL0) & K210_ACLK_SEL;
+
+	return sel ? 1 : 0;
+}
+
+static unsigned long k210_aclk_get_rate(struct clk_hw *hw,
+					unsigned long parent_rate)
+{
+	struct k210_sysclk *ksc = to_k210_sysclk(hw);
+	u32 reg = readl(ksc->regs + K210_SYSCTL_SEL0);
+	unsigned int shift;
+
+	if (!(reg & 0x1))
+		return parent_rate;
+
+	shift = FIELD_GET(K210_ACLK_DIV, reg);
+
+	return parent_rate / (2UL << shift);
+}
+
+static const struct clk_ops k210_aclk_ops = {
+	.set_parent	= k210_aclk_set_parent,
+	.get_parent	= k210_aclk_get_parent,
+	.recalc_rate	= k210_aclk_get_rate,
+};
+
+/*
+ * ACLK has IN0 and PLL0 as parents.
+ */
+static int __init k210_register_aclk(struct device_node *np,
+				     struct k210_sysclk *ksc)
+{
+	struct clk_init_data init = {};
+	const struct clk_parent_data parent_data[] = {
+		{ /* .index = 0 for in0 */ },
+		{ .hw = &ksc->plls[K210_PLL0].hw },
+	};
+	int ret;
+
+	init.name = "aclk";
+	init.parent_data = parent_data;
+	init.num_parents = 2;
+	init.ops = &k210_aclk_ops;
+	ksc->aclk.init = &init;
+
+	ret = of_clk_hw_register(np, &ksc->aclk);
+	if (ret) {
+		pr_err("%pOFP: register aclk failed\n", np);
+		return ret;
+	}
+
+	return 0;
+}
+
+#define to_k210_clk(_hw)	container_of(_hw, struct k210_clk, hw)
+
+static int k210_clk_enable(struct clk_hw *hw)
+{
+	struct k210_clk *kclk = to_k210_clk(hw);
+	struct k210_sysclk *ksc = kclk->ksc;
+	struct k210_clk_cfg *cfg = &k210_clk_cfgs[kclk->id];
+	unsigned long flags;
+	u32 reg;
+
+	if (!cfg->gate_reg)
+		return 0;
+
+	spin_lock_irqsave(&ksc->clk_lock, flags);
+	reg = readl(ksc->regs + cfg->gate_reg);
+	reg |= BIT(cfg->gate_bit);
+	writel(reg, ksc->regs + cfg->gate_reg);
+	spin_unlock_irqrestore(&ksc->clk_lock, flags);
+
+	return 0;
+}
+
+static void k210_clk_disable(struct clk_hw *hw)
+{
+	struct k210_clk *kclk = to_k210_clk(hw);
+	struct k210_sysclk *ksc = kclk->ksc;
+	struct k210_clk_cfg *cfg = &k210_clk_cfgs[kclk->id];
+	unsigned long flags;
+	u32 reg;
+
+	if (!cfg->gate_reg)
+		return;
+
+	spin_lock_irqsave(&ksc->clk_lock, flags);
+	reg = readl(ksc->regs + cfg->gate_reg);
+	reg &= ~BIT(cfg->gate_bit);
+	writel(reg, ksc->regs + cfg->gate_reg);
+	spin_unlock_irqrestore(&ksc->clk_lock, flags);
+}
+
+static int k210_clk_set_parent(struct clk_hw *hw, u8 index)
+{
+	struct k210_clk *kclk = to_k210_clk(hw);
+	struct k210_sysclk *ksc = kclk->ksc;
+	struct k210_clk_cfg *cfg = &k210_clk_cfgs[kclk->id];
+	unsigned long flags;
+	u32 reg;
+
+	spin_lock_irqsave(&ksc->clk_lock, flags);
+	reg = readl(ksc->regs + cfg->mux_reg);
+	if (index)
+		reg |= BIT(cfg->mux_bit);
+	else
+		reg &= ~BIT(cfg->mux_bit);
+	spin_unlock_irqrestore(&ksc->clk_lock, flags);
+
+	return 0;
+}
+
+static u8 k210_clk_get_parent(struct clk_hw *hw)
+{
+	struct k210_clk *kclk = to_k210_clk(hw);
+	struct k210_sysclk *ksc = kclk->ksc;
+	struct k210_clk_cfg *cfg = &k210_clk_cfgs[kclk->id];
+	unsigned long flags;
+	u32 reg, idx;
+
+	spin_lock_irqsave(&ksc->clk_lock, flags);
+	reg = readl(ksc->regs + cfg->mux_reg);
+	idx = (reg & BIT(cfg->mux_bit)) ? 1 : 0;
+	spin_unlock_irqrestore(&ksc->clk_lock, flags);
+
+	return idx;
+}
+
+static unsigned long k210_clk_get_rate(struct clk_hw *hw,
+				       unsigned long parent_rate)
+{
+	struct k210_clk *kclk = to_k210_clk(hw);
+	struct k210_sysclk *ksc = kclk->ksc;
+	struct k210_clk_cfg *cfg = &k210_clk_cfgs[kclk->id];
+	u32 reg, div_val;
+
+	if (!cfg->div_reg)
+		return parent_rate;
+
+	reg = readl(ksc->regs + cfg->div_reg);
+	div_val = (reg >> cfg->div_shift) & GENMASK(cfg->div_width - 1, 0);
+
+	switch (cfg->div_type) {
+	case K210_DIV_ONE_BASED:
+		return parent_rate / (div_val + 1);
+	case K210_DIV_DOUBLE_ONE_BASED:
+		return parent_rate / ((div_val + 1) * 2);
+	case K210_DIV_POWER_OF_TWO:
+		return parent_rate / (2UL << div_val);
+	case K210_DIV_NONE:
+	default:
+		return 0;
+	}
+}
+
+static const struct clk_ops k210_clk_mux_ops = {
+	.enable		= k210_clk_enable,
+	.disable	= k210_clk_disable,
+	.set_parent	= k210_clk_set_parent,
+	.get_parent	= k210_clk_get_parent,
+	.recalc_rate	= k210_clk_get_rate,
+};
+
+static const struct clk_ops k210_clk_ops = {
+	.enable		= k210_clk_enable,
+	.disable	= k210_clk_disable,
+	.recalc_rate	= k210_clk_get_rate,
+};
+
+static void __init k210_register_clk(struct device_node *np,
+				     struct k210_sysclk *ksc, int id,
+				     const struct clk_parent_data *parent_data,
+				     int num_parents, unsigned long flags)
+{
+	struct k210_clk *kclk = &ksc->clks[id];
+	struct clk_init_data init = {};
+	int ret;
+
+	init.name = k210_clk_cfgs[id].name;
+	init.flags = flags;
+	init.parent_data = parent_data;
+	init.num_parents = num_parents;
+	if (num_parents > 1)
+		init.ops = &k210_clk_mux_ops;
+	else
+		init.ops = &k210_clk_ops;
+
+	kclk->id = id;
+	kclk->ksc = ksc;
+	kclk->hw.init = &init;
+
+	ret = of_clk_hw_register(np, &kclk->hw);
+	if (ret) {
+		pr_err("%pOFP: register clock %s failed\n",
+		       np, k210_clk_cfgs[id].name);
+		kclk->id = -1;
+	}
+}
+
+/*
+ * All muxed clocks have IN0 and PLL0 as parents.
+ */
+static inline void __init k210_register_mux_clk(struct device_node *np,
+						struct k210_sysclk *ksc, int id)
+{
+	const struct clk_parent_data parent_data[2] = {
+		{ /* .index = 0 for in0 */ },
+		{ .hw = &ksc->plls[K210_PLL0].hw }
+	};
+
+	k210_register_clk(np, ksc, id, parent_data, 2, 0);
+}
+
+static inline void __init k210_register_in0_child(struct device_node *np,
+						struct k210_sysclk *ksc, int id)
+{
+	const struct clk_parent_data parent_data = {
+		/* .index = 0 for in0 */
+	};
+
+	k210_register_clk(np, ksc, id, &parent_data, 1, 0);
+}
+
+static inline void __init k210_register_pll_child(struct device_node *np,
+						struct k210_sysclk *ksc, int id,
+						enum k210_pll_id pllid,
+						unsigned long flags)
+{
+	const struct clk_parent_data parent_data = {
+		.hw = &ksc->plls[pllid].hw,
+	};
+
+	k210_register_clk(np, ksc, id, &parent_data, 1, flags);
+}
+
+static inline void __init k210_register_aclk_child(struct device_node *np,
+						struct k210_sysclk *ksc, int id,
+						unsigned long flags)
+{
+	const struct clk_parent_data parent_data = {
+		.hw = &ksc->aclk,
+	};
+
+	k210_register_clk(np, ksc, id, &parent_data, 1, flags);
+}
+
+static inline void __init k210_register_clk_child(struct device_node *np,
+						struct k210_sysclk *ksc, int id,
+						int parent_id)
+{
+	const struct clk_parent_data parent_data = {
+		.hw = &ksc->clks[parent_id].hw,
+	};
+
+	k210_register_clk(np, ksc, id, &parent_data, 1, 0);
+}
+
+static struct clk_hw *k210_clk_hw_onecell_get(struct of_phandle_args *clkspec,
+					      void *data)
+{
+	struct k210_sysclk *ksc = data;
+	unsigned int idx = clkspec->args[0];
+
+	if (idx >= K210_NUM_CLKS)
+		return ERR_PTR(-EINVAL);
+
+	return &ksc->clks[idx].hw;
+}
+
+static void __init k210_clk_init(struct device_node *np)
+{
+	struct device_node *sysctl_np;
+	struct k210_sysclk *ksc;
+	int i, ret;
+
+	ksc = kzalloc(sizeof(*ksc), GFP_KERNEL);
+	if (!ksc)
+		return;
+
+	spin_lock_init(&ksc->clk_lock);
+	sysctl_np = of_get_parent(np);
+	ksc->regs = of_iomap(sysctl_np, 0);
+	of_node_put(sysctl_np);
+	if (!ksc->regs) {
+		pr_err("%pOFP: failed to map registers\n", np);
+		return;
+	}
+
+	ret = k210_register_plls(np, ksc);
+	if (ret)
+		return;
+
+	ret = k210_register_aclk(np, ksc);
+	if (ret)
+		return;
+
+	/*
+	 * Critical clocks: there are no consumers of the SRAM clocks,
+	 * including the AI clock for the third SRAM bank. The CPU clock
+	 * is only referenced by the uarths serial device and so would be
+	 * disabled if the serial console is disabled to switch to another
+	 * console. Mark all these clocks as critical so that they are never
+	 * disabled by the core clock management.
+	 */
+	k210_register_aclk_child(np, ksc, K210_CLK_CPU, CLK_IS_CRITICAL);
+	k210_register_aclk_child(np, ksc, K210_CLK_SRAM0, CLK_IS_CRITICAL);
+	k210_register_aclk_child(np, ksc, K210_CLK_SRAM1, CLK_IS_CRITICAL);
+	k210_register_pll_child(np, ksc, K210_CLK_AI, K210_PLL1,
+				CLK_IS_CRITICAL);
+
+	/* Clocks with aclk as source */
+	k210_register_aclk_child(np, ksc, K210_CLK_DMA, 0);
+	k210_register_aclk_child(np, ksc, K210_CLK_FFT, 0);
+	k210_register_aclk_child(np, ksc, K210_CLK_ROM, 0);
+	k210_register_aclk_child(np, ksc, K210_CLK_DVP, 0);
+	k210_register_aclk_child(np, ksc, K210_CLK_APB0, 0);
+	k210_register_aclk_child(np, ksc, K210_CLK_APB1, 0);
+	k210_register_aclk_child(np, ksc, K210_CLK_APB2, 0);
+
+	/* Clocks with PLL0 as source */
+	k210_register_pll_child(np, ksc, K210_CLK_SPI0, K210_PLL0, 0);
+	k210_register_pll_child(np, ksc, K210_CLK_SPI1, K210_PLL0, 0);
+	k210_register_pll_child(np, ksc, K210_CLK_SPI2, K210_PLL0, 0);
+	k210_register_pll_child(np, ksc, K210_CLK_I2C0, K210_PLL0, 0);
+	k210_register_pll_child(np, ksc, K210_CLK_I2C1, K210_PLL0, 0);
+	k210_register_pll_child(np, ksc, K210_CLK_I2C2, K210_PLL0, 0);
+
+	/* Clocks with PLL2 as source */
+	k210_register_pll_child(np, ksc, K210_CLK_I2S0, K210_PLL2, 0);
+	k210_register_pll_child(np, ksc, K210_CLK_I2S1, K210_PLL2, 0);
+	k210_register_pll_child(np, ksc, K210_CLK_I2S2, K210_PLL2, 0);
+	k210_register_pll_child(np, ksc, K210_CLK_I2S0_M, K210_PLL2, 0);
+	k210_register_pll_child(np, ksc, K210_CLK_I2S1_M, K210_PLL2, 0);
+	k210_register_pll_child(np, ksc, K210_CLK_I2S2_M, K210_PLL2, 0);
+
+	/* Clocks with IN0 as source */
+	k210_register_in0_child(np, ksc, K210_CLK_WDT0);
+	k210_register_in0_child(np, ksc, K210_CLK_WDT1);
+	k210_register_in0_child(np, ksc, K210_CLK_RTC);
+
+	/* Clocks with APB0 as source */
+	k210_register_clk_child(np, ksc, K210_CLK_GPIO, K210_CLK_APB0);
+	k210_register_clk_child(np, ksc, K210_CLK_UART1, K210_CLK_APB0);
+	k210_register_clk_child(np, ksc, K210_CLK_UART2, K210_CLK_APB0);
+	k210_register_clk_child(np, ksc, K210_CLK_UART3, K210_CLK_APB0);
+	k210_register_clk_child(np, ksc, K210_CLK_FPIOA, K210_CLK_APB0);
+	k210_register_clk_child(np, ksc, K210_CLK_SHA, K210_CLK_APB0);
+
+	/* Clocks with APB1 as source */
+	k210_register_clk_child(np, ksc, K210_CLK_AES, K210_CLK_APB1);
+	k210_register_clk_child(np, ksc, K210_CLK_OTP, K210_CLK_APB1);
+
+	/* Mux clocks with in0 or pll0 as source */
+	k210_register_mux_clk(np, ksc, K210_CLK_SPI3);
+	k210_register_mux_clk(np, ksc, K210_CLK_TIMER0);
+	k210_register_mux_clk(np, ksc, K210_CLK_TIMER1);
+	k210_register_mux_clk(np, ksc, K210_CLK_TIMER2);
+
+	/* Check for registration errors */
+	for (i = 0; i < K210_NUM_CLKS; i++) {
+		if (ksc->clks[i].id != i)
+			return;
+	}
+
+	ret = of_clk_add_hw_provider(np, k210_clk_hw_onecell_get, ksc);
+	if (ret) {
+		pr_err("%pOFP: add clock provider failed %d\n", np, ret);
+		return;
+	}
+
+	pr_info("%pOFP: CPU running at %lu MHz\n",
+		np, clk_hw_get_rate(&ksc->clks[K210_CLK_CPU].hw) / 1000000);
+}
+
+CLK_OF_DECLARE(k210_clk, "canaan,k210-clk", k210_clk_init);
+
+/*
+ * Enable PLL1 to be able to use the AI SRAM.
+ */
+void __init k210_clk_early_init(void __iomem *regs)
+{
+	struct k210_pll pll1;
+
+	/* Make sure ACLK selector is set to PLL0 */
+	k210_aclk_set_selector(regs, 1);
+
+	/* Startup PLL1 to enable the aisram bank for general memory use */
+	k210_init_pll(regs, K210_PLL1, &pll1);
+	k210_pll_enable_hw(regs, &pll1);
+}
diff --git a/drivers/soc/canaan/Kconfig b/drivers/soc/canaan/Kconfig
index 5232d13f07e5..8179b69518b4 100644
--- a/drivers/soc/canaan/Kconfig
+++ b/drivers/soc/canaan/Kconfig
@@ -1,14 +1,12 @@
 # SPDX-License-Identifier: GPL-2.0
 
-if SOC_CANAAN
-
-config K210_SYSCTL
+config SOC_K210_SYSCTL
 	bool "Canaan Kendryte K210 SoC system controller"
-	default y
-	depends on RISCV
+	depends on RISCV && SOC_CANAAN && OF
+	default SOC_CANAAN
+        select PM
+        select SIMPLE_PM_BUS
+        select SYSCON
+        select MFD_SYSCON
 	help
-	  Enables controlling the K210 various clocks and to enable
-	  general purpose use of the extra 2MB of SRAM normally
-	  reserved for the AI engine.
-
-endif
+	  Canaan Kendryte K210 SoC system controller driver.
diff --git a/drivers/soc/canaan/Makefile b/drivers/soc/canaan/Makefile
index 002d9ce95c0d..570280ad7967 100644
--- a/drivers/soc/canaan/Makefile
+++ b/drivers/soc/canaan/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 
-obj-$(CONFIG_K210_SYSCTL)	+= k210-sysctl.o
+obj-$(CONFIG_SOC_K210_SYSCTL)	+= k210-sysctl.o
diff --git a/drivers/soc/canaan/k210-sysctl.c b/drivers/soc/canaan/k210-sysctl.c
index 60b474c33d45..27a346c406bc 100644
--- a/drivers/soc/canaan/k210-sysctl.c
+++ b/drivers/soc/canaan/k210-sysctl.c
@@ -3,165 +3,45 @@
  * Copyright (c) 2019 Christoph Hellwig.
  * Copyright (c) 2019 Western Digital Corporation or its affiliates.
  */
-#include <linux/types.h>
 #include <linux/io.h>
-#include <linux/of.h>
 #include <linux/platform_device.h>
-#include <linux/clk-provider.h>
-#include <linux/clkdev.h>
-#include <linux/bitfield.h>
+#include <linux/of_platform.h>
+#include <linux/clk.h>
 #include <asm/soc.h>
 
 #include <soc/canaan/k210-sysctl.h>
 
-#define K210_SYSCTL_CLK0_FREQ		26000000UL
-
-/* Registers base address */
-#define K210_SYSCTL_SYSCTL_BASE_ADDR	0x50440000ULL
-
-/* Register bits */
-/* K210_SYSCTL_PLL1: clkr: 4bits, clkf1: 6bits, clkod: 4bits, bwadj: 4bits */
-#define PLL_RESET		(1 << 20)
-#define PLL_PWR			(1 << 21)
-#define PLL_BYPASS		(1 << 23)
-#define PLL_OUT_EN		(1 << 25)
-/* K210_SYSCTL_PLL_LOCK */
-#define PLL1_LOCK1		(1 << 8)
-#define PLL1_LOCK2		(1 << 9)
-#define PLL1_SLIP_CLEAR		(1 << 10)
-/* K210_SYSCTL_SEL0 */
-#define CLKSEL_ACLK		(1 << 0)
-/* K210_SYSCTL_CLKEN_CENT */
-#define CLKEN_CPU		(1 << 0)
-#define CLKEN_SRAM0		(1 << 1)
-#define CLKEN_SRAM1		(1 << 2)
-/* K210_SYSCTL_EN_PERI */
-#define CLKEN_ROM		(1 << 0)
-#define CLKEN_TIMER0		(1 << 21)
-#define CLKEN_RTC		(1 << 29)
-
-struct k210_sysctl {
-	void __iomem		*regs;
-	struct clk_hw		hw;
-};
-
-static void k210_set_bits(u32 val, void __iomem *reg)
-{
-	writel(readl(reg) | val, reg);
-}
-
-static void k210_clear_bits(u32 val, void __iomem *reg)
-{
-	writel(readl(reg) & ~val, reg);
-}
-
-static void k210_pll1_enable(void __iomem *regs)
-{
-	u32 val;
-
-	val = readl(regs + K210_SYSCTL_PLL1);
-	val &= ~GENMASK(19, 0);				/* clkr1 = 0 */
-	val |= FIELD_PREP(GENMASK(9, 4), 0x3B);		/* clkf1 = 59 */
-	val |= FIELD_PREP(GENMASK(13, 10), 0x3);	/* clkod1 = 3 */
-	val |= FIELD_PREP(GENMASK(19, 14), 0x3B);	/* bwadj1 = 59 */
-	writel(val, regs + K210_SYSCTL_PLL1);
-
-	k210_clear_bits(PLL_BYPASS, regs + K210_SYSCTL_PLL1);
-	k210_set_bits(PLL_PWR, regs + K210_SYSCTL_PLL1);
-
-	/*
-	 * Reset the pll. The magic NOPs come from the Kendryte reference SDK.
-	 */
-	k210_clear_bits(PLL_RESET, regs + K210_SYSCTL_PLL1);
-	k210_set_bits(PLL_RESET, regs + K210_SYSCTL_PLL1);
-	nop();
-	nop();
-	k210_clear_bits(PLL_RESET, regs + K210_SYSCTL_PLL1);
-
-	for (;;) {
-		val = readl(regs + K210_SYSCTL_PLL_LOCK);
-		if (val & PLL1_LOCK2)
-			break;
-		writel(val | PLL1_SLIP_CLEAR, regs + K210_SYSCTL_PLL_LOCK);
-	}
-
-	k210_set_bits(PLL_OUT_EN, regs + K210_SYSCTL_PLL1);
-}
-
-static unsigned long k210_sysctl_clk_recalc_rate(struct clk_hw *hw,
-		unsigned long parent_rate)
-{
-	struct k210_sysctl *s = container_of(hw, struct k210_sysctl, hw);
-	u32 clksel0, pll0;
-	u64 pll0_freq, clkr0, clkf0, clkod0;
-
-	/*
-	 * If the clock selector is not set, use the base frequency.
-	 * Otherwise, use PLL0 frequency with a frequency divisor.
-	 */
-	clksel0 = readl(s->regs + K210_SYSCTL_SEL0);
-	if (!(clksel0 & CLKSEL_ACLK))
-		return K210_SYSCTL_CLK0_FREQ;
-
-	/*
-	 * Get PLL0 frequency:
-	 * freq = base frequency * clkf0 / (clkr0 * clkod0)
-	 */
-	pll0 = readl(s->regs + K210_SYSCTL_PLL0);
-	clkr0 = 1 + FIELD_GET(GENMASK(3, 0), pll0);
-	clkf0 = 1 + FIELD_GET(GENMASK(9, 4), pll0);
-	clkod0 = 1 + FIELD_GET(GENMASK(13, 10), pll0);
-	pll0_freq = clkf0 * K210_SYSCTL_CLK0_FREQ / (clkr0 * clkod0);
-
-	/* Get the frequency divisor from the clock selector */
-	return pll0_freq / (2ULL << FIELD_GET(0x00000006, clksel0));
-}
-
-static const struct clk_ops k210_sysctl_clk_ops = {
-	.recalc_rate	= k210_sysctl_clk_recalc_rate,
-};
-
-static const struct clk_init_data k210_clk_init_data = {
-	.name		= "k210-sysctl-pll1",
-	.ops		= &k210_sysctl_clk_ops,
-};
-
 static int k210_sysctl_probe(struct platform_device *pdev)
 {
-	struct k210_sysctl *s;
-	int error;
-
-	pr_info("Kendryte K210 SoC sysctl\n");
-
-	s = devm_kzalloc(&pdev->dev, sizeof(*s), GFP_KERNEL);
-	if (!s)
-		return -ENOMEM;
-
-	s->regs = devm_ioremap_resource(&pdev->dev,
-			platform_get_resource(pdev, IORESOURCE_MEM, 0));
-	if (IS_ERR(s->regs))
-		return PTR_ERR(s->regs);
-
-	s->hw.init = &k210_clk_init_data;
-	error = devm_clk_hw_register(&pdev->dev, &s->hw);
-	if (error) {
-		dev_err(&pdev->dev, "failed to register clk");
-		return error;
+	struct device *dev = &pdev->dev;
+	struct clk *pclk;
+	int ret;
+
+	dev_info(dev, "K210 system controller\n");
+
+	/* Get power bus clock */
+	pclk = devm_clk_get(dev, NULL);
+	if (IS_ERR(pclk))
+		return dev_err_probe(dev, PTR_ERR(pclk),
+				     "Get bus clock failed\n");
+
+	ret = clk_prepare_enable(pclk);
+	if (ret) {
+		dev_err(dev, "Enable bus clock failed\n");
+		return ret;
 	}
 
-	error = devm_of_clk_add_hw_provider(&pdev->dev, of_clk_hw_simple_get,
-					    &s->hw);
-	if (error) {
-		dev_err(&pdev->dev, "adding clk provider failed\n");
-		return error;
-	}
+	/* Populate children */
+	ret = devm_of_platform_populate(dev);
+	if (ret)
+		dev_err(dev, "Populate platform failed %d\n", ret);
 
-	return 0;
+	return ret;
 }
 
 static const struct of_device_id k210_sysctl_of_match[] = {
-	{ .compatible = "kendryte,k210-sysctl", },
-	{}
+	{ .compatible = "canaan,k210-sysctl", },
+	{ /* sentinel */ },
 };
 
 static struct platform_driver k210_sysctl_driver = {
@@ -171,12 +51,13 @@ static struct platform_driver k210_sysctl_driver = {
 	},
 	.probe			= k210_sysctl_probe,
 };
+builtin_platform_driver(k210_sysctl_driver);
 
-static int __init k210_sysctl_init(void)
-{
-	return platform_driver_register(&k210_sysctl_driver);
-}
-core_initcall(k210_sysctl_init);
+/*
+ * System controller registers base address and size.
+ */
+#define K210_SYSCTL_BASE_ADDR	0x50440000ULL
+#define K210_SYSCTL_BASE_SIZE	0x1000
 
 /*
  * This needs to be called very early during initialization, given that
@@ -184,24 +65,14 @@ core_initcall(k210_sysctl_init);
  */
 static void __init k210_soc_early_init(const void *fdt)
 {
-	void __iomem *regs;
-
-	regs = ioremap(K210_SYSCTL_SYSCTL_BASE_ADDR, 0x1000);
-	if (!regs)
-		panic("K210 sysctl ioremap");
-
-	/* Enable PLL1 to make the KPU SRAM useable */
-	k210_pll1_enable(regs);
-
-	k210_set_bits(PLL_OUT_EN, regs + K210_SYSCTL_PLL0);
+	void __iomem *sysctl_base;
 
-	k210_set_bits(CLKEN_CPU | CLKEN_SRAM0 | CLKEN_SRAM1,
-		      regs + K210_SYSCTL_EN_CENT);
-	k210_set_bits(CLKEN_ROM | CLKEN_TIMER0 | CLKEN_RTC,
-		      regs + K210_SYSCTL_EN_PERI);
+	sysctl_base = ioremap(K210_SYSCTL_BASE_ADDR, K210_SYSCTL_BASE_SIZE);
+	if (!sysctl_base)
+		panic("k210-sysctl: ioremap failed");
 
-	k210_set_bits(CLKSEL_ACLK, regs + K210_SYSCTL_SEL0);
+	k210_clk_early_init(sysctl_base);
 
-	iounmap(regs);
+	iounmap(sysctl_base);
 }
-SOC_EARLY_INIT_DECLARE(generic_k210, "kendryte,k210", k210_soc_early_init);
+SOC_EARLY_INIT_DECLARE(k210_soc, "canaan,kendryte-k210", k210_soc_early_init);
diff --git a/include/soc/canaan/k210-sysctl.h b/include/soc/canaan/k210-sysctl.h
index 2e037db68f35..0c2b2c2dabca 100644
--- a/include/soc/canaan/k210-sysctl.h
+++ b/include/soc/canaan/k210-sysctl.h
@@ -38,4 +38,6 @@
 #define K210_SYSCTL_DMA_SEL1	0x68 /* DMA handshake selector 1 */
 #define K210_SYSCTL_POWER_SEL	0x6C /* IO Power Mode Select controller */
 
+void k210_clk_early_init(void __iomem *regs);
+
 #endif
-- 
cgit v1.2.3


From 67d96729a9e789ecfddb0f701e5ec18389758dab Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 10 Feb 2021 14:02:23 +0900
Subject: riscv: Update Canaan Kendryte K210 device tree

Update the Canaan Kendryte K210 base device tree k210.dtsi to define
all supported peripherals of the SoC, their clocks and reset lines.
The device tree file k210.dts is renamed to k210_generic.dts and
becomes the default value selection of the configuration option
SOC_CANAAN_K210_DTB_BUILTIN_SOURCE. No device beside the serial console
is defined by this device tree. This makes this generic device tree
suitable for use with a builtin initramfs with all known K210 based
boards.

These changes result in the K210_CLK_ACLK clock ID to be unused and
removed from the dt-bindings k210-clk.h header file.

Most updates to the k210.dtsi file come from Sean Anderson's work on
U-Boot support for the K210.

Cc: Rob Herring <robh@kernel.org>
Cc: devicetree@vger.kernel.org
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 arch/riscv/Kconfig.socs                     |   2 +-
 arch/riscv/boot/dts/canaan/k210.dts         |  23 --
 arch/riscv/boot/dts/canaan/k210.dtsi        | 395 ++++++++++++++++++++++++++--
 arch/riscv/boot/dts/canaan/k210_generic.dts |  46 ++++
 include/dt-bindings/clock/k210-clk.h        |   1 -
 5 files changed, 414 insertions(+), 53 deletions(-)
 delete mode 100644 arch/riscv/boot/dts/canaan/k210.dts
 create mode 100644 arch/riscv/boot/dts/canaan/k210_generic.dts

(limited to 'include')

diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs
index 6402746c68f3..7efcece8896c 100644
--- a/arch/riscv/Kconfig.socs
+++ b/arch/riscv/Kconfig.socs
@@ -51,7 +51,7 @@ config SOC_CANAAN_K210_DTB_SOURCE
 	string "Source file for the Canaan Kendryte K210 builtin DTB"
 	depends on SOC_CANAAN
 	depends on SOC_CANAAN_K210_DTB_BUILTIN
-	default "k210"
+	default "k210_generic"
 	help
 	  Base name (without suffix, relative to arch/riscv/boot/dts/canaan)
 	  for the DTS file that will be used to produce the DTB linked into the
diff --git a/arch/riscv/boot/dts/canaan/k210.dts b/arch/riscv/boot/dts/canaan/k210.dts
deleted file mode 100644
index 0d1f28fce6b2..000000000000
--- a/arch/riscv/boot/dts/canaan/k210.dts
+++ /dev/null
@@ -1,23 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Copyright (C) 2020 Western Digital Corporation or its affiliates.
- */
-
-/dts-v1/;
-
-#include "k210.dtsi"
-
-/ {
-	model = "Kendryte K210 generic";
-	compatible = "kendryte,k210";
-
-	chosen {
-		bootargs = "earlycon console=ttySIF0";
-		stdout-path = "serial0";
-	};
-};
-
-&uarths0 {
-	status = "okay";
-};
-
diff --git a/arch/riscv/boot/dts/canaan/k210.dtsi b/arch/riscv/boot/dts/canaan/k210.dtsi
index 354b263195a3..5e8ca8142482 100644
--- a/arch/riscv/boot/dts/canaan/k210.dtsi
+++ b/arch/riscv/boot/dts/canaan/k210.dtsi
@@ -1,9 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0+
 /*
- * Copyright (C) 2019 Sean Anderson <seanga2@gmail.com>
+ * Copyright (C) 2019-20 Sean Anderson <seanga2@gmail.com>
  * Copyright (C) 2020 Western Digital Corporation or its affiliates.
  */
 #include <dt-bindings/clock/k210-clk.h>
+#include <dt-bindings/pinctrl/k210-fpioa.h>
+#include <dt-bindings/reset/k210-rst.h>
 
 / {
 	/*
@@ -12,14 +14,17 @@
 	 */
 	#address-cells = <1>;
 	#size-cells = <1>;
-	compatible = "kendryte,k210";
+	compatible = "canaan,kendryte-k210";
 
 	aliases {
 		serial0 = &uarths0;
+		serial1 = &uart1;
+		serial2 = &uart2;
+		serial3 = &uart3;
 	};
 
 	/*
-	 * The K210 has an sv39 MMU following the priviledge specification v1.9.
+	 * The K210 has an sv39 MMU following the privileged specification v1.9.
 	 * Since this is a non-ratified draft specification, the kernel does not
 	 * support it and the K210 support enabled only for the !MMU case.
 	 * Be consistent with this by setting the CPUs MMU type to "none".
@@ -30,14 +35,14 @@
 		timebase-frequency = <7800000>;
 		cpu0: cpu@0 {
 			device_type = "cpu";
+			compatible = "canaan,k210", "riscv";
 			reg = <0>;
-			compatible = "kendryte,k210", "sifive,rocket0", "riscv";
 			riscv,isa = "rv64imafdc";
-			mmu-type = "none";
-			i-cache-size = <0x8000>;
+			mmu-type = "riscv,none";
 			i-cache-block-size = <64>;
-			d-cache-size = <0x8000>;
+			i-cache-size = <0x8000>;
 			d-cache-block-size = <64>;
+			d-cache-size = <0x8000>;
 			cpu0_intc: interrupt-controller {
 				#interrupt-cells = <1>;
 				interrupt-controller;
@@ -46,14 +51,14 @@
 		};
 		cpu1: cpu@1 {
 			device_type = "cpu";
+			compatible = "canaan,k210", "riscv";
 			reg = <1>;
-			compatible = "kendryte,k210", "sifive,rocket0", "riscv";
 			riscv,isa = "rv64imafdc";
-			mmu-type = "none";
-			i-cache-size = <0x8000>;
+			mmu-type = "riscv,none";
 			i-cache-block-size = <64>;
-			d-cache-size = <0x8000>;
+			i-cache-size = <0x8000>;
 			d-cache-block-size = <64>;
+			d-cache-size = <0x8000>;
 			cpu1_intc: interrupt-controller {
 				#interrupt-cells = <1>;
 				interrupt-controller;
@@ -64,10 +69,15 @@
 
 	sram: memory@80000000 {
 		device_type = "memory";
+		compatible = "canaan,k210-sram";
 		reg = <0x80000000 0x400000>,
 		      <0x80400000 0x200000>,
 		      <0x80600000 0x200000>;
 		reg-names = "sram0", "sram1", "aisram";
+		clocks = <&sysclk K210_CLK_SRAM0>,
+			 <&sysclk K210_CLK_SRAM1>,
+			 <&sysclk K210_CLK_AI>;
+		clock-names = "sram0", "sram1", "aisram";
 	};
 
 	clocks {
@@ -81,40 +91,369 @@
 	soc {
 		#address-cells = <1>;
 		#size-cells = <1>;
-		compatible = "kendryte,k210-soc", "simple-bus";
+		compatible = "simple-bus";
 		ranges;
 		interrupt-parent = <&plic0>;
 
-		sysctl: sysctl@50440000 {
-			compatible = "kendryte,k210-sysctl", "simple-mfd";
-			reg = <0x50440000 0x1000>;
-			#clock-cells = <1>;
+		rom0: nvmem@1000 {
+			reg = <0x1000 0x1000>;
+			read-only;
 		};
 
-		clint0: clint@2000000 {
-			#interrupt-cells = <1>;
-			compatible = "riscv,clint0";
+		clint0: timer@2000000 {
+			compatible = "canaan,k210-clint", "sifive,clint0";
 			reg = <0x2000000 0xC000>;
-			interrupts-extended =  <&cpu0_intc 3 &cpu0_intc 7
-						&cpu1_intc 3 &cpu1_intc 7>;
+			interrupts-extended = <&cpu0_intc 3 &cpu0_intc 7
+					      &cpu1_intc 3 &cpu1_intc 7>;
 		};
 
 		plic0: interrupt-controller@c000000 {
 			#interrupt-cells = <1>;
-			interrupt-controller;
-			compatible = "kendryte,k210-plic0", "riscv,plic0";
+			#address-cells = <0>;
+			compatible = "canaan,k210-plic", "sifive,plic-1.0.0";
 			reg = <0xC000000 0x4000000>;
-			interrupts-extended = <&cpu0_intc 11>, <&cpu0_intc 0xffffffff>,
-					      <&cpu1_intc 11>, <&cpu1_intc 0xffffffff>;
+			interrupt-controller;
+			interrupts-extended = <&cpu0_intc 11 &cpu1_intc 11>;
 			riscv,ndev = <65>;
-			riscv,max-priority = <7>;
 		};
 
 		uarths0: serial@38000000 {
-			compatible = "kendryte,k210-uarths", "sifive,uart0";
+			compatible = "canaan,k210-uarths", "sifive,uart0";
 			reg = <0x38000000 0x1000>;
 			interrupts = <33>;
-			clocks = <&sysctl K210_CLK_CPU>;
+			clocks = <&sysclk K210_CLK_CPU>;
+		};
+
+		gpio0: gpio-controller@38001000 {
+			#interrupt-cells = <2>;
+			#gpio-cells = <2>;
+			compatible = "canaan,k210-gpiohs", "sifive,gpio0";
+			reg = <0x38001000 0x1000>;
+			interrupt-controller;
+			interrupts = <34 35 36 37 38 39 40 41
+				      42 43 44 45 46 47 48 49
+				      50 51 52 53 54 55 56 57
+				      58 59 60 61 62 63 64 65>;
+			gpio-controller;
+			ngpios = <32>;
+		};
+
+		dmac0: dma-controller@50000000 {
+			compatible = "snps,axi-dma-1.01a";
+			reg = <0x50000000 0x1000>;
+			interrupts = <27 28 29 30 31 32>;
+			#dma-cells = <1>;
+			clocks = <&sysclk K210_CLK_DMA>, <&sysclk K210_CLK_DMA>;
+			clock-names = "core-clk", "cfgr-clk";
+			resets = <&sysrst K210_RST_DMA>;
+			dma-channels = <6>;
+			snps,dma-masters = <2>;
+			snps,priority = <0 1 2 3 4 5>;
+			snps,data-width = <5>;
+			snps,block-size = <0x200000 0x200000 0x200000
+					   0x200000 0x200000 0x200000>;
+			snps,axi-max-burst-len = <256>;
+		};
+
+		apb0: bus@50200000 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "simple-pm-bus";
+			ranges;
+			clocks = <&sysclk K210_CLK_APB0>;
+
+			gpio1: gpio@50200000 {
+				#address-cells = <1>;
+				#size-cells = <0>;
+				compatible = "snps,dw-apb-gpio";
+				reg = <0x50200000 0x80>;
+				clocks = <&sysclk K210_CLK_APB0>,
+					 <&sysclk K210_CLK_GPIO>;
+				clock-names = "bus", "db";
+				resets = <&sysrst K210_RST_GPIO>;
+
+				gpio1_0: gpio-port@0 {
+					#gpio-cells = <2>;
+					#interrupt-cells = <2>;
+					compatible = "snps,dw-apb-gpio-port";
+					reg = <0>;
+					interrupt-controller;
+					interrupts = <23>;
+					gpio-controller;
+					ngpios = <8>;
+				};
+			};
+
+			uart1: serial@50210000 {
+				compatible = "snps,dw-apb-uart";
+				reg = <0x50210000 0x100>;
+				interrupts = <11>;
+				clocks = <&sysclk K210_CLK_UART1>,
+					 <&sysclk K210_CLK_APB0>;
+				clock-names = "baudclk", "apb_pclk";
+				resets = <&sysrst K210_RST_UART1>;
+				reg-io-width = <4>;
+				reg-shift = <2>;
+				dcd-override;
+				dsr-override;
+				cts-override;
+				ri-override;
+			};
+
+			uart2: serial@50220000 {
+				compatible = "snps,dw-apb-uart";
+				reg = <0x50220000 0x100>;
+				interrupts = <12>;
+				clocks = <&sysclk K210_CLK_UART2>,
+					 <&sysclk K210_CLK_APB0>;
+				clock-names = "baudclk", "apb_pclk";
+				resets = <&sysrst K210_RST_UART2>;
+				reg-io-width = <4>;
+				reg-shift = <2>;
+				dcd-override;
+				dsr-override;
+				cts-override;
+				ri-override;
+			};
+
+			uart3: serial@50230000 {
+				compatible = "snps,dw-apb-uart";
+				reg = <0x50230000 0x100>;
+				interrupts = <13>;
+				clocks = <&sysclk K210_CLK_UART3>,
+					 <&sysclk K210_CLK_APB0>;
+				clock-names = "baudclk", "apb_pclk";
+				resets = <&sysrst K210_RST_UART3>;
+				reg-io-width = <4>;
+				reg-shift = <2>;
+				dcd-override;
+				dsr-override;
+				cts-override;
+				ri-override;
+			};
+
+			spi2: spi@50240000 {
+				compatible = "canaan,k210-spi";
+				spi-slave;
+				reg = <0x50240000 0x100>;
+				#address-cells = <0>;
+				#size-cells = <0>;
+				interrupts = <3>;
+				clocks = <&sysclk K210_CLK_SPI2>,
+					 <&sysclk K210_CLK_APB0>;
+				clock-names = "ssi_clk", "pclk";
+				resets = <&sysrst K210_RST_SPI2>;
+				spi-max-frequency = <25000000>;
+			};
+
+			i2s0: i2s@50250000 {
+				compatible = "snps,designware-i2s";
+				reg = <0x50250000 0x200>;
+				interrupts = <5>;
+				clocks = <&sysclk K210_CLK_I2S0>;
+				clock-names = "i2sclk";
+				resets = <&sysrst K210_RST_I2S0>;
+			};
+
+			i2s1: i2s@50260000 {
+				compatible = "snps,designware-i2s";
+				reg = <0x50260000 0x200>;
+				interrupts = <6>;
+				clocks = <&sysclk K210_CLK_I2S1>;
+				clock-names = "i2sclk";
+				resets = <&sysrst K210_RST_I2S1>;
+			};
+
+			i2s2: i2s@50270000 {
+				compatible = "snps,designware-i2s";
+				reg = <0x50270000 0x200>;
+				interrupts = <7>;
+				clocks = <&sysclk K210_CLK_I2S2>;
+				clock-names = "i2sclk";
+				resets = <&sysrst K210_RST_I2S2>;
+			};
+
+			i2c0: i2c@50280000 {
+				compatible = "snps,designware-i2c";
+				reg = <0x50280000 0x100>;
+				interrupts = <8>;
+				clocks = <&sysclk K210_CLK_I2C0>,
+					 <&sysclk K210_CLK_APB0>;
+				clock-names = "ref", "pclk";
+				resets = <&sysrst K210_RST_I2C0>;
+			};
+
+			i2c1: i2c@50290000 {
+				compatible = "snps,designware-i2c";
+				reg = <0x50290000 0x100>;
+				interrupts = <9>;
+				clocks = <&sysclk K210_CLK_I2C1>,
+					 <&sysclk K210_CLK_APB0>;
+				clock-names = "ref", "pclk";
+				resets = <&sysrst K210_RST_I2C1>;
+			};
+
+			i2c2: i2c@502a0000 {
+				compatible = "snps,designware-i2c";
+				reg = <0x502A0000 0x100>;
+				interrupts = <10>;
+				clocks = <&sysclk K210_CLK_I2C2>,
+					 <&sysclk K210_CLK_APB0>;
+				clock-names = "ref", "pclk";
+				resets = <&sysrst K210_RST_I2C2>;
+			};
+
+			fpioa: pinmux@502b0000 {
+				compatible = "canaan,k210-fpioa";
+				reg = <0x502B0000 0x100>;
+				clocks = <&sysclk K210_CLK_FPIOA>,
+					 <&sysclk K210_CLK_APB0>;
+				clock-names = "ref", "pclk";
+				resets = <&sysrst K210_RST_FPIOA>;
+				canaan,k210-sysctl-power = <&sysctl 108>;
+			};
+
+			timer0: timer@502d0000 {
+				compatible = "snps,dw-apb-timer";
+				reg = <0x502D0000 0x100>;
+				interrupts = <14 15>;
+				clocks = <&sysclk K210_CLK_TIMER0>,
+					 <&sysclk K210_CLK_APB0>;
+				clock-names = "timer", "pclk";
+				resets = <&sysrst K210_RST_TIMER0>;
+			};
+
+			timer1: timer@502e0000 {
+				compatible = "snps,dw-apb-timer";
+				reg = <0x502E0000 0x100>;
+				interrupts = <16 17>;
+				clocks = <&sysclk K210_CLK_TIMER1>,
+					 <&sysclk K210_CLK_APB0>;
+				clock-names = "timer", "pclk";
+				resets = <&sysrst K210_RST_TIMER1>;
+			};
+
+			timer2: timer@502f0000 {
+				compatible = "snps,dw-apb-timer";
+				reg = <0x502F0000 0x100>;
+				interrupts = <18 19>;
+				clocks = <&sysclk K210_CLK_TIMER2>,
+					 <&sysclk K210_CLK_APB0>;
+				clock-names = "timer", "pclk";
+				resets = <&sysrst K210_RST_TIMER2>;
+			};
+		};
+
+		apb1: bus@50400000 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "simple-pm-bus";
+			ranges;
+			clocks = <&sysclk K210_CLK_APB1>;
+
+			wdt0: watchdog@50400000 {
+				compatible = "snps,dw-wdt";
+				reg = <0x50400000 0x100>;
+				interrupts = <21>;
+				clocks = <&sysclk K210_CLK_WDT0>,
+					 <&sysclk K210_CLK_APB1>;
+				clock-names = "tclk", "pclk";
+				resets = <&sysrst K210_RST_WDT0>;
+			};
+
+			wdt1: watchdog@50410000 {
+				compatible = "snps,dw-wdt";
+				reg = <0x50410000 0x100>;
+				interrupts = <22>;
+				clocks = <&sysclk K210_CLK_WDT1>,
+					 <&sysclk K210_CLK_APB1>;
+				clock-names = "tclk", "pclk";
+				resets = <&sysrst K210_RST_WDT1>;
+			};
+
+			sysctl: syscon@50440000 {
+				compatible = "canaan,k210-sysctl",
+					     "syscon", "simple-mfd";
+				reg = <0x50440000 0x100>;
+				clocks = <&sysclk K210_CLK_APB1>;
+				clock-names = "pclk";
+
+				sysclk: clock-controller {
+					#clock-cells = <1>;
+					compatible = "canaan,k210-clk";
+					clocks = <&in0>;
+				};
+
+				sysrst: reset-controller {
+					compatible = "canaan,k210-rst";
+					#reset-cells = <1>;
+				};
+
+				reboot: syscon-reboot {
+					compatible = "syscon-reboot";
+					regmap = <&sysctl>;
+					offset = <48>;
+					mask = <1>;
+					value = <1>;
+				};
+			};
+		};
+
+		apb2: bus@52000000 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "simple-pm-bus";
+			ranges;
+			clocks = <&sysclk K210_CLK_APB2>;
+
+			spi0: spi@52000000 {
+				#address-cells = <1>;
+				#size-cells = <0>;
+				compatible = "canaan,k210-spi";
+				reg = <0x52000000 0x100>;
+				interrupts = <1>;
+				clocks = <&sysclk K210_CLK_SPI0>,
+					 <&sysclk K210_CLK_APB2>;
+				clock-names = "ssi_clk", "pclk";
+				resets = <&sysrst K210_RST_SPI0>;
+				reset-names = "spi";
+				spi-max-frequency = <25000000>;
+				num-cs = <4>;
+				reg-io-width = <4>;
+			};
+
+			spi1: spi@53000000 {
+				#address-cells = <1>;
+				#size-cells = <0>;
+				compatible = "canaan,k210-spi";
+				reg = <0x53000000 0x100>;
+				interrupts = <2>;
+				clocks = <&sysclk K210_CLK_SPI1>,
+					 <&sysclk K210_CLK_APB2>;
+				clock-names = "ssi_clk", "pclk";
+				resets = <&sysrst K210_RST_SPI1>;
+				reset-names = "spi";
+				spi-max-frequency = <25000000>;
+				num-cs = <4>;
+				reg-io-width = <4>;
+			};
+
+			spi3: spi@54000000 {
+				#address-cells = <1>;
+				#size-cells = <0>;
+				compatible = "snps,dwc-ssi-1.01a";
+				reg = <0x54000000 0x200>;
+				interrupts = <4>;
+				clocks = <&sysclk K210_CLK_SPI3>,
+					 <&sysclk K210_CLK_APB2>;
+				clock-names = "ssi_clk", "pclk";
+				resets = <&sysrst K210_RST_SPI3>;
+				reset-names = "spi";
+				/* Could possibly go up to 200 MHz */
+				spi-max-frequency = <100000000>;
+				num-cs = <4>;
+				reg-io-width = <4>;
+			};
 		};
 	};
 };
diff --git a/arch/riscv/boot/dts/canaan/k210_generic.dts b/arch/riscv/boot/dts/canaan/k210_generic.dts
new file mode 100644
index 000000000000..396c8ca4d24d
--- /dev/null
+++ b/arch/riscv/boot/dts/canaan/k210_generic.dts
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019-20 Sean Anderson <seanga2@gmail.com>
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+
+/dts-v1/;
+
+#include "k210.dtsi"
+
+#include <dt-bindings/gpio/gpio.h>
+#include <dt-bindings/input/input.h>
+
+/ {
+	model = "Kendryte K210 generic";
+	compatible = "canaan,kendryte-k210";
+
+	chosen {
+		bootargs = "earlycon console=ttySIF0";
+		stdout-path = "serial0:115200n8";
+	};
+};
+
+&fpioa {
+	pinctrl-0 = <&jtag_pins>;
+	pinctrl-names = "default";
+	status = "okay";
+
+	jtag_pins: jtag-pinmux {
+		pinmux = <K210_FPIOA(0, K210_PCF_JTAG_TCLK)>,
+			 <K210_FPIOA(1, K210_PCF_JTAG_TDI)>,
+			 <K210_FPIOA(2, K210_PCF_JTAG_TMS)>,
+			 <K210_FPIOA(3, K210_PCF_JTAG_TDO)>;
+	};
+
+	uarths_pins: uarths-pinmux {
+		pinmux = <K210_FPIOA(4, K210_PCF_UARTHS_RX)>,
+			 <K210_FPIOA(5, K210_PCF_UARTHS_TX)>;
+	};
+};
+
+&uarths0 {
+	pinctrl-0 = <&uarths_pins>;
+	pinctrl-names = "default";
+	status = "okay";
+};
diff --git a/include/dt-bindings/clock/k210-clk.h b/include/dt-bindings/clock/k210-clk.h
index a48176ad3c23..b2de702cbf75 100644
--- a/include/dt-bindings/clock/k210-clk.h
+++ b/include/dt-bindings/clock/k210-clk.h
@@ -9,7 +9,6 @@
 /*
  * Kendryte K210 SoC clock identifiers (arbitrary values).
  */
-#define K210_CLK_ACLK	0
 #define K210_CLK_CPU	0
 #define K210_CLK_SRAM0	1
 #define K210_CLK_SRAM1	2
-- 
cgit v1.2.3


From 1c73e0c5e54d5f7d77f422a10b03ebe61eaed5ad Mon Sep 17 00:00:00 2001
From: Aleksandr Miloserdov <a.miloserdov@yadro.com>
Date: Tue, 9 Feb 2021 10:22:01 +0300
Subject: scsi: target: core: Add cmd length set before cmd complete

TCM doesn't properly handle underflow case for service actions. One way to
prevent it is to always complete command with
target_complete_cmd_with_length(), however it requires access to data_sg,
which is not always available.

This change introduces target_set_cmd_data_length() function which allows
to set command data length before completing it.

Link: https://lore.kernel.org/r/20210209072202.41154-2-a.miloserdov@yadro.com
Reviewed-by: Roman Bolshakov <r.bolshakov@yadro.com>
Reviewed-by: Bodo Stroesser <bostroesser@gmail.com>
Signed-off-by: Aleksandr Miloserdov <a.miloserdov@yadro.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_transport.c | 15 +++++++++++----
 include/target/target_core_backend.h   |  1 +
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 93ea17cbad79..5ecb9f18a53d 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -879,11 +879,9 @@ void target_complete_cmd(struct se_cmd *cmd, u8 scsi_status)
 }
 EXPORT_SYMBOL(target_complete_cmd);
 
-void target_complete_cmd_with_length(struct se_cmd *cmd, u8 scsi_status, int length)
+void target_set_cmd_data_length(struct se_cmd *cmd, int length)
 {
-	if ((scsi_status == SAM_STAT_GOOD ||
-	     cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL) &&
-	    length < cmd->data_length) {
+	if (length < cmd->data_length) {
 		if (cmd->se_cmd_flags & SCF_UNDERFLOW_BIT) {
 			cmd->residual_count += cmd->data_length - length;
 		} else {
@@ -893,6 +891,15 @@ void target_complete_cmd_with_length(struct se_cmd *cmd, u8 scsi_status, int len
 
 		cmd->data_length = length;
 	}
+}
+EXPORT_SYMBOL(target_set_cmd_data_length);
+
+void target_complete_cmd_with_length(struct se_cmd *cmd, u8 scsi_status, int length)
+{
+	if (scsi_status == SAM_STAT_GOOD ||
+	    cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL) {
+		target_set_cmd_data_length(cmd, length);
+	}
 
 	target_complete_cmd(cmd, scsi_status);
 }
diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h
index 6336780d83a7..ce2fba49c95d 100644
--- a/include/target/target_core_backend.h
+++ b/include/target/target_core_backend.h
@@ -72,6 +72,7 @@ int	transport_backend_register(const struct target_backend_ops *);
 void	target_backend_unregister(const struct target_backend_ops *);
 
 void	target_complete_cmd(struct se_cmd *, u8);
+void	target_set_cmd_data_length(struct se_cmd *, int);
 void	target_complete_cmd_with_length(struct se_cmd *, u8, int);
 
 void	transport_copy_sense_to_cmd(struct se_cmd *, unsigned char *);
-- 
cgit v1.2.3


From faf7f3fdd151a03df68de3cb90bb5c394a6774c2 Mon Sep 17 00:00:00 2001
From: Philip Chen <philipchen@chromium.org>
Date: Mon, 22 Feb 2021 21:01:26 -0800
Subject: dt-bindings: input: Create macros for cros-ec keymap

In Chrome OS, the keyboard matrix can be split to two groups:

The keymap for the top row keys can be customized based on OEM
preference, while the keymap for the other keys is generic/fixed
across boards.

This patch creates marcos for the keymaps of these two groups, making
it easier to reuse the generic portion of keymap when we override the
keymap in the board-specific dts for custom top row design.

Signed-off-by: Philip Chen <philipchen@chromium.org>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Link: https://lore.kernel.org/r/20210115143555.v6.1.Iaa8a60cf2ed4b7ad5e2fbb4ad76a1c600ee36113@changeid
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/dt-bindings/input/cros-ec-keyboard.h | 103 +++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 include/dt-bindings/input/cros-ec-keyboard.h

(limited to 'include')

diff --git a/include/dt-bindings/input/cros-ec-keyboard.h b/include/dt-bindings/input/cros-ec-keyboard.h
new file mode 100644
index 000000000000..a37a8c570121
--- /dev/null
+++ b/include/dt-bindings/input/cros-ec-keyboard.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This header provides the constants of the standard Chrome OS key matrix
+ * for cros-ec keyboard-controller bindings.
+ *
+ * Copyright (c) 2021 Google, Inc
+ */
+
+#ifndef _CROS_EC_KEYBOARD_H
+#define _CROS_EC_KEYBOARD_H
+
+#define CROS_STD_TOP_ROW_KEYMAP	\
+	MATRIX_KEY(0x00, 0x02, KEY_F1)	\
+	MATRIX_KEY(0x03, 0x02, KEY_F2)	\
+	MATRIX_KEY(0x02, 0x02, KEY_F3)	\
+	MATRIX_KEY(0x01, 0x02, KEY_F4)	\
+	MATRIX_KEY(0x03, 0x04, KEY_F5)	\
+	MATRIX_KEY(0x02, 0x04, KEY_F6)	\
+	MATRIX_KEY(0x01, 0x04, KEY_F7)	\
+	MATRIX_KEY(0x02, 0x09, KEY_F8)	\
+	MATRIX_KEY(0x01, 0x09, KEY_F9)	\
+	MATRIX_KEY(0x00, 0x04, KEY_F10)	\
+	MATRIX_KEY(0x03, 0x09, KEY_F13)
+
+#define CROS_STD_MAIN_KEYMAP	\
+	MATRIX_KEY(0x00, 0x01, KEY_LEFTMETA)	\
+	MATRIX_KEY(0x00, 0x03, KEY_B)		\
+	MATRIX_KEY(0x00, 0x05, KEY_RO)		\
+	MATRIX_KEY(0x00, 0x06, KEY_N)		\
+	MATRIX_KEY(0x00, 0x08, KEY_EQUAL)	\
+	MATRIX_KEY(0x00, 0x0a, KEY_RIGHTALT)	\
+	MATRIX_KEY(0x01, 0x01, KEY_ESC)		\
+	MATRIX_KEY(0x01, 0x03, KEY_G)		\
+	MATRIX_KEY(0x01, 0x06, KEY_H)		\
+	MATRIX_KEY(0x01, 0x08, KEY_APOSTROPHE)	\
+	MATRIX_KEY(0x01, 0x0b, KEY_BACKSPACE)	\
+	MATRIX_KEY(0x01, 0x0c, KEY_HENKAN)	\
+						\
+	MATRIX_KEY(0x02, 0x00, KEY_LEFTCTRL)	\
+	MATRIX_KEY(0x02, 0x01, KEY_TAB)		\
+	MATRIX_KEY(0x02, 0x03, KEY_T)		\
+	MATRIX_KEY(0x02, 0x05, KEY_RIGHTBRACE)	\
+	MATRIX_KEY(0x02, 0x06, KEY_Y)		\
+	MATRIX_KEY(0x02, 0x07, KEY_102ND)	\
+	MATRIX_KEY(0x02, 0x08, KEY_LEFTBRACE)	\
+	MATRIX_KEY(0x02, 0x0a, KEY_YEN)		\
+						\
+	MATRIX_KEY(0x03, 0x00, KEY_LEFTMETA)	\
+	MATRIX_KEY(0x03, 0x01, KEY_GRAVE)	\
+	MATRIX_KEY(0x03, 0x03, KEY_5)		\
+	MATRIX_KEY(0x03, 0x06, KEY_6)		\
+	MATRIX_KEY(0x03, 0x08, KEY_MINUS)	\
+	MATRIX_KEY(0x03, 0x0b, KEY_BACKSLASH)	\
+	MATRIX_KEY(0x03, 0x0c, KEY_MUHENKAN)	\
+						\
+	MATRIX_KEY(0x04, 0x00, KEY_RIGHTCTRL)	\
+	MATRIX_KEY(0x04, 0x01, KEY_A)		\
+	MATRIX_KEY(0x04, 0x02, KEY_D)		\
+	MATRIX_KEY(0x04, 0x03, KEY_F)		\
+	MATRIX_KEY(0x04, 0x04, KEY_S)		\
+	MATRIX_KEY(0x04, 0x05, KEY_K)		\
+	MATRIX_KEY(0x04, 0x06, KEY_J)		\
+	MATRIX_KEY(0x04, 0x08, KEY_SEMICOLON)	\
+	MATRIX_KEY(0x04, 0x09, KEY_L)		\
+	MATRIX_KEY(0x04, 0x0a, KEY_BACKSLASH)	\
+	MATRIX_KEY(0x04, 0x0b, KEY_ENTER)	\
+						\
+	MATRIX_KEY(0x05, 0x01, KEY_Z)		\
+	MATRIX_KEY(0x05, 0x02, KEY_C)		\
+	MATRIX_KEY(0x05, 0x03, KEY_V)		\
+	MATRIX_KEY(0x05, 0x04, KEY_X)		\
+	MATRIX_KEY(0x05, 0x05, KEY_COMMA)	\
+	MATRIX_KEY(0x05, 0x06, KEY_M)		\
+	MATRIX_KEY(0x05, 0x07, KEY_LEFTSHIFT)	\
+	MATRIX_KEY(0x05, 0x08, KEY_SLASH)	\
+	MATRIX_KEY(0x05, 0x09, KEY_DOT)		\
+	MATRIX_KEY(0x05, 0x0b, KEY_SPACE)	\
+						\
+	MATRIX_KEY(0x06, 0x01, KEY_1)		\
+	MATRIX_KEY(0x06, 0x02, KEY_3)		\
+	MATRIX_KEY(0x06, 0x03, KEY_4)		\
+	MATRIX_KEY(0x06, 0x04, KEY_2)		\
+	MATRIX_KEY(0x06, 0x05, KEY_8)		\
+	MATRIX_KEY(0x06, 0x06, KEY_7)		\
+	MATRIX_KEY(0x06, 0x08, KEY_0)		\
+	MATRIX_KEY(0x06, 0x09, KEY_9)		\
+	MATRIX_KEY(0x06, 0x0a, KEY_LEFTALT)	\
+	MATRIX_KEY(0x06, 0x0b, KEY_DOWN)	\
+	MATRIX_KEY(0x06, 0x0c, KEY_RIGHT)	\
+						\
+	MATRIX_KEY(0x07, 0x01, KEY_Q)		\
+	MATRIX_KEY(0x07, 0x02, KEY_E)		\
+	MATRIX_KEY(0x07, 0x03, KEY_R)		\
+	MATRIX_KEY(0x07, 0x04, KEY_W)		\
+	MATRIX_KEY(0x07, 0x05, KEY_I)		\
+	MATRIX_KEY(0x07, 0x06, KEY_U)		\
+	MATRIX_KEY(0x07, 0x07, KEY_RIGHTSHIFT)	\
+	MATRIX_KEY(0x07, 0x08, KEY_P)		\
+	MATRIX_KEY(0x07, 0x09, KEY_O)		\
+	MATRIX_KEY(0x07, 0x0b, KEY_UP)		\
+	MATRIX_KEY(0x07, 0x0c, KEY_LEFT)
+
+#endif /* _CROS_EC_KEYBOARD_H */
-- 
cgit v1.2.3


From 3d283f0b076442354f301461bece737d3c109a1b Mon Sep 17 00:00:00 2001
From: Philip Chen <philipchen@chromium.org>
Date: Mon, 22 Feb 2021 21:11:07 -0800
Subject: dt-bindings: input: Fix the keymap for LOCK key

Decouple LOCK from F13 and directly map the LOCK key (KSI3/KSO9) to
KEY_SLEEP action key code.

Signed-off-by: Philip Chen <philipchen@chromium.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20210115143555.v6.3.I96134907488f41f358d03f3c1b08194f9547e670@changeid
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/dt-bindings/input/cros-ec-keyboard.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/input/cros-ec-keyboard.h b/include/dt-bindings/input/cros-ec-keyboard.h
index a37a8c570121..f0ae03634a96 100644
--- a/include/dt-bindings/input/cros-ec-keyboard.h
+++ b/include/dt-bindings/input/cros-ec-keyboard.h
@@ -19,8 +19,7 @@
 	MATRIX_KEY(0x01, 0x04, KEY_F7)	\
 	MATRIX_KEY(0x02, 0x09, KEY_F8)	\
 	MATRIX_KEY(0x01, 0x09, KEY_F9)	\
-	MATRIX_KEY(0x00, 0x04, KEY_F10)	\
-	MATRIX_KEY(0x03, 0x09, KEY_F13)
+	MATRIX_KEY(0x00, 0x04, KEY_F10)
 
 #define CROS_STD_MAIN_KEYMAP	\
 	MATRIX_KEY(0x00, 0x01, KEY_LEFTMETA)	\
@@ -50,6 +49,7 @@
 	MATRIX_KEY(0x03, 0x03, KEY_5)		\
 	MATRIX_KEY(0x03, 0x06, KEY_6)		\
 	MATRIX_KEY(0x03, 0x08, KEY_MINUS)	\
+	MATRIX_KEY(0x03, 0x09, KEY_SLEEP)	\
 	MATRIX_KEY(0x03, 0x0b, KEY_BACKSLASH)	\
 	MATRIX_KEY(0x03, 0x0c, KEY_MUHENKAN)	\
 						\
-- 
cgit v1.2.3


From 49387f628840eac1e7e1113f4f2c150cdecf88c7 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Tue, 23 Feb 2021 11:36:21 +0000
Subject: vmlinux.lds.h: catch even more instrumentation symbols into .data

LKP caught another bunch of orphaned instrumentation symbols [0]:

mipsel-linux-ld: warning: orphan section `.data.$LPBX1' from
`init/main.o' being placed in section `.data.$LPBX1'
mipsel-linux-ld: warning: orphan section `.data.$LPBX0' from
`init/main.o' being placed in section `.data.$LPBX0'
mipsel-linux-ld: warning: orphan section `.data.$LPBX1' from
`init/do_mounts.o' being placed in section `.data.$LPBX1'
mipsel-linux-ld: warning: orphan section `.data.$LPBX0' from
`init/do_mounts.o' being placed in section `.data.$LPBX0'
mipsel-linux-ld: warning: orphan section `.data.$LPBX1' from
`init/do_mounts_initrd.o' being placed in section `.data.$LPBX1'
mipsel-linux-ld: warning: orphan section `.data.$LPBX0' from
`init/do_mounts_initrd.o' being placed in section `.data.$LPBX0'
mipsel-linux-ld: warning: orphan section `.data.$LPBX1' from
`init/initramfs.o' being placed in section `.data.$LPBX1'
mipsel-linux-ld: warning: orphan section `.data.$LPBX0' from
`init/initramfs.o' being placed in section `.data.$LPBX0'
mipsel-linux-ld: warning: orphan section `.data.$LPBX1' from
`init/calibrate.o' being placed in section `.data.$LPBX1'
mipsel-linux-ld: warning: orphan section `.data.$LPBX0' from
`init/calibrate.o' being placed in section `.data.$LPBX0'

[...]

Soften the wildcard to .data.$L* to grab these ones into .data too.

[0] https://lore.kernel.org/lkml/202102231519.lWPLPveV-lkp@intel.com

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 include/asm-generic/vmlinux.lds.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 83537e5ee78f..ecfb54e9fda5 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -95,7 +95,7 @@
  */
 #ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
 #define TEXT_MAIN .text .text.[0-9a-zA-Z_]*
-#define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data..L* .data..compoundliteral* .data.$__unnamed_* .data.$Lubsan_*
+#define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data..L* .data..compoundliteral* .data.$__unnamed_* .data.$L*
 #define SDATA_MAIN .sdata .sdata.[0-9a-zA-Z_]*
 #define RODATA_MAIN .rodata .rodata.[0-9a-zA-Z_]* .rodata..L*
 #define BSS_MAIN .bss .bss.[0-9a-zA-Z_]* .bss..compoundliteral*
-- 
cgit v1.2.3


From fd70a406a344e084ac680c3f14e71d37d6023883 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 5 Jan 2021 12:31:59 +0200
Subject: vdpa: Extend routine to accept vdpa device name

In a subsequent patch, when user initiated command creates a vdpa device,
the user chooses the name of the vdpa device.
To support it, extend the device allocation API to consider this name
specified by the caller driver.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210105103203.82508-3-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/ifcvf/ifcvf_main.c   |  2 +-
 drivers/vdpa/mlx5/net/mlx5_vnet.c |  2 +-
 drivers/vdpa/vdpa.c               | 36 ++++++++++++++++++++++++++++++++----
 drivers/vdpa/vdpa_sim/vdpa_sim.c  |  2 +-
 include/linux/vdpa.h              |  7 +++----
 5 files changed, 38 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index fa1af301cf55..7c8bbfcf6c3e 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -432,7 +432,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
 				    dev, &ifc_vdpa_ops,
-				    IFCVF_MAX_QUEUE_PAIRS * 2);
+				    IFCVF_MAX_QUEUE_PAIRS * 2, NULL);
 	if (adapter == NULL) {
 		IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
 		return -ENOMEM;
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b5fe6d2ad22f..dc88559a8d49 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1982,7 +1982,7 @@ static int mlx5v_probe(struct auxiliary_device *adev,
 	max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
 
 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
-				 2 * mlx5_vdpa_max_qps(max_vqs));
+				 2 * mlx5_vdpa_max_qps(max_vqs), NULL);
 	if (IS_ERR(ndev))
 		return PTR_ERR(ndev);
 
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index c0825650c055..7414bbd9057c 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -12,6 +12,8 @@
 #include <linux/slab.h>
 #include <linux/vdpa.h>
 
+/* A global mutex that protects vdpa management device and device level operations. */
+static DEFINE_MUTEX(vdpa_dev_mutex);
 static DEFINE_IDA(vdpa_index_ida);
 
 static int vdpa_dev_probe(struct device *d)
@@ -63,6 +65,7 @@ static void vdpa_release_dev(struct device *d)
  * @config: the bus operations that is supported by this device
  * @nvqs: number of virtqueues supported by this device
  * @size: size of the parent structure that contains private data
+ * @name: name of the vdpa device; optional.
  *
  * Driver should use vdpa_alloc_device() wrapper macro instead of
  * using this directly.
@@ -72,8 +75,7 @@ static void vdpa_release_dev(struct device *d)
  */
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 					const struct vdpa_config_ops *config,
-					int nvqs,
-					size_t size)
+					int nvqs, size_t size, const char *name)
 {
 	struct vdpa_device *vdev;
 	int err = -EINVAL;
@@ -101,7 +103,10 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 	vdev->features_valid = false;
 	vdev->nvqs = nvqs;
 
-	err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index);
+	if (name)
+		err = dev_set_name(&vdev->dev, "%s", name);
+	else
+		err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index);
 	if (err)
 		goto err_name;
 
@@ -118,6 +123,13 @@ err:
 }
 EXPORT_SYMBOL_GPL(__vdpa_alloc_device);
 
+static int vdpa_name_match(struct device *dev, const void *data)
+{
+	struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev);
+
+	return (strcmp(dev_name(&vdev->dev), data) == 0);
+}
+
 /**
  * vdpa_register_device - register a vDPA device
  * Callers must have a succeed call of vdpa_alloc_device() before.
@@ -127,7 +139,21 @@ EXPORT_SYMBOL_GPL(__vdpa_alloc_device);
  */
 int vdpa_register_device(struct vdpa_device *vdev)
 {
-	return device_add(&vdev->dev);
+	struct device *dev;
+	int err;
+
+	mutex_lock(&vdpa_dev_mutex);
+	dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match);
+	if (dev) {
+		put_device(dev);
+		err = -EEXIST;
+		goto name_err;
+	}
+
+	err = device_add(&vdev->dev);
+name_err:
+	mutex_unlock(&vdpa_dev_mutex);
+	return err;
 }
 EXPORT_SYMBOL_GPL(vdpa_register_device);
 
@@ -137,7 +163,9 @@ EXPORT_SYMBOL_GPL(vdpa_register_device);
  */
 void vdpa_unregister_device(struct vdpa_device *vdev)
 {
+	mutex_lock(&vdpa_dev_mutex);
 	device_unregister(&vdev->dev);
+	mutex_unlock(&vdpa_dev_mutex);
 }
 EXPORT_SYMBOL_GPL(vdpa_unregister_device);
 
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index b3fcc67bfdf0..db1636a99ba4 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -235,7 +235,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
 		ops = &vdpasim_config_ops;
 
 	vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops,
-				    dev_attr->nvqs);
+				    dev_attr->nvqs, NULL);
 	if (!vdpasim)
 		goto err_alloc;
 
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 0fefeb976877..5700baa22356 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -245,15 +245,14 @@ struct vdpa_config_ops {
 
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 					const struct vdpa_config_ops *config,
-					int nvqs,
-					size_t size);
+					int nvqs, size_t size, const char *name);
 
-#define vdpa_alloc_device(dev_struct, member, parent, config, nvqs)   \
+#define vdpa_alloc_device(dev_struct, member, parent, config, nvqs, name)   \
 			  container_of(__vdpa_alloc_device( \
 				       parent, config, nvqs, \
 				       sizeof(dev_struct) + \
 				       BUILD_BUG_ON_ZERO(offsetof( \
-				       dev_struct, member))), \
+				       dev_struct, member)), name), \
 				       dev_struct, member)
 
 int vdpa_register_device(struct vdpa_device *vdev);
-- 
cgit v1.2.3


From 33b347503f014ebf76257327cbc7001c6b721956 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 5 Jan 2021 12:32:00 +0200
Subject: vdpa: Define vdpa mgmt device, ops and a netlink interface

To add one or more VDPA devices, define a management device which
allows adding or removing vdpa device. A management device defines
set of callbacks to manage vdpa devices.

To begin with, it defines add and remove callbacks through which a user
defined vdpa device can be added or removed.

A unique management device is identified by its unique handle identified
by management device name and optionally the bus name.

Hence, introduce routine through which driver can register a
management device and its callback operations for adding and remove
a vdpa device.

Introduce vdpa netlink socket family so that user can query management
device and its attributes.

Example of show vdpa management device which allows creating vdpa device of
networking class (device id = 0x1) of virtio specification 1.1
section 5.1.1.

$ vdpa mgmtdev show
vdpasim_net:
  supported_classes:
    net

Example of showing vdpa management device in JSON format.

$ vdpa mgmtdev show -jp
{
    "show": {
        "vdpasim_net": {
            "supported_classes": [ "net" ]
        }
    }
}

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210105103203.82508-4-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

Including a bugfix:

vpda: correctly size vdpa_nl_policy

We need to ensure last entry of vdpa_nl_policy[]
is zero, otherwise out-of-bounds access is hurting us.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Eli Cohen <elic@nvidia.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/r/20210210134911.4119555-1-eric.dumazet@gmail.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/Kconfig      |   1 +
 drivers/vdpa/vdpa.c       | 213 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/vdpa.h      |  31 +++++++
 include/uapi/linux/vdpa.h |  31 +++++++
 4 files changed, 275 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/vdpa.h

(limited to 'include')

diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index 92a6396f8a73..ffd1e098bfd2 100644
--- a/drivers/vdpa/Kconfig
+++ b/drivers/vdpa/Kconfig
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 menuconfig VDPA
 	tristate "vDPA drivers"
+	depends on NET
 	help
 	  Enable this module to support vDPA device that uses a
 	  datapath which complies with virtio specifications with
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 7414bbd9057c..586e66b7f671 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -11,11 +11,17 @@
 #include <linux/idr.h>
 #include <linux/slab.h>
 #include <linux/vdpa.h>
+#include <uapi/linux/vdpa.h>
+#include <net/genetlink.h>
+#include <linux/mod_devicetable.h>
 
+static LIST_HEAD(mdev_head);
 /* A global mutex that protects vdpa management device and device level operations. */
 static DEFINE_MUTEX(vdpa_dev_mutex);
 static DEFINE_IDA(vdpa_index_ida);
 
+static struct genl_family vdpa_nl_family;
+
 static int vdpa_dev_probe(struct device *d)
 {
 	struct vdpa_device *vdev = dev_to_vdpa(d);
@@ -195,13 +201,218 @@ void vdpa_unregister_driver(struct vdpa_driver *drv)
 }
 EXPORT_SYMBOL_GPL(vdpa_unregister_driver);
 
+/**
+ * vdpa_mgmtdev_register - register a vdpa management device
+ *
+ * @mdev: Pointer to vdpa management device
+ * vdpa_mgmtdev_register() register a vdpa management device which supports
+ * vdpa device management.
+ */
+int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev)
+{
+	if (!mdev->device || !mdev->ops || !mdev->ops->dev_add || !mdev->ops->dev_del)
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&mdev->list);
+	mutex_lock(&vdpa_dev_mutex);
+	list_add_tail(&mdev->list, &mdev_head);
+	mutex_unlock(&vdpa_dev_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register);
+
+void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev)
+{
+	mutex_lock(&vdpa_dev_mutex);
+	list_del(&mdev->list);
+	mutex_unlock(&vdpa_dev_mutex);
+}
+EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister);
+
+static bool mgmtdev_handle_match(const struct vdpa_mgmt_dev *mdev,
+				 const char *busname, const char *devname)
+{
+	/* Bus name is optional for simulated management device, so ignore the
+	 * device with bus if bus attribute is provided.
+	 */
+	if ((busname && !mdev->device->bus) || (!busname && mdev->device->bus))
+		return false;
+
+	if (!busname && strcmp(dev_name(mdev->device), devname) == 0)
+		return true;
+
+	if (busname && (strcmp(mdev->device->bus->name, busname) == 0) &&
+	    (strcmp(dev_name(mdev->device), devname) == 0))
+		return true;
+
+	return false;
+}
+
+static struct vdpa_mgmt_dev *vdpa_mgmtdev_get_from_attr(struct nlattr **attrs)
+{
+	struct vdpa_mgmt_dev *mdev;
+	const char *busname = NULL;
+	const char *devname;
+
+	if (!attrs[VDPA_ATTR_MGMTDEV_DEV_NAME])
+		return ERR_PTR(-EINVAL);
+	devname = nla_data(attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]);
+	if (attrs[VDPA_ATTR_MGMTDEV_BUS_NAME])
+		busname = nla_data(attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]);
+
+	list_for_each_entry(mdev, &mdev_head, list) {
+		if (mgmtdev_handle_match(mdev, busname, devname))
+			return mdev;
+	}
+	return ERR_PTR(-ENODEV);
+}
+
+static int vdpa_nl_mgmtdev_handle_fill(struct sk_buff *msg, const struct vdpa_mgmt_dev *mdev)
+{
+	if (mdev->device->bus &&
+	    nla_put_string(msg, VDPA_ATTR_MGMTDEV_BUS_NAME, mdev->device->bus->name))
+		return -EMSGSIZE;
+	if (nla_put_string(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, dev_name(mdev->device)))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *msg,
+			     u32 portid, u32 seq, int flags)
+{
+	u64 supported_classes = 0;
+	void *hdr;
+	int i = 0;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_MGMTDEV_NEW);
+	if (!hdr)
+		return -EMSGSIZE;
+	err = vdpa_nl_mgmtdev_handle_fill(msg, mdev);
+	if (err)
+		goto msg_err;
+
+	while (mdev->id_table[i].device) {
+		supported_classes |= BIT(mdev->id_table[i].device);
+		i++;
+	}
+
+	if (nla_put_u64_64bit(msg, VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES,
+			      supported_classes, VDPA_ATTR_UNSPEC)) {
+		err = -EMSGSIZE;
+		goto msg_err;
+	}
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+msg_err:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+static int vdpa_nl_cmd_mgmtdev_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_mgmt_dev *mdev;
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	mutex_lock(&vdpa_dev_mutex);
+	mdev = vdpa_mgmtdev_get_from_attr(info->attrs);
+	if (IS_ERR(mdev)) {
+		mutex_unlock(&vdpa_dev_mutex);
+		NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified mgmt device");
+		err = PTR_ERR(mdev);
+		goto out;
+	}
+
+	err = vdpa_mgmtdev_fill(mdev, msg, info->snd_portid, info->snd_seq, 0);
+	mutex_unlock(&vdpa_dev_mutex);
+	if (err)
+		goto out;
+	err = genlmsg_reply(msg, info);
+	return err;
+
+out:
+	nlmsg_free(msg);
+	return err;
+}
+
+static int
+vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct vdpa_mgmt_dev *mdev;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&vdpa_dev_mutex);
+	list_for_each_entry(mdev, &mdev_head, list) {
+		if (idx < start) {
+			idx++;
+			continue;
+		}
+		err = vdpa_mgmtdev_fill(mdev, msg, NETLINK_CB(cb->skb).portid,
+					cb->nlh->nlmsg_seq, NLM_F_MULTI);
+		if (err)
+			goto out;
+		idx++;
+	}
+out:
+	mutex_unlock(&vdpa_dev_mutex);
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
+	[VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING },
+	[VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING },
+};
+
+static const struct genl_ops vdpa_nl_ops[] = {
+	{
+		.cmd = VDPA_CMD_MGMTDEV_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = vdpa_nl_cmd_mgmtdev_get_doit,
+		.dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit,
+	},
+};
+
+static struct genl_family vdpa_nl_family __ro_after_init = {
+	.name = VDPA_GENL_NAME,
+	.version = VDPA_GENL_VERSION,
+	.maxattr = VDPA_ATTR_MAX,
+	.policy = vdpa_nl_policy,
+	.netnsok = false,
+	.module = THIS_MODULE,
+	.ops = vdpa_nl_ops,
+	.n_ops = ARRAY_SIZE(vdpa_nl_ops),
+};
+
 static int vdpa_init(void)
 {
-	return bus_register(&vdpa_bus);
+	int err;
+
+	err = bus_register(&vdpa_bus);
+	if (err)
+		return err;
+	err = genl_register_family(&vdpa_nl_family);
+	if (err)
+		goto err;
+	return 0;
+
+err:
+	bus_unregister(&vdpa_bus);
+	return err;
 }
 
 static void __exit vdpa_exit(void)
 {
+	genl_unregister_family(&vdpa_nl_family);
 	bus_unregister(&vdpa_bus);
 	ida_destroy(&vdpa_index_ida);
 }
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 5700baa22356..6b8b4222bca6 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -35,6 +35,8 @@ struct vdpa_vq_state {
 	u16	avail_index;
 };
 
+struct vdpa_mgmt_dev;
+
 /**
  * vDPA device - representation of a vDPA device
  * @dev: underlying device
@@ -335,4 +337,33 @@ static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset,
 	ops->get_config(vdev, offset, buf, len);
 }
 
+/**
+ * vdpa_mgmtdev_ops - vdpa device ops
+ * @dev_add:	Add a vdpa device using alloc and register
+ *		@mdev: parent device to use for device addition
+ *		@name: name of the new vdpa device
+ *		Driver need to add a new device using _vdpa_register_device()
+ *		after fully initializing the vdpa device. Driver must return 0
+ *		on success or appropriate error code.
+ * @dev_del:	Remove a vdpa device using unregister
+ *		@mdev: parent device to use for device removal
+ *		@dev: vdpa device to remove
+ *		Driver need to remove the specified device by calling
+ *		_vdpa_unregister_device().
+ */
+struct vdpa_mgmtdev_ops {
+	int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name);
+	void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev);
+};
+
+struct vdpa_mgmt_dev {
+	struct device *device;
+	const struct vdpa_mgmtdev_ops *ops;
+	const struct virtio_device_id *id_table; /* supported ids */
+	struct list_head list;
+};
+
+int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev);
+void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev);
+
 #endif /* _LINUX_VDPA_H */
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
new file mode 100644
index 000000000000..d44d82e567b1
--- /dev/null
+++ b/include/uapi/linux/vdpa.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * vdpa device management interface
+ * Copyright (c) 2020 Mellanox Technologies Ltd. All rights reserved.
+ */
+
+#ifndef _UAPI_LINUX_VDPA_H_
+#define _UAPI_LINUX_VDPA_H_
+
+#define VDPA_GENL_NAME "vdpa"
+#define VDPA_GENL_VERSION 0x1
+
+enum vdpa_command {
+	VDPA_CMD_UNSPEC,
+	VDPA_CMD_MGMTDEV_NEW,
+	VDPA_CMD_MGMTDEV_GET,		/* can dump */
+};
+
+enum vdpa_attr {
+	VDPA_ATTR_UNSPEC,
+
+	/* bus name (optional) + dev name together make the parent device handle */
+	VDPA_ATTR_MGMTDEV_BUS_NAME,		/* string */
+	VDPA_ATTR_MGMTDEV_DEV_NAME,		/* string */
+	VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES,	/* u64 */
+
+	/* new attributes must be added above here */
+	VDPA_ATTR_MAX,
+};
+
+#endif
-- 
cgit v1.2.3


From 903f7bcaedb84ca47998e609015a34ddde93742e Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 5 Jan 2021 12:32:01 +0200
Subject: vdpa: Enable a user to add and delete a vdpa device

Add the ability to add and delete a vdpa device.

Examples:
Create a vdpa device of type network named "foo2" from
the management device vdpasim:

$ vdpa dev add mgmtdev vdpasim_net name foo2

Delete the vdpa device after its use:
$ vdpa dev del foo2

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210105103203.82508-5-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/vdpa.c       | 143 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/vdpa.h      |   6 ++
 include/uapi/linux/vdpa.h |   4 ++
 3 files changed, 143 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 586e66b7f671..f1228189814f 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -136,6 +136,37 @@ static int vdpa_name_match(struct device *dev, const void *data)
 	return (strcmp(dev_name(&vdev->dev), data) == 0);
 }
 
+static int __vdpa_register_device(struct vdpa_device *vdev)
+{
+	struct device *dev;
+
+	lockdep_assert_held(&vdpa_dev_mutex);
+	dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match);
+	if (dev) {
+		put_device(dev);
+		return -EEXIST;
+	}
+	return device_add(&vdev->dev);
+}
+
+/**
+ * _vdpa_register_device - register a vDPA device with vdpa lock held
+ * Caller must have a succeed call of vdpa_alloc_device() before.
+ * Caller must invoke this routine in the management device dev_add()
+ * callback after setting up valid mgmtdev for this vdpa device.
+ * @vdev: the vdpa device to be registered to vDPA bus
+ *
+ * Returns an error when fail to add device to vDPA bus
+ */
+int _vdpa_register_device(struct vdpa_device *vdev)
+{
+	if (!vdev->mdev)
+		return -EINVAL;
+
+	return __vdpa_register_device(vdev);
+}
+EXPORT_SYMBOL_GPL(_vdpa_register_device);
+
 /**
  * vdpa_register_device - register a vDPA device
  * Callers must have a succeed call of vdpa_alloc_device() before.
@@ -145,24 +176,29 @@ static int vdpa_name_match(struct device *dev, const void *data)
  */
 int vdpa_register_device(struct vdpa_device *vdev)
 {
-	struct device *dev;
 	int err;
 
 	mutex_lock(&vdpa_dev_mutex);
-	dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match);
-	if (dev) {
-		put_device(dev);
-		err = -EEXIST;
-		goto name_err;
-	}
-
-	err = device_add(&vdev->dev);
-name_err:
+	err = __vdpa_register_device(vdev);
 	mutex_unlock(&vdpa_dev_mutex);
 	return err;
 }
 EXPORT_SYMBOL_GPL(vdpa_register_device);
 
+/**
+ * _vdpa_unregister_device - unregister a vDPA device
+ * Caller must invoke this routine as part of management device dev_del()
+ * callback.
+ * @vdev: the vdpa device to be unregisted from vDPA bus
+ */
+void _vdpa_unregister_device(struct vdpa_device *vdev)
+{
+	lockdep_assert_held(&vdpa_dev_mutex);
+	WARN_ON(!vdev->mdev);
+	device_unregister(&vdev->dev);
+}
+EXPORT_SYMBOL_GPL(_vdpa_unregister_device);
+
 /**
  * vdpa_unregister_device - unregister a vDPA device
  * @vdev: the vdpa device to be unregisted from vDPA bus
@@ -221,10 +257,25 @@ int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev)
 }
 EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register);
 
+static int vdpa_match_remove(struct device *dev, void *data)
+{
+	struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev);
+	struct vdpa_mgmt_dev *mdev = vdev->mdev;
+
+	if (mdev == data)
+		mdev->ops->dev_del(mdev, vdev);
+	return 0;
+}
+
 void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev)
 {
 	mutex_lock(&vdpa_dev_mutex);
+
 	list_del(&mdev->list);
+
+	/* Filter out all the entries belong to this management device and delete it. */
+	bus_for_each_dev(&vdpa_bus, NULL, mdev, vdpa_match_remove);
+
 	mutex_unlock(&vdpa_dev_mutex);
 }
 EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister);
@@ -368,9 +419,69 @@ out:
 	return msg->len;
 }
 
+static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_mgmt_dev *mdev;
+	const char *name;
+	int err = 0;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME])
+		return -EINVAL;
+
+	name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+
+	mutex_lock(&vdpa_dev_mutex);
+	mdev = vdpa_mgmtdev_get_from_attr(info->attrs);
+	if (IS_ERR(mdev)) {
+		NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified management device");
+		err = PTR_ERR(mdev);
+		goto err;
+	}
+
+	err = mdev->ops->dev_add(mdev, name);
+err:
+	mutex_unlock(&vdpa_dev_mutex);
+	return err;
+}
+
+static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_mgmt_dev *mdev;
+	struct vdpa_device *vdev;
+	struct device *dev;
+	const char *name;
+	int err = 0;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME])
+		return -EINVAL;
+	name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+
+	mutex_lock(&vdpa_dev_mutex);
+	dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match);
+	if (!dev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "device not found");
+		err = -ENODEV;
+		goto dev_err;
+	}
+	vdev = container_of(dev, struct vdpa_device, dev);
+	if (!vdev->mdev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "Only user created device can be deleted by user");
+		err = -EINVAL;
+		goto mdev_err;
+	}
+	mdev = vdev->mdev;
+	mdev->ops->dev_del(mdev, vdev);
+mdev_err:
+	put_device(dev);
+dev_err:
+	mutex_unlock(&vdpa_dev_mutex);
+	return err;
+}
+
 static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
 	[VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING },
+	[VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING },
 };
 
 static const struct genl_ops vdpa_nl_ops[] = {
@@ -380,6 +491,18 @@ static const struct genl_ops vdpa_nl_ops[] = {
 		.doit = vdpa_nl_cmd_mgmtdev_get_doit,
 		.dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit,
 	},
+	{
+		.cmd = VDPA_CMD_DEV_NEW,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = vdpa_nl_cmd_dev_add_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = VDPA_CMD_DEV_DEL,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = vdpa_nl_cmd_dev_del_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
 };
 
 static struct genl_family vdpa_nl_family __ro_after_init = {
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 6b8b4222bca6..4ab5494503a8 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -45,6 +45,8 @@ struct vdpa_mgmt_dev;
  * @index: device index
  * @features_valid: were features initialized? for legacy guests
  * @nvqs: maximum number of supported virtqueues
+ * @mdev: management device pointer; caller must setup when registering device as part
+ *	  of dev_add() mgmtdev ops callback before invoking _vdpa_register_device().
  */
 struct vdpa_device {
 	struct device dev;
@@ -53,6 +55,7 @@ struct vdpa_device {
 	unsigned int index;
 	bool features_valid;
 	int nvqs;
+	struct vdpa_mgmt_dev *mdev;
 };
 
 /**
@@ -260,6 +263,9 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 int vdpa_register_device(struct vdpa_device *vdev);
 void vdpa_unregister_device(struct vdpa_device *vdev);
 
+int _vdpa_register_device(struct vdpa_device *vdev);
+void _vdpa_unregister_device(struct vdpa_device *vdev);
+
 /**
  * vdpa_driver - operations for a vDPA driver
  * @driver: underlying device driver
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
index d44d82e567b1..bb4a1f00eb1c 100644
--- a/include/uapi/linux/vdpa.h
+++ b/include/uapi/linux/vdpa.h
@@ -14,6 +14,8 @@ enum vdpa_command {
 	VDPA_CMD_UNSPEC,
 	VDPA_CMD_MGMTDEV_NEW,
 	VDPA_CMD_MGMTDEV_GET,		/* can dump */
+	VDPA_CMD_DEV_NEW,
+	VDPA_CMD_DEV_DEL,
 };
 
 enum vdpa_attr {
@@ -24,6 +26,8 @@ enum vdpa_attr {
 	VDPA_ATTR_MGMTDEV_DEV_NAME,		/* string */
 	VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES,	/* u64 */
 
+	VDPA_ATTR_DEV_NAME,			/* string */
+
 	/* new attributes must be added above here */
 	VDPA_ATTR_MAX,
 };
-- 
cgit v1.2.3


From bc0d90ee021f1baecd6aaa010d787eb373aa74dd Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 5 Jan 2021 12:32:02 +0200
Subject: vdpa: Enable user to query vdpa device info

Enable user to query vdpa device information.

$ vdpa dev add mgmtdev vdpasim_net name foo2

Show the newly created vdpa device by its name:
$ vdpa dev show foo2
foo2: type network mgmtdev vdpasim_net vendor_id 0 max_vqs 2 max_vq_size 256

$ vdpa dev show foo2 -jp
{
    "dev": {
        "foo2": {
            "type": "network",
            "mgmtdev": "vdpasim_net",
            "vendor_id": 0,
            "max_vqs": 2,
            "max_vq_size": 256
        }
    }
}

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210105103203.82508-6-parav@nvidia.com

Including a memory leak fix:

Link: https://lore.kernel.org/r/20210217060614.59561-1-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/vdpa.c       | 131 ++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vdpa.h |   5 ++
 2 files changed, 136 insertions(+)

(limited to 'include')

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index f1228189814f..da67f07e24fd 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -478,6 +478,131 @@ dev_err:
 	return err;
 }
 
+static int
+vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq,
+	      int flags, struct netlink_ext_ack *extack)
+{
+	u16 max_vq_size;
+	u32 device_id;
+	u32 vendor_id;
+	void *hdr;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_NEW);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	err = vdpa_nl_mgmtdev_handle_fill(msg, vdev->mdev);
+	if (err)
+		goto msg_err;
+
+	device_id = vdev->config->get_device_id(vdev);
+	vendor_id = vdev->config->get_vendor_id(vdev);
+	max_vq_size = vdev->config->get_vq_num_max(vdev);
+
+	err = -EMSGSIZE;
+	if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev)))
+		goto msg_err;
+	if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id))
+		goto msg_err;
+	if (nla_put_u32(msg, VDPA_ATTR_DEV_VENDOR_ID, vendor_id))
+		goto msg_err;
+	if (nla_put_u32(msg, VDPA_ATTR_DEV_MAX_VQS, vdev->nvqs))
+		goto msg_err;
+	if (nla_put_u16(msg, VDPA_ATTR_DEV_MAX_VQ_SIZE, max_vq_size))
+		goto msg_err;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+msg_err:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_device *vdev;
+	struct sk_buff *msg;
+	const char *devname;
+	struct device *dev;
+	int err;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME])
+		return -EINVAL;
+	devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	mutex_lock(&vdpa_dev_mutex);
+	dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match);
+	if (!dev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "device not found");
+		err = -ENODEV;
+		goto err;
+	}
+	vdev = container_of(dev, struct vdpa_device, dev);
+	if (!vdev->mdev) {
+		err = -EINVAL;
+		goto mdev_err;
+	}
+	err = vdpa_dev_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack);
+	if (!err)
+		err = genlmsg_reply(msg, info);
+mdev_err:
+	put_device(dev);
+err:
+	mutex_unlock(&vdpa_dev_mutex);
+	if (err)
+		nlmsg_free(msg);
+	return err;
+}
+
+struct vdpa_dev_dump_info {
+	struct sk_buff *msg;
+	struct netlink_callback *cb;
+	int start_idx;
+	int idx;
+};
+
+static int vdpa_dev_dump(struct device *dev, void *data)
+{
+	struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev);
+	struct vdpa_dev_dump_info *info = data;
+	int err;
+
+	if (!vdev->mdev)
+		return 0;
+	if (info->idx < info->start_idx) {
+		info->idx++;
+		return 0;
+	}
+	err = vdpa_dev_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid,
+			    info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack);
+	if (err)
+		return err;
+
+	info->idx++;
+	return 0;
+}
+
+static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct vdpa_dev_dump_info info;
+
+	info.msg = msg;
+	info.cb = cb;
+	info.start_idx = cb->args[0];
+	info.idx = 0;
+
+	mutex_lock(&vdpa_dev_mutex);
+	bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_dump);
+	mutex_unlock(&vdpa_dev_mutex);
+	cb->args[0] = info.idx;
+	return msg->len;
+}
+
 static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
 	[VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING },
@@ -503,6 +628,12 @@ static const struct genl_ops vdpa_nl_ops[] = {
 		.doit = vdpa_nl_cmd_dev_del_set_doit,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = VDPA_CMD_DEV_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = vdpa_nl_cmd_dev_get_doit,
+		.dumpit = vdpa_nl_cmd_dev_get_dumpit,
+	},
 };
 
 static struct genl_family vdpa_nl_family __ro_after_init = {
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
index bb4a1f00eb1c..66a41e4ec163 100644
--- a/include/uapi/linux/vdpa.h
+++ b/include/uapi/linux/vdpa.h
@@ -16,6 +16,7 @@ enum vdpa_command {
 	VDPA_CMD_MGMTDEV_GET,		/* can dump */
 	VDPA_CMD_DEV_NEW,
 	VDPA_CMD_DEV_DEL,
+	VDPA_CMD_DEV_GET,		/* can dump */
 };
 
 enum vdpa_attr {
@@ -27,6 +28,10 @@ enum vdpa_attr {
 	VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES,	/* u64 */
 
 	VDPA_ATTR_DEV_NAME,			/* string */
+	VDPA_ATTR_DEV_ID,			/* u32 */
+	VDPA_ATTR_DEV_VENDOR_ID,		/* u32 */
+	VDPA_ATTR_DEV_MAX_VQS,			/* u32 */
+	VDPA_ATTR_DEV_MAX_VQ_SIZE,		/* u16 */
 
 	/* new attributes must be added above here */
 	VDPA_ATTR_MAX,
-- 
cgit v1.2.3


From fd502729fbbf6a76fdb7acae4506486bfbb7c4f6 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Mon, 4 Jan 2021 14:55:00 +0800
Subject: virtio-pci: introduce modern device module

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210104065503.199631-17-jasowang@redhat.com

Including a bugfix:

virtio: don't prompt CONFIG_VIRTIO_PCI_MODERN

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Anders Roxell <anders.roxell@linaro.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Fixes: 86b87c9d858b6 ("virtio-pci: introduce modern device module")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20210223061905.422659-2-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/Kconfig                 |   9 +
 drivers/virtio/Makefile                |   1 +
 drivers/virtio/virtio_pci_common.h     |  27 +-
 drivers/virtio/virtio_pci_modern.c     | 617 ---------------------------------
 drivers/virtio/virtio_pci_modern_dev.c | 599 ++++++++++++++++++++++++++++++++
 include/linux/virtio_pci_modern.h      | 111 ++++++
 6 files changed, 721 insertions(+), 643 deletions(-)
 create mode 100644 drivers/virtio/virtio_pci_modern_dev.c
 create mode 100644 include/linux/virtio_pci_modern.h

(limited to 'include')

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 7b41130d3f35..ce1b3f6ec325 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -12,6 +12,14 @@ config ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
 	  This option is selected if the architecture may need to enforce
 	  VIRTIO_F_ACCESS_PLATFORM
 
+config VIRTIO_PCI_LIB
+	tristate
+	help
+	  Modern PCI device implementation. This module implements the
+	  basic probe and control for devices which are based on modern
+	  PCI device with possible vendor specific extensions. Any
+	  module that selects this module must depend on PCI.
+
 menuconfig VIRTIO_MENU
 	bool "Virtio drivers"
 	default y
@@ -21,6 +29,7 @@ if VIRTIO_MENU
 config VIRTIO_PCI
 	tristate "PCI driver for virtio devices"
 	depends on PCI
+	select VIRTIO_PCI_LIB
 	select VIRTIO
 	help
 	  This driver provides support for virtio based paravirtual device
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 591e6f72aa54..699bbea0465f 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o
+obj-$(CONFIG_VIRTIO_PCI_LIB) += virtio_pci_modern_dev.o
 obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
 obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
 virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index f35ff5b6b467..beec047a8f8d 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -25,6 +25,7 @@
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
 #include <linux/virtio_pci.h>
+#include <linux/virtio_pci_modern.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 
@@ -39,32 +40,6 @@ struct virtio_pci_vq_info {
 	unsigned msix_vector;
 };
 
-struct virtio_pci_modern_device {
-	struct pci_dev *pci_dev;
-
-	struct virtio_pci_common_cfg __iomem *common;
-	/* Device-specific data (non-legacy mode)  */
-	void __iomem *device;
-	/* Base of vq notifications (non-legacy mode). */
-	void __iomem *notify_base;
-	/* Where to read and clear interrupt */
-	u8 __iomem *isr;
-
-	/* So we can sanity-check accesses. */
-	size_t notify_len;
-	size_t device_len;
-
-	/* Capability for when we need to map notifications per-vq. */
-	int notify_map_cap;
-
-	/* Multiply queue_notify_off by this value. (non-legacy mode). */
-	u32 notify_offset_multiplier;
-
-	int modern_bars;
-
-	struct virtio_device_id id;
-};
-
 /* Our device structure */
 struct virtio_pci_device {
 	struct virtio_device vdev;
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index a5e3a5e40323..fbd4ebc00eb6 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -19,158 +19,6 @@
 #define VIRTIO_RING_NO_LEGACY
 #include "virtio_pci_common.h"
 
-/*
- * Type-safe wrappers for io accesses.
- * Use these to enforce at compile time the following spec requirement:
- *
- * The driver MUST access each field using the “natural” access
- * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses
- * for 16-bit fields and 8-bit accesses for 8-bit fields.
- */
-static inline u8 vp_ioread8(const u8 __iomem *addr)
-{
-	return ioread8(addr);
-}
-static inline u16 vp_ioread16 (const __le16 __iomem *addr)
-{
-	return ioread16(addr);
-}
-
-static inline u32 vp_ioread32(const __le32 __iomem *addr)
-{
-	return ioread32(addr);
-}
-
-static inline void vp_iowrite8(u8 value, u8 __iomem *addr)
-{
-	iowrite8(value, addr);
-}
-
-static inline void vp_iowrite16(u16 value, __le16 __iomem *addr)
-{
-	iowrite16(value, addr);
-}
-
-static inline void vp_iowrite32(u32 value, __le32 __iomem *addr)
-{
-	iowrite32(value, addr);
-}
-
-static void vp_iowrite64_twopart(u64 val,
-				 __le32 __iomem *lo, __le32 __iomem *hi)
-{
-	vp_iowrite32((u32)val, lo);
-	vp_iowrite32(val >> 32, hi);
-}
-
-/*
- * vp_modern_map_capability - map a part of virtio pci capability
- * @mdev: the modern virtio-pci device
- * @off: offset of the capability
- * @minlen: minimal length of the capability
- * @align: align requirement
- * @start: start from the capability
- * @size: map size
- * @len: the length that is actually mapped
- *
- * Returns the io address of for the part of the capability
- */
-void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
-				       size_t minlen,
-				       u32 align,
-				       u32 start, u32 size,
-				       size_t *len)
-{
-	struct pci_dev *dev = mdev->pci_dev;
-	u8 bar;
-	u32 offset, length;
-	void __iomem *p;
-
-	pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap,
-						 bar),
-			     &bar);
-	pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset),
-			     &offset);
-	pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length),
-			      &length);
-
-	if (length <= start) {
-		dev_err(&dev->dev,
-			"virtio_pci: bad capability len %u (>%u expected)\n",
-			length, start);
-		return NULL;
-	}
-
-	if (length - start < minlen) {
-		dev_err(&dev->dev,
-			"virtio_pci: bad capability len %u (>=%zu expected)\n",
-			length, minlen);
-		return NULL;
-	}
-
-	length -= start;
-
-	if (start + offset < offset) {
-		dev_err(&dev->dev,
-			"virtio_pci: map wrap-around %u+%u\n",
-			start, offset);
-		return NULL;
-	}
-
-	offset += start;
-
-	if (offset & (align - 1)) {
-		dev_err(&dev->dev,
-			"virtio_pci: offset %u not aligned to %u\n",
-			offset, align);
-		return NULL;
-	}
-
-	if (length > size)
-		length = size;
-
-	if (len)
-		*len = length;
-
-	if (minlen + offset < minlen ||
-	    minlen + offset > pci_resource_len(dev, bar)) {
-		dev_err(&dev->dev,
-			"virtio_pci: map virtio %zu@%u "
-			"out of range on bar %i length %lu\n",
-			minlen, offset,
-			bar, (unsigned long)pci_resource_len(dev, bar));
-		return NULL;
-	}
-
-	p = pci_iomap_range(dev, bar, offset, length);
-	if (!p)
-		dev_err(&dev->dev,
-			"virtio_pci: unable to map virtio %u@%u on bar %i\n",
-			length, offset, bar);
-	return p;
-}
-
-/*
- * vp_modern_get_features - get features from device
- * @mdev: the modern virtio-pci device
- *
- * Returns the features read from the device
- */
-static u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	u64 features;
-
-	vp_iowrite32(0, &cfg->device_feature_select);
-	features = vp_ioread32(&cfg->device_feature);
-	vp_iowrite32(1, &cfg->device_feature_select);
-	features |= ((u64)vp_ioread32(&cfg->device_feature) << 32);
-
-	return features;
-}
-
-/* virtio config->get_features() implementation */
 static u64 vp_get_features(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -188,149 +36,6 @@ static void vp_transport_features(struct virtio_device *vdev, u64 features)
 		__virtio_set_bit(vdev, VIRTIO_F_SR_IOV);
 }
 
-/*
- * vp_modern_set_features - set features to device
- * @mdev: the modern virtio-pci device
- * @features: the features set to device
- */
-static void vp_modern_set_features(struct virtio_pci_modern_device *mdev,
-				   u64 features)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	vp_iowrite32(0, &cfg->guest_feature_select);
-	vp_iowrite32((u32)features, &cfg->guest_feature);
-	vp_iowrite32(1, &cfg->guest_feature_select);
-	vp_iowrite32(features >> 32, &cfg->guest_feature);
-}
-
-/*
- * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue
- * @mdev: the modern virtio-pci device
- * @index: queue index
- * @vector: the config vector
- *
- * Returns the config vector read from the device
- */
-static u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev,
-				  u16 index, u16 vector)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	vp_iowrite16(index, &cfg->queue_select);
-	vp_iowrite16(vector, &cfg->queue_msix_vector);
-	/* Flush the write out to device */
-	return vp_ioread16(&cfg->queue_msix_vector);
-}
-
-/*
- * vp_modern_queue_address - set the virtqueue address
- * @mdev: the modern virtio-pci device
- * @index: the queue index
- * @desc_addr: address of the descriptor area
- * @driver_addr: address of the driver area
- * @device_addr: address of the device area
- */
-static void vp_modern_queue_address(struct virtio_pci_modern_device *mdev,
-				    u16 index, u64 desc_addr, u64 driver_addr,
-				    u64 device_addr)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	vp_iowrite16(index, &cfg->queue_select);
-
-	vp_iowrite64_twopart(desc_addr, &cfg->queue_desc_lo,
-			     &cfg->queue_desc_hi);
-	vp_iowrite64_twopart(driver_addr, &cfg->queue_avail_lo,
-			     &cfg->queue_avail_hi);
-	vp_iowrite64_twopart(device_addr, &cfg->queue_used_lo,
-			     &cfg->queue_used_hi);
-}
-
-/*
- * vp_modern_set_queue_enable - enable a virtqueue
- * @mdev: the modern virtio-pci device
- * @index: the queue index
- * @enable: whether the virtqueue is enable or not
- */
-static void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev,
-				       u16 index, bool enable)
-{
-	vp_iowrite16(index, &mdev->common->queue_select);
-	vp_iowrite16(enable, &mdev->common->queue_enable);
-}
-
-/*
- * vp_modern_get_queue_enable - enable a virtqueue
- * @mdev: the modern virtio-pci device
- * @index: the queue index
- *
- * Returns whether a virtqueue is enabled or not
- */
-static bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev,
-				       u16 index)
-{
-	vp_iowrite16(index, &mdev->common->queue_select);
-
-	return vp_ioread16(&mdev->common->queue_enable);
-}
-
-/*
- * vp_modern_set_queue_size - set size for a virtqueue
- * @mdev: the modern virtio-pci device
- * @index: the queue index
- * @size: the size of the virtqueue
- */
-static void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev,
-				     u16 index, u16 size)
-{
-	vp_iowrite16(index, &mdev->common->queue_select);
-	vp_iowrite16(size, &mdev->common->queue_size);
-
-}
-
-/*
- * vp_modern_get_queue_size - get size for a virtqueue
- * @mdev: the modern virtio-pci device
- * @index: the queue index
- *
- * Returns the size of the virtqueue
- */
-static u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
-				    u16 index)
-{
-	vp_iowrite16(index, &mdev->common->queue_select);
-
-	return vp_ioread16(&mdev->common->queue_size);
-
-}
-
-/*
- * vp_modern_get_num_queues - get the number of virtqueues
- * @mdev: the modern virtio-pci device
- *
- * Returns the number of virtqueues
- */
-static u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev)
-{
-	return vp_ioread16(&mdev->common->num_queues);
-}
-
-/*
- * vp_modern_get_queue_notify_off - get notification offset for a virtqueue
- * @mdev: the modern virtio-pci device
- * @index: the queue index
- *
- * Returns the notification offset for a virtqueue
- */
-static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
-					  u16 index)
-{
-	vp_iowrite16(index, &mdev->common->queue_select);
-
-	return vp_ioread16(&mdev->common->queue_notify_off);
-}
-
 /* virtio config->finalize_features() implementation */
 static int vp_finalize_features(struct virtio_device *vdev)
 {
@@ -429,19 +134,6 @@ static void vp_set(struct virtio_device *vdev, unsigned offset,
 	}
 }
 
-/*
- * vp_modern_generation - get the device genreation
- * @mdev: the modern virtio-pci device
- *
- * Returns the genreation read from device
- */
-static u32 vp_modern_generation(struct virtio_pci_modern_device *mdev)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	return vp_ioread8(&cfg->config_generation);
-}
-
 static u32 vp_generation(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -449,19 +141,6 @@ static u32 vp_generation(struct virtio_device *vdev)
 	return vp_modern_generation(&vp_dev->mdev);
 }
 
-/*
- * vp_modern_get_status - get the device status
- * @mdev: the modern virtio-pci device
- *
- * Returns the status read from device
- */
-static u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	return vp_ioread8(&cfg->device_status);
-}
-
 /* config->{get,set}_status() implementations */
 static u8 vp_get_status(struct virtio_device *vdev)
 {
@@ -470,19 +149,6 @@ static u8 vp_get_status(struct virtio_device *vdev)
 	return vp_modern_get_status(&vp_dev->mdev);
 }
 
-/*
- * vp_modern_set_status - set status to device
- * @mdev: the modern virtio-pci device
- * @status: the status set to device
- */
-static void vp_modern_set_status(struct virtio_pci_modern_device *mdev,
-				 u8 status)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	vp_iowrite8(status, &cfg->device_status);
-}
-
 static void vp_set_status(struct virtio_device *vdev, u8 status)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -510,25 +176,6 @@ static void vp_reset(struct virtio_device *vdev)
 	vp_synchronize_vectors(vdev);
 }
 
-/*
- * vp_modern_config_vector - set the vector for config interrupt
- * @mdev: the modern virtio-pci device
- * @vector: the config vector
- *
- * Returns the config vector read from the device
- */
-static u16 vp_modern_config_vector(struct virtio_pci_modern_device *mdev,
-				   u16 vector)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	/* Setup the vector used for configuration events */
-	vp_iowrite16(vector, &cfg->msix_config);
-	/* Verify we had enough resources to assign the vector */
-	/* Will also flush the write out to device */
-	return vp_ioread16(&cfg->msix_config);
-}
-
 static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector)
 {
 	return vp_modern_config_vector(&vp_dev->mdev, vector);
@@ -789,253 +436,6 @@ static const struct virtio_config_ops virtio_pci_config_ops = {
 	.get_shm_region  = vp_get_shm_region,
 };
 
-/**
- * virtio_pci_find_capability - walk capabilities to find device info.
- * @dev: the pci device
- * @cfg_type: the VIRTIO_PCI_CAP_* value we seek
- * @ioresource_types: IORESOURCE_MEM and/or IORESOURCE_IO.
- * @bars: the bitmask of BARs
- *
- * Returns offset of the capability, or 0.
- */
-static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type,
-					     u32 ioresource_types, int *bars)
-{
-	int pos;
-
-	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
-	     pos > 0;
-	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
-		u8 type, bar;
-		pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap,
-							 cfg_type),
-				     &type);
-		pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap,
-							 bar),
-				     &bar);
-
-		/* Ignore structures with reserved BAR values */
-		if (bar > 0x5)
-			continue;
-
-		if (type == cfg_type) {
-			if (pci_resource_len(dev, bar) &&
-			    pci_resource_flags(dev, bar) & ioresource_types) {
-				*bars |= (1 << bar);
-				return pos;
-			}
-		}
-	}
-	return 0;
-}
-
-/* This is part of the ABI.  Don't screw with it. */
-static inline void check_offsets(void)
-{
-	/* Note: disk space was harmed in compilation of this function. */
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR !=
-		     offsetof(struct virtio_pci_cap, cap_vndr));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT !=
-		     offsetof(struct virtio_pci_cap, cap_next));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN !=
-		     offsetof(struct virtio_pci_cap, cap_len));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE !=
-		     offsetof(struct virtio_pci_cap, cfg_type));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR !=
-		     offsetof(struct virtio_pci_cap, bar));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET !=
-		     offsetof(struct virtio_pci_cap, offset));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH !=
-		     offsetof(struct virtio_pci_cap, length));
-	BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT !=
-		     offsetof(struct virtio_pci_notify_cap,
-			      notify_off_multiplier));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT !=
-		     offsetof(struct virtio_pci_common_cfg,
-			      device_feature_select));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF !=
-		     offsetof(struct virtio_pci_common_cfg, device_feature));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT !=
-		     offsetof(struct virtio_pci_common_cfg,
-			      guest_feature_select));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF !=
-		     offsetof(struct virtio_pci_common_cfg, guest_feature));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX !=
-		     offsetof(struct virtio_pci_common_cfg, msix_config));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ !=
-		     offsetof(struct virtio_pci_common_cfg, num_queues));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS !=
-		     offsetof(struct virtio_pci_common_cfg, device_status));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION !=
-		     offsetof(struct virtio_pci_common_cfg, config_generation));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT !=
-		     offsetof(struct virtio_pci_common_cfg, queue_select));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE !=
-		     offsetof(struct virtio_pci_common_cfg, queue_size));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX !=
-		     offsetof(struct virtio_pci_common_cfg, queue_msix_vector));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE !=
-		     offsetof(struct virtio_pci_common_cfg, queue_enable));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF !=
-		     offsetof(struct virtio_pci_common_cfg, queue_notify_off));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO !=
-		     offsetof(struct virtio_pci_common_cfg, queue_desc_lo));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI !=
-		     offsetof(struct virtio_pci_common_cfg, queue_desc_hi));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO !=
-		     offsetof(struct virtio_pci_common_cfg, queue_avail_lo));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI !=
-		     offsetof(struct virtio_pci_common_cfg, queue_avail_hi));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO !=
-		     offsetof(struct virtio_pci_common_cfg, queue_used_lo));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI !=
-		     offsetof(struct virtio_pci_common_cfg, queue_used_hi));
-}
-
-/*
- * vp_modern_probe: probe the modern virtio pci device, note that the
- * caller is required to enable PCI device before calling this function.
- * @mdev: the modern virtio-pci device
- *
- * Return 0 on succeed otherwise fail
- */
-static int vp_modern_probe(struct virtio_pci_modern_device *mdev)
-{
-	struct pci_dev *pci_dev = mdev->pci_dev;
-	int err, common, isr, notify, device;
-	u32 notify_length;
-	u32 notify_offset;
-
-	check_offsets();
-
-	mdev->pci_dev = pci_dev;
-
-	/* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */
-	if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f)
-		return -ENODEV;
-
-	if (pci_dev->device < 0x1040) {
-		/* Transitional devices: use the PCI subsystem device id as
-		 * virtio device id, same as legacy driver always did.
-		 */
-		mdev->id.device = pci_dev->subsystem_device;
-	} else {
-		/* Modern devices: simply use PCI device id, but start from 0x1040. */
-		mdev->id.device = pci_dev->device - 0x1040;
-	}
-	mdev->id.vendor = pci_dev->subsystem_vendor;
-
-	/* check for a common config: if not, use legacy mode (bar 0). */
-	common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG,
-					    IORESOURCE_IO | IORESOURCE_MEM,
-					    &mdev->modern_bars);
-	if (!common) {
-		dev_info(&pci_dev->dev,
-			 "virtio_pci: leaving for legacy driver\n");
-		return -ENODEV;
-	}
-
-	/* If common is there, these should be too... */
-	isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG,
-					 IORESOURCE_IO | IORESOURCE_MEM,
-					 &mdev->modern_bars);
-	notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG,
-					    IORESOURCE_IO | IORESOURCE_MEM,
-					    &mdev->modern_bars);
-	if (!isr || !notify) {
-		dev_err(&pci_dev->dev,
-			"virtio_pci: missing capabilities %i/%i/%i\n",
-			common, isr, notify);
-		return -EINVAL;
-	}
-
-	err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
-	if (err)
-		err = dma_set_mask_and_coherent(&pci_dev->dev,
-						DMA_BIT_MASK(32));
-	if (err)
-		dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA.  Trying to continue, but this might not work.\n");
-
-	/* Device capability is only mandatory for devices that have
-	 * device-specific configuration.
-	 */
-	device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG,
-					    IORESOURCE_IO | IORESOURCE_MEM,
-					    &mdev->modern_bars);
-
-	err = pci_request_selected_regions(pci_dev, mdev->modern_bars,
-					   "virtio-pci-modern");
-	if (err)
-		return err;
-
-	err = -EINVAL;
-	mdev->common = vp_modern_map_capability(mdev, common,
-				      sizeof(struct virtio_pci_common_cfg), 4,
-				      0, sizeof(struct virtio_pci_common_cfg),
-				      NULL);
-	if (!mdev->common)
-		goto err_map_common;
-	mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1,
-					     0, 1,
-					     NULL);
-	if (!mdev->isr)
-		goto err_map_isr;
-
-	/* Read notify_off_multiplier from config space. */
-	pci_read_config_dword(pci_dev,
-			      notify + offsetof(struct virtio_pci_notify_cap,
-						notify_off_multiplier),
-			      &mdev->notify_offset_multiplier);
-	/* Read notify length and offset from config space. */
-	pci_read_config_dword(pci_dev,
-			      notify + offsetof(struct virtio_pci_notify_cap,
-						cap.length),
-			      &notify_length);
-
-	pci_read_config_dword(pci_dev,
-			      notify + offsetof(struct virtio_pci_notify_cap,
-						cap.offset),
-			      &notify_offset);
-
-	/* We don't know how many VQs we'll map, ahead of the time.
-	 * If notify length is small, map it all now.
-	 * Otherwise, map each VQ individually later.
-	 */
-	if ((u64)notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) {
-		mdev->notify_base = vp_modern_map_capability(mdev, notify,
-							     2, 2,
-							     0, notify_length,
-							     &mdev->notify_len);
-		if (!mdev->notify_base)
-			goto err_map_notify;
-	} else {
-		mdev->notify_map_cap = notify;
-	}
-
-	/* Again, we don't know how much we should map, but PAGE_SIZE
-	 * is more than enough for all existing devices.
-	 */
-	if (device) {
-		mdev->device = vp_modern_map_capability(mdev, device, 0, 4,
-							0, PAGE_SIZE,
-							&mdev->device_len);
-		if (!mdev->device)
-			goto err_map_device;
-	}
-
-	return 0;
-
-err_map_device:
-	if (mdev->notify_base)
-		pci_iounmap(pci_dev, mdev->notify_base);
-err_map_notify:
-	pci_iounmap(pci_dev, mdev->isr);
-err_map_isr:
-	pci_iounmap(pci_dev, mdev->common);
-err_map_common:
-	return err;
-}
-
 /* the PCI probing function */
 int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev)
 {
@@ -1063,23 +463,6 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev)
 	return 0;
 }
 
-/*
- * vp_modern_probe: remove and cleanup the modern virtio pci device
- * @mdev: the modern virtio-pci device
- */
-static void vp_modern_remove(struct virtio_pci_modern_device *mdev)
-{
-	struct pci_dev *pci_dev = mdev->pci_dev;
-
-	if (mdev->device)
-		pci_iounmap(pci_dev, mdev->device);
-	if (mdev->notify_base)
-		pci_iounmap(pci_dev, mdev->notify_base);
-	pci_iounmap(pci_dev, mdev->isr);
-	pci_iounmap(pci_dev, mdev->common);
-	pci_release_selected_regions(pci_dev, mdev->modern_bars);
-}
-
 void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev)
 {
 	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
new file mode 100644
index 000000000000..cbd667496bb1
--- /dev/null
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -0,0 +1,599 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/virtio_pci_modern.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+/*
+ * vp_modern_map_capability - map a part of virtio pci capability
+ * @mdev: the modern virtio-pci device
+ * @off: offset of the capability
+ * @minlen: minimal length of the capability
+ * @align: align requirement
+ * @start: start from the capability
+ * @size: map size
+ * @len: the length that is actually mapped
+ *
+ * Returns the io address of for the part of the capability
+ */
+void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
+				       size_t minlen,
+				       u32 align,
+				       u32 start, u32 size,
+				       size_t *len)
+{
+	struct pci_dev *dev = mdev->pci_dev;
+	u8 bar;
+	u32 offset, length;
+	void __iomem *p;
+
+	pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap,
+						 bar),
+			     &bar);
+	pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset),
+			     &offset);
+	pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length),
+			      &length);
+
+	if (length <= start) {
+		dev_err(&dev->dev,
+			"virtio_pci: bad capability len %u (>%u expected)\n",
+			length, start);
+		return NULL;
+	}
+
+	if (length - start < minlen) {
+		dev_err(&dev->dev,
+			"virtio_pci: bad capability len %u (>=%zu expected)\n",
+			length, minlen);
+		return NULL;
+	}
+
+	length -= start;
+
+	if (start + offset < offset) {
+		dev_err(&dev->dev,
+			"virtio_pci: map wrap-around %u+%u\n",
+			start, offset);
+		return NULL;
+	}
+
+	offset += start;
+
+	if (offset & (align - 1)) {
+		dev_err(&dev->dev,
+			"virtio_pci: offset %u not aligned to %u\n",
+			offset, align);
+		return NULL;
+	}
+
+	if (length > size)
+		length = size;
+
+	if (len)
+		*len = length;
+
+	if (minlen + offset < minlen ||
+	    minlen + offset > pci_resource_len(dev, bar)) {
+		dev_err(&dev->dev,
+			"virtio_pci: map virtio %zu@%u "
+			"out of range on bar %i length %lu\n",
+			minlen, offset,
+			bar, (unsigned long)pci_resource_len(dev, bar));
+		return NULL;
+	}
+
+	p = pci_iomap_range(dev, bar, offset, length);
+	if (!p)
+		dev_err(&dev->dev,
+			"virtio_pci: unable to map virtio %u@%u on bar %i\n",
+			length, offset, bar);
+	return p;
+}
+EXPORT_SYMBOL_GPL(vp_modern_map_capability);
+
+/**
+ * virtio_pci_find_capability - walk capabilities to find device info.
+ * @dev: the pci device
+ * @cfg_type: the VIRTIO_PCI_CAP_* value we seek
+ * @ioresource_types: IORESOURCE_MEM and/or IORESOURCE_IO.
+ * @bars: the bitmask of BARs
+ *
+ * Returns offset of the capability, or 0.
+ */
+static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type,
+					     u32 ioresource_types, int *bars)
+{
+	int pos;
+
+	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
+	     pos > 0;
+	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
+		u8 type, bar;
+		pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap,
+							 cfg_type),
+				     &type);
+		pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap,
+							 bar),
+				     &bar);
+
+		/* Ignore structures with reserved BAR values */
+		if (bar > 0x5)
+			continue;
+
+		if (type == cfg_type) {
+			if (pci_resource_len(dev, bar) &&
+			    pci_resource_flags(dev, bar) & ioresource_types) {
+				*bars |= (1 << bar);
+				return pos;
+			}
+		}
+	}
+	return 0;
+}
+
+/* This is part of the ABI.  Don't screw with it. */
+static inline void check_offsets(void)
+{
+	/* Note: disk space was harmed in compilation of this function. */
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR !=
+		     offsetof(struct virtio_pci_cap, cap_vndr));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT !=
+		     offsetof(struct virtio_pci_cap, cap_next));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN !=
+		     offsetof(struct virtio_pci_cap, cap_len));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE !=
+		     offsetof(struct virtio_pci_cap, cfg_type));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR !=
+		     offsetof(struct virtio_pci_cap, bar));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET !=
+		     offsetof(struct virtio_pci_cap, offset));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH !=
+		     offsetof(struct virtio_pci_cap, length));
+	BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT !=
+		     offsetof(struct virtio_pci_notify_cap,
+			      notify_off_multiplier));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT !=
+		     offsetof(struct virtio_pci_common_cfg,
+			      device_feature_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF !=
+		     offsetof(struct virtio_pci_common_cfg, device_feature));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT !=
+		     offsetof(struct virtio_pci_common_cfg,
+			      guest_feature_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF !=
+		     offsetof(struct virtio_pci_common_cfg, guest_feature));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX !=
+		     offsetof(struct virtio_pci_common_cfg, msix_config));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ !=
+		     offsetof(struct virtio_pci_common_cfg, num_queues));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS !=
+		     offsetof(struct virtio_pci_common_cfg, device_status));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION !=
+		     offsetof(struct virtio_pci_common_cfg, config_generation));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT !=
+		     offsetof(struct virtio_pci_common_cfg, queue_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE !=
+		     offsetof(struct virtio_pci_common_cfg, queue_size));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX !=
+		     offsetof(struct virtio_pci_common_cfg, queue_msix_vector));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE !=
+		     offsetof(struct virtio_pci_common_cfg, queue_enable));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF !=
+		     offsetof(struct virtio_pci_common_cfg, queue_notify_off));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO !=
+		     offsetof(struct virtio_pci_common_cfg, queue_desc_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI !=
+		     offsetof(struct virtio_pci_common_cfg, queue_desc_hi));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO !=
+		     offsetof(struct virtio_pci_common_cfg, queue_avail_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI !=
+		     offsetof(struct virtio_pci_common_cfg, queue_avail_hi));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO !=
+		     offsetof(struct virtio_pci_common_cfg, queue_used_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI !=
+		     offsetof(struct virtio_pci_common_cfg, queue_used_hi));
+}
+
+/*
+ * vp_modern_probe: probe the modern virtio pci device, note that the
+ * caller is required to enable PCI device before calling this function.
+ * @mdev: the modern virtio-pci device
+ *
+ * Return 0 on succeed otherwise fail
+ */
+int vp_modern_probe(struct virtio_pci_modern_device *mdev)
+{
+	struct pci_dev *pci_dev = mdev->pci_dev;
+	int err, common, isr, notify, device;
+	u32 notify_length;
+	u32 notify_offset;
+
+	check_offsets();
+
+	mdev->pci_dev = pci_dev;
+
+	/* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */
+	if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f)
+		return -ENODEV;
+
+	if (pci_dev->device < 0x1040) {
+		/* Transitional devices: use the PCI subsystem device id as
+		 * virtio device id, same as legacy driver always did.
+		 */
+		mdev->id.device = pci_dev->subsystem_device;
+	} else {
+		/* Modern devices: simply use PCI device id, but start from 0x1040. */
+		mdev->id.device = pci_dev->device - 0x1040;
+	}
+	mdev->id.vendor = pci_dev->subsystem_vendor;
+
+	/* check for a common config: if not, use legacy mode (bar 0). */
+	common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG,
+					    IORESOURCE_IO | IORESOURCE_MEM,
+					    &mdev->modern_bars);
+	if (!common) {
+		dev_info(&pci_dev->dev,
+			 "virtio_pci: leaving for legacy driver\n");
+		return -ENODEV;
+	}
+
+	/* If common is there, these should be too... */
+	isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG,
+					 IORESOURCE_IO | IORESOURCE_MEM,
+					 &mdev->modern_bars);
+	notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG,
+					    IORESOURCE_IO | IORESOURCE_MEM,
+					    &mdev->modern_bars);
+	if (!isr || !notify) {
+		dev_err(&pci_dev->dev,
+			"virtio_pci: missing capabilities %i/%i/%i\n",
+			common, isr, notify);
+		return -EINVAL;
+	}
+
+	err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
+	if (err)
+		err = dma_set_mask_and_coherent(&pci_dev->dev,
+						DMA_BIT_MASK(32));
+	if (err)
+		dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA.  Trying to continue, but this might not work.\n");
+
+	/* Device capability is only mandatory for devices that have
+	 * device-specific configuration.
+	 */
+	device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG,
+					    IORESOURCE_IO | IORESOURCE_MEM,
+					    &mdev->modern_bars);
+
+	err = pci_request_selected_regions(pci_dev, mdev->modern_bars,
+					   "virtio-pci-modern");
+	if (err)
+		return err;
+
+	err = -EINVAL;
+	mdev->common = vp_modern_map_capability(mdev, common,
+				      sizeof(struct virtio_pci_common_cfg), 4,
+				      0, sizeof(struct virtio_pci_common_cfg),
+				      NULL);
+	if (!mdev->common)
+		goto err_map_common;
+	mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1,
+					     0, 1,
+					     NULL);
+	if (!mdev->isr)
+		goto err_map_isr;
+
+	/* Read notify_off_multiplier from config space. */
+	pci_read_config_dword(pci_dev,
+			      notify + offsetof(struct virtio_pci_notify_cap,
+						notify_off_multiplier),
+			      &mdev->notify_offset_multiplier);
+	/* Read notify length and offset from config space. */
+	pci_read_config_dword(pci_dev,
+			      notify + offsetof(struct virtio_pci_notify_cap,
+						cap.length),
+			      &notify_length);
+
+	pci_read_config_dword(pci_dev,
+			      notify + offsetof(struct virtio_pci_notify_cap,
+						cap.offset),
+			      &notify_offset);
+
+	/* We don't know how many VQs we'll map, ahead of the time.
+	 * If notify length is small, map it all now.
+	 * Otherwise, map each VQ individually later.
+	 */
+	if ((u64)notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) {
+		mdev->notify_base = vp_modern_map_capability(mdev, notify,
+							     2, 2,
+							     0, notify_length,
+							     &mdev->notify_len);
+		if (!mdev->notify_base)
+			goto err_map_notify;
+	} else {
+		mdev->notify_map_cap = notify;
+	}
+
+	/* Again, we don't know how much we should map, but PAGE_SIZE
+	 * is more than enough for all existing devices.
+	 */
+	if (device) {
+		mdev->device = vp_modern_map_capability(mdev, device, 0, 4,
+							0, PAGE_SIZE,
+							&mdev->device_len);
+		if (!mdev->device)
+			goto err_map_device;
+	}
+
+	return 0;
+
+err_map_device:
+	if (mdev->notify_base)
+		pci_iounmap(pci_dev, mdev->notify_base);
+err_map_notify:
+	pci_iounmap(pci_dev, mdev->isr);
+err_map_isr:
+	pci_iounmap(pci_dev, mdev->common);
+err_map_common:
+	return err;
+}
+EXPORT_SYMBOL_GPL(vp_modern_probe);
+
+/*
+ * vp_modern_probe: remove and cleanup the modern virtio pci device
+ * @mdev: the modern virtio-pci device
+ */
+void vp_modern_remove(struct virtio_pci_modern_device *mdev)
+{
+	struct pci_dev *pci_dev = mdev->pci_dev;
+
+	if (mdev->device)
+		pci_iounmap(pci_dev, mdev->device);
+	if (mdev->notify_base)
+		pci_iounmap(pci_dev, mdev->notify_base);
+	pci_iounmap(pci_dev, mdev->isr);
+	pci_iounmap(pci_dev, mdev->common);
+	pci_release_selected_regions(pci_dev, mdev->modern_bars);
+}
+EXPORT_SYMBOL_GPL(vp_modern_remove);
+
+/*
+ * vp_modern_get_features - get features from device
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the features read from the device
+ */
+u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	u64 features;
+
+	vp_iowrite32(0, &cfg->device_feature_select);
+	features = vp_ioread32(&cfg->device_feature);
+	vp_iowrite32(1, &cfg->device_feature_select);
+	features |= ((u64)vp_ioread32(&cfg->device_feature) << 32);
+
+	return features;
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_features);
+
+/*
+ * vp_modern_set_features - set features to device
+ * @mdev: the modern virtio-pci device
+ * @features: the features set to device
+ */
+void vp_modern_set_features(struct virtio_pci_modern_device *mdev,
+			    u64 features)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	vp_iowrite32(0, &cfg->guest_feature_select);
+	vp_iowrite32((u32)features, &cfg->guest_feature);
+	vp_iowrite32(1, &cfg->guest_feature_select);
+	vp_iowrite32(features >> 32, &cfg->guest_feature);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_features);
+
+/*
+ * vp_modern_generation - get the device genreation
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the genreation read from device
+ */
+u32 vp_modern_generation(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	return vp_ioread8(&cfg->config_generation);
+}
+EXPORT_SYMBOL_GPL(vp_modern_generation);
+
+/*
+ * vp_modern_get_status - get the device status
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the status read from device
+ */
+u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	return vp_ioread8(&cfg->device_status);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_status);
+
+/*
+ * vp_modern_set_status - set status to device
+ * @mdev: the modern virtio-pci device
+ * @status: the status set to device
+ */
+void vp_modern_set_status(struct virtio_pci_modern_device *mdev,
+				 u8 status)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	vp_iowrite8(status, &cfg->device_status);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_status);
+
+/*
+ * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: queue index
+ * @vector: the config vector
+ *
+ * Returns the config vector read from the device
+ */
+u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev,
+			   u16 index, u16 vector)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	vp_iowrite16(index, &cfg->queue_select);
+	vp_iowrite16(vector, &cfg->queue_msix_vector);
+	/* Flush the write out to device */
+	return vp_ioread16(&cfg->queue_msix_vector);
+}
+EXPORT_SYMBOL_GPL(vp_modern_queue_vector);
+
+/*
+ * vp_modern_config_vector - set the vector for config interrupt
+ * @mdev: the modern virtio-pci device
+ * @vector: the config vector
+ *
+ * Returns the config vector read from the device
+ */
+u16 vp_modern_config_vector(struct virtio_pci_modern_device *mdev,
+			    u16 vector)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	/* Setup the vector used for configuration events */
+	vp_iowrite16(vector, &cfg->msix_config);
+	/* Verify we had enough resources to assign the vector */
+	/* Will also flush the write out to device */
+	return vp_ioread16(&cfg->msix_config);
+}
+EXPORT_SYMBOL_GPL(vp_modern_config_vector);
+
+/*
+ * vp_modern_queue_address - set the virtqueue address
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @desc_addr: address of the descriptor area
+ * @driver_addr: address of the driver area
+ * @device_addr: address of the device area
+ */
+void vp_modern_queue_address(struct virtio_pci_modern_device *mdev,
+			     u16 index, u64 desc_addr, u64 driver_addr,
+			     u64 device_addr)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	vp_iowrite16(index, &cfg->queue_select);
+
+	vp_iowrite64_twopart(desc_addr, &cfg->queue_desc_lo,
+			     &cfg->queue_desc_hi);
+	vp_iowrite64_twopart(driver_addr, &cfg->queue_avail_lo,
+			     &cfg->queue_avail_hi);
+	vp_iowrite64_twopart(device_addr, &cfg->queue_used_lo,
+			     &cfg->queue_used_hi);
+}
+EXPORT_SYMBOL_GPL(vp_modern_queue_address);
+
+/*
+ * vp_modern_set_queue_enable - enable a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @enable: whether the virtqueue is enable or not
+ */
+void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev,
+				u16 index, bool enable)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+	vp_iowrite16(enable, &mdev->common->queue_enable);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_queue_enable);
+
+/*
+ * vp_modern_get_queue_enable - enable a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns whether a virtqueue is enabled or not
+ */
+bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev,
+				u16 index)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+
+	return vp_ioread16(&mdev->common->queue_enable);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_queue_enable);
+
+/*
+ * vp_modern_set_queue_size - set size for a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @size: the size of the virtqueue
+ */
+void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev,
+			      u16 index, u16 size)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+	vp_iowrite16(size, &mdev->common->queue_size);
+
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_queue_size);
+
+/*
+ * vp_modern_get_queue_size - get size for a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns the size of the virtqueue
+ */
+u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
+			     u16 index)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+
+	return vp_ioread16(&mdev->common->queue_size);
+
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_queue_size);
+
+/*
+ * vp_modern_get_num_queues - get the number of virtqueues
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the number of virtqueues
+ */
+u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev)
+{
+	return vp_ioread16(&mdev->common->num_queues);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_num_queues);
+
+/*
+ * vp_modern_get_queue_notify_off - get notification offset for a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns the notification offset for a virtqueue
+ */
+u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
+				   u16 index)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+
+	return vp_ioread16(&mdev->common->queue_notify_off);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_queue_notify_off);
+
+MODULE_VERSION("0.1");
+MODULE_DESCRIPTION("Modern Virtio PCI Device");
+MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
new file mode 100644
index 000000000000..f26acbeec965
--- /dev/null
+++ b/include/linux/virtio_pci_modern.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VIRTIO_PCI_MODERN_H
+#define _LINUX_VIRTIO_PCI_MODERN_H
+
+#include <linux/pci.h>
+#include <linux/virtio_pci.h>
+
+struct virtio_pci_modern_device {
+	struct pci_dev *pci_dev;
+
+	struct virtio_pci_common_cfg __iomem *common;
+	/* Device-specific data (non-legacy mode)  */
+	void __iomem *device;
+	/* Base of vq notifications (non-legacy mode). */
+	void __iomem *notify_base;
+	/* Where to read and clear interrupt */
+	u8 __iomem *isr;
+
+	/* So we can sanity-check accesses. */
+	size_t notify_len;
+	size_t device_len;
+
+	/* Capability for when we need to map notifications per-vq. */
+	int notify_map_cap;
+
+	/* Multiply queue_notify_off by this value. (non-legacy mode). */
+	u32 notify_offset_multiplier;
+
+	int modern_bars;
+
+	struct virtio_device_id id;
+};
+
+/*
+ * Type-safe wrappers for io accesses.
+ * Use these to enforce at compile time the following spec requirement:
+ *
+ * The driver MUST access each field using the “natural” access
+ * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses
+ * for 16-bit fields and 8-bit accesses for 8-bit fields.
+ */
+static inline u8 vp_ioread8(const u8 __iomem *addr)
+{
+	return ioread8(addr);
+}
+static inline u16 vp_ioread16 (const __le16 __iomem *addr)
+{
+	return ioread16(addr);
+}
+
+static inline u32 vp_ioread32(const __le32 __iomem *addr)
+{
+	return ioread32(addr);
+}
+
+static inline void vp_iowrite8(u8 value, u8 __iomem *addr)
+{
+	iowrite8(value, addr);
+}
+
+static inline void vp_iowrite16(u16 value, __le16 __iomem *addr)
+{
+	iowrite16(value, addr);
+}
+
+static inline void vp_iowrite32(u32 value, __le32 __iomem *addr)
+{
+	iowrite32(value, addr);
+}
+
+static inline void vp_iowrite64_twopart(u64 val,
+					__le32 __iomem *lo,
+					__le32 __iomem *hi)
+{
+	vp_iowrite32((u32)val, lo);
+	vp_iowrite32(val >> 32, hi);
+}
+
+u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev);
+void vp_modern_set_features(struct virtio_pci_modern_device *mdev,
+		     u64 features);
+u32 vp_modern_generation(struct virtio_pci_modern_device *mdev);
+u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev);
+void vp_modern_set_status(struct virtio_pci_modern_device *mdev,
+		   u8 status);
+u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev,
+			   u16 idx, u16 vector);
+u16 vp_modern_config_vector(struct virtio_pci_modern_device *mdev,
+		     u16 vector);
+void vp_modern_queue_address(struct virtio_pci_modern_device *mdev,
+			     u16 index, u64 desc_addr, u64 driver_addr,
+			     u64 device_addr);
+void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev,
+				u16 idx, bool enable);
+bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev,
+				u16 idx);
+void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev,
+			      u16 idx, u16 size);
+u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
+			     u16 idx);
+u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev);
+u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
+				   u16 idx);
+void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
+				       size_t minlen,
+				       u32 align,
+				       u32 start, u32 size,
+				       size_t *len);
+int vp_modern_probe(struct virtio_pci_modern_device *mdev);
+void vp_modern_remove(struct virtio_pci_modern_device *mdev);
+#endif
-- 
cgit v1.2.3


From 06f45fe96fcd81531b0bcb2a6115da563ae6dbd6 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Fri, 19 Feb 2021 16:40:28 +0100
Subject: xen/events: add per-xenbus device event statistics and settings

Add syfs nodes for each xenbus device showing event statistics (number
of events and spurious events, number of associated event channels)
and for setting a spurious event threshold in case a frontend is
sending too many events without being rogue on purpose.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: https://lore.kernel.org/r/20210219154030.10892-7-jgross@suse.com
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 Documentation/ABI/testing/sysfs-devices-xenbus | 41 ++++++++++++++++
 drivers/xen/events/events_base.c               | 27 ++++++++++-
 drivers/xen/xenbus/xenbus_probe.c              | 66 ++++++++++++++++++++++++++
 include/xen/xenbus.h                           |  7 +++
 4 files changed, 139 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-devices-xenbus

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-devices-xenbus b/Documentation/ABI/testing/sysfs-devices-xenbus
new file mode 100644
index 000000000000..fd796cb4f315
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-xenbus
@@ -0,0 +1,41 @@
+What:		/sys/devices/*/xenbus/event_channels
+Date:		February 2021
+Contact:	Xen Developers mailing list <xen-devel@lists.xenproject.org>
+Description:
+		Number of Xen event channels associated with a kernel based
+		paravirtualized device frontend or backend.
+
+What:		/sys/devices/*/xenbus/events
+Date:		February 2021
+Contact:	Xen Developers mailing list <xen-devel@lists.xenproject.org>
+Description:
+		Total number of Xen events received for a Xen pv device
+		frontend or backend.
+
+What:		/sys/devices/*/xenbus/jiffies_eoi_delayed
+Date:		February 2021
+Contact:	Xen Developers mailing list <xen-devel@lists.xenproject.org>
+Description:
+		Summed up time in jiffies the EOI of an interrupt for a Xen
+		pv device has been delayed in order to avoid stalls due to
+		event storms. This value rising is a first sign for a rogue
+		other end of the pv device.
+
+What:		/sys/devices/*/xenbus/spurious_events
+Date:		February 2021
+Contact:	Xen Developers mailing list <xen-devel@lists.xenproject.org>
+Description:
+		Number of events received for a Xen pv device which did not
+		require any action. Too many spurious events in a row will
+		trigger delayed EOI processing.
+
+What:		/sys/devices/*/xenbus/spurious_threshold
+Date:		February 2021
+Contact:	Xen Developers mailing list <xen-devel@lists.xenproject.org>
+Description:
+		Controls the tolerated number of subsequent spurious events
+		before delayed EOI processing is triggered for a Xen pv
+		device. Default is 1. This can be modified in case the other
+		end of the pv device is issuing spurious events on a regular
+		basis and is known not to be malicious on purpose. Raising
+		the value for such cases can improve pv device performance.
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b249f2d6b0cc..adb7260e94b2 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -323,6 +323,8 @@ static int xen_irq_info_evtchn_setup(unsigned irq,
 
 	ret = xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0);
 	info->u.interdomain = dev;
+	if (dev)
+		atomic_inc(&dev->event_channels);
 
 	return ret;
 }
@@ -568,18 +570,28 @@ static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious)
 		return;
 
 	if (spurious) {
+		struct xenbus_device *dev = info->u.interdomain;
+		unsigned int threshold = 1;
+
+		if (dev && dev->spurious_threshold)
+			threshold = dev->spurious_threshold;
+
 		if ((1 << info->spurious_cnt) < (HZ << 2)) {
 			if (info->spurious_cnt != 0xFF)
 				info->spurious_cnt++;
 		}
-		if (info->spurious_cnt > 1) {
-			delay = 1 << (info->spurious_cnt - 2);
+		if (info->spurious_cnt > threshold) {
+			delay = 1 << (info->spurious_cnt - 1 - threshold);
 			if (delay > HZ)
 				delay = HZ;
 			if (!info->eoi_time)
 				info->eoi_cpu = smp_processor_id();
 			info->eoi_time = get_jiffies_64() + delay;
+			if (dev)
+				atomic_add(delay, &dev->jiffies_eoi_delayed);
 		}
+		if (dev)
+			atomic_inc(&dev->spurious_events);
 	} else {
 		info->spurious_cnt = 0;
 	}
@@ -908,6 +920,7 @@ static void __unbind_from_irq(unsigned int irq)
 
 	if (VALID_EVTCHN(evtchn)) {
 		unsigned int cpu = cpu_from_irq(irq);
+		struct xenbus_device *dev;
 
 		xen_evtchn_close(evtchn);
 
@@ -918,6 +931,11 @@ static void __unbind_from_irq(unsigned int irq)
 		case IRQT_IPI:
 			per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1;
 			break;
+		case IRQT_EVTCHN:
+			dev = info->u.interdomain;
+			if (dev)
+				atomic_dec(&dev->event_channels);
+			break;
 		default:
 			break;
 		}
@@ -1581,6 +1599,7 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl)
 {
 	int irq;
 	struct irq_info *info;
+	struct xenbus_device *dev;
 
 	irq = get_evtchn_to_irq(port);
 	if (irq == -1)
@@ -1610,6 +1629,10 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl)
 
 	info = info_for_irq(irq);
 
+	dev = (info->type == IRQT_EVTCHN) ? info->u.interdomain : NULL;
+	if (dev)
+		atomic_inc(&dev->events);
+
 	if (ctrl->defer_eoi) {
 		info->eoi_cpu = smp_processor_id();
 		info->irq_epoch = __this_cpu_read(irq_epoch);
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 8a75092bb148..97f0d234482d 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -206,6 +206,65 @@ void xenbus_otherend_changed(struct xenbus_watch *watch,
 }
 EXPORT_SYMBOL_GPL(xenbus_otherend_changed);
 
+#define XENBUS_SHOW_STAT(name)						\
+static ssize_t show_##name(struct device *_dev,				\
+			   struct device_attribute *attr,		\
+			   char *buf)					\
+{									\
+	struct xenbus_device *dev = to_xenbus_device(_dev);		\
+									\
+	return sprintf(buf, "%d\n", atomic_read(&dev->name));		\
+}									\
+static DEVICE_ATTR(name, 0444, show_##name, NULL)
+
+XENBUS_SHOW_STAT(event_channels);
+XENBUS_SHOW_STAT(events);
+XENBUS_SHOW_STAT(spurious_events);
+XENBUS_SHOW_STAT(jiffies_eoi_delayed);
+
+static ssize_t show_spurious_threshold(struct device *_dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct xenbus_device *dev = to_xenbus_device(_dev);
+
+	return sprintf(buf, "%d\n", dev->spurious_threshold);
+}
+
+static ssize_t set_spurious_threshold(struct device *_dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	struct xenbus_device *dev = to_xenbus_device(_dev);
+	unsigned int val;
+	ssize_t ret;
+
+	ret = kstrtouint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	dev->spurious_threshold = val;
+
+	return count;
+}
+
+static DEVICE_ATTR(spurious_threshold, 0644, show_spurious_threshold,
+		   set_spurious_threshold);
+
+static struct attribute *xenbus_attrs[] = {
+	&dev_attr_event_channels.attr,
+	&dev_attr_events.attr,
+	&dev_attr_spurious_events.attr,
+	&dev_attr_jiffies_eoi_delayed.attr,
+	&dev_attr_spurious_threshold.attr,
+	NULL
+};
+
+static const struct attribute_group xenbus_group = {
+	.name = "xenbus",
+	.attrs = xenbus_attrs,
+};
+
 int xenbus_dev_probe(struct device *_dev)
 {
 	struct xenbus_device *dev = to_xenbus_device(_dev);
@@ -253,6 +312,11 @@ int xenbus_dev_probe(struct device *_dev)
 		return err;
 	}
 
+	dev->spurious_threshold = 1;
+	if (sysfs_create_group(&dev->dev.kobj, &xenbus_group))
+		dev_warn(&dev->dev, "sysfs_create_group on %s failed.\n",
+			 dev->nodename);
+
 	return 0;
 fail_put:
 	module_put(drv->driver.owner);
@@ -269,6 +333,8 @@ int xenbus_dev_remove(struct device *_dev)
 
 	DPRINTK("%s", dev->nodename);
 
+	sysfs_remove_group(&dev->dev.kobj, &xenbus_group);
+
 	free_otherend_watch(dev);
 
 	if (drv->remove) {
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index bf3cfc7c35d0..0b1386073d49 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -88,6 +88,13 @@ struct xenbus_device {
 	struct completion down;
 	struct work_struct work;
 	struct semaphore reclaim_sem;
+
+	/* Event channel based statistics and settings. */
+	atomic_t event_channels;
+	atomic_t events;
+	atomic_t spurious_events;
+	atomic_t jiffies_eoi_delayed;
+	unsigned int spurious_threshold;
 };
 
 static inline struct xenbus_device *to_xenbus_device(struct device *dev)
-- 
cgit v1.2.3


From c0ea57608b691d6cde8aff23e11f9858a86b5918 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 16 Feb 2021 16:52:47 +0100
Subject: blktrace: remove debugfs file dentries from struct blk_trace

These debugfs dentries do not need to be saved for anything as the whole
directory and everything in it is properly cleaned up when the parent
directory is removed.  So remove them from struct blk_trace and don't
save them when created as it's not necessary.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-block@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blktrace_api.h | 2 --
 kernel/trace/blktrace.c      | 8 ++------
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index e17d04abf6a3..a083e15df608 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -23,8 +23,6 @@ struct blk_trace {
 	u32 pid;
 	u32 dev;
 	struct dentry *dir;
-	struct dentry *dropped_file;
-	struct dentry *msg_file;
 	struct list_head running_list;
 	atomic_t dropped;
 };
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d7ebef83771c..0c8690d9f6cd 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -311,8 +311,6 @@ record_it:
 
 static void blk_trace_free(struct blk_trace *bt)
 {
-	debugfs_remove(bt->msg_file);
-	debugfs_remove(bt->dropped_file);
 	relay_close(bt->rchan);
 	debugfs_remove(bt->dir);
 	free_percpu(bt->sequence);
@@ -544,10 +542,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	INIT_LIST_HEAD(&bt->running_list);
 
 	ret = -EIO;
-	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
-					       &blk_dropped_fops);
-
-	bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+	debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+	debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
 
 	bt->rchan = relay_open("trace", dir, buts->buf_size,
 				buts->buf_nr, &blk_relay_callbacks, bt);
-- 
cgit v1.2.3


From ee576c47db60432c37e54b1e2b43a8ca6d3a8dca Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 23 Feb 2021 14:18:58 +0100
Subject: net: icmp: pass zeroed opts from icmp{,v6}_ndo_send before sending

The icmp{,v6}_send functions make all sorts of use of skb->cb, casting
it with IPCB or IP6CB, assuming the skb to have come directly from the
inet layer. But when the packet comes from the ndo layer, especially
when forwarded, there's no telling what might be in skb->cb at that
point. As a result, the icmp sending code risks reading bogus memory
contents, which can result in nasty stack overflows such as this one
reported by a user:

    panic+0x108/0x2ea
    __stack_chk_fail+0x14/0x20
    __icmp_send+0x5bd/0x5c0
    icmp_ndo_send+0x148/0x160

In icmp_send, skb->cb is cast with IPCB and an ip_options struct is read
from it. The optlen parameter there is of particular note, as it can
induce writes beyond bounds. There are quite a few ways that can happen
in __ip_options_echo. For example:

    // sptr/skb are attacker-controlled skb bytes
    sptr = skb_network_header(skb);
    // dptr/dopt points to stack memory allocated by __icmp_send
    dptr = dopt->__data;
    // sopt is the corrupt skb->cb in question
    if (sopt->rr) {
        optlen  = sptr[sopt->rr+1]; // corrupt skb->cb + skb->data
        soffset = sptr[sopt->rr+2]; // corrupt skb->cb + skb->data
	// this now writes potentially attacker-controlled data, over
	// flowing the stack:
        memcpy(dptr, sptr+sopt->rr, optlen);
    }

In the icmpv6_send case, the story is similar, but not as dire, as only
IP6CB(skb)->iif and IP6CB(skb)->dsthao are used. The dsthao case is
worse than the iif case, but it is passed to ipv6_find_tlv, which does
a bit of bounds checking on the value.

This is easy to simulate by doing a `memset(skb->cb, 0x41,
sizeof(skb->cb));` before calling icmp{,v6}_ndo_send, and it's only by
good fortune and the rarity of icmp sending from that context that we've
avoided reports like this until now. For example, in KASAN:

    BUG: KASAN: stack-out-of-bounds in __ip_options_echo+0xa0e/0x12b0
    Write of size 38 at addr ffff888006f1f80e by task ping/89
    CPU: 2 PID: 89 Comm: ping Not tainted 5.10.0-rc7-debug+ #5
    Call Trace:
     dump_stack+0x9a/0xcc
     print_address_description.constprop.0+0x1a/0x160
     __kasan_report.cold+0x20/0x38
     kasan_report+0x32/0x40
     check_memory_region+0x145/0x1a0
     memcpy+0x39/0x60
     __ip_options_echo+0xa0e/0x12b0
     __icmp_send+0x744/0x1700

Actually, out of the 4 drivers that do this, only gtp zeroed the cb for
the v4 case, while the rest did not. So this commit actually removes the
gtp-specific zeroing, while putting the code where it belongs in the
shared infrastructure of icmp{,v6}_ndo_send.

This commit fixes the issue by passing an empty IPCB or IP6CB along to
the functions that actually do the work. For the icmp_send, this was
already trivial, thanks to __icmp_send providing the plumbing function.
For icmpv6_send, this required a tiny bit of refactoring to make it
behave like the v4 case, after which it was straight forward.

Fixes: a2b78e9b2cac ("sunvnet: generate ICMP PTMUD messages for smaller port MTUs")
Reported-by: SinYu <liuxyon@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/netdev/CAF=yD-LOF116aHub6RMe8vB8ZpnrrnoTdqhobEx+bvoA8AsP0w@mail.gmail.com/T/
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20210223131858.72082-1-Jason@zx2c4.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/gtp.c      |  1 -
 include/linux/icmpv6.h | 26 ++++++++++++++++++++------
 include/linux/ipv6.h   |  1 -
 include/net/icmp.h     |  6 +++++-
 net/ipv4/icmp.c        |  5 +++--
 net/ipv6/icmp.c        | 18 +++++++++---------
 net/ipv6/ip6_icmp.c    | 12 +++++++-----
 7 files changed, 44 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 9a70f05baf6e..39c00f050fbd 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -543,7 +543,6 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
 	if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) &&
 	    mtu < ntohs(iph->tot_len)) {
 		netdev_dbg(dev, "packet too big, fragmentation needed\n");
-		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
 		icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 			      htonl(mtu));
 		goto err_rt;
diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h
index 452d8978ffc7..9055cb380ee2 100644
--- a/include/linux/icmpv6.h
+++ b/include/linux/icmpv6.h
@@ -3,6 +3,7 @@
 #define _LINUX_ICMPV6_H
 
 #include <linux/skbuff.h>
+#include <linux/ipv6.h>
 #include <uapi/linux/icmpv6.h>
 
 static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb)
@@ -15,13 +16,16 @@ static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb)
 #if IS_ENABLED(CONFIG_IPV6)
 
 typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info,
-			     const struct in6_addr *force_saddr);
+			     const struct in6_addr *force_saddr,
+			     const struct inet6_skb_parm *parm);
 void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
-		const struct in6_addr *force_saddr);
+		const struct in6_addr *force_saddr,
+		const struct inet6_skb_parm *parm);
 #if IS_BUILTIN(CONFIG_IPV6)
-static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+				 const struct inet6_skb_parm *parm)
 {
-	icmp6_send(skb, type, code, info, NULL);
+	icmp6_send(skb, type, code, info, NULL, parm);
 }
 static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn)
 {
@@ -34,18 +38,28 @@ static inline int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn)
 	return 0;
 }
 #else
-extern void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info);
+extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+			  const struct inet6_skb_parm *parm);
 extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn);
 extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn);
 #endif
 
+static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+{
+	__icmpv6_send(skb, type, code, info, IP6CB(skb));
+}
+
 int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
 			       unsigned int data_len);
 
 #if IS_ENABLED(CONFIG_NF_NAT)
 void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info);
 #else
-#define icmpv6_ndo_send icmpv6_send
+static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
+{
+	struct inet6_skb_parm parm = { 0 };
+	__icmpv6_send(skb_in, type, code, info, &parm);
+}
 #endif
 
 #else
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 9d1f29f0c512..70b2ad3b9884 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -85,7 +85,6 @@ struct ipv6_params {
 	__s32 autoconf;
 };
 extern struct ipv6_params ipv6_defaults;
-#include <linux/icmpv6.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 
diff --git a/include/net/icmp.h b/include/net/icmp.h
index 9ac2d2672a93..fd84adc47963 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -46,7 +46,11 @@ static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32
 #if IS_ENABLED(CONFIG_NF_NAT)
 void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info);
 #else
-#define icmp_ndo_send icmp_send
+static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+{
+	struct ip_options opts = { 0 };
+	__icmp_send(skb_in, type, code, info, &opts);
+}
 #endif
 
 int icmp_rcv(struct sk_buff *skb);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 396b492c804f..616e2dc1c8fa 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -775,13 +775,14 @@ EXPORT_SYMBOL(__icmp_send);
 void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 {
 	struct sk_buff *cloned_skb = NULL;
+	struct ip_options opts = { 0 };
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 	__be32 orig_ip;
 
 	ct = nf_ct_get(skb_in, &ctinfo);
 	if (!ct || !(ct->status & IPS_SRC_NAT)) {
-		icmp_send(skb_in, type, code, info);
+		__icmp_send(skb_in, type, code, info, &opts);
 		return;
 	}
 
@@ -796,7 +797,7 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 
 	orig_ip = ip_hdr(skb_in)->saddr;
 	ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip;
-	icmp_send(skb_in, type, code, info);
+	__icmp_send(skb_in, type, code, info, &opts);
 	ip_hdr(skb_in)->saddr = orig_ip;
 out:
 	consume_skb(cloned_skb);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index f3d05866692e..fd1f896115c1 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -331,10 +331,9 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st
 }
 
 #if IS_ENABLED(CONFIG_IPV6_MIP6)
-static void mip6_addr_swap(struct sk_buff *skb)
+static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt)
 {
 	struct ipv6hdr *iph = ipv6_hdr(skb);
-	struct inet6_skb_parm *opt = IP6CB(skb);
 	struct ipv6_destopt_hao *hao;
 	struct in6_addr tmp;
 	int off;
@@ -351,7 +350,7 @@ static void mip6_addr_swap(struct sk_buff *skb)
 	}
 }
 #else
-static inline void mip6_addr_swap(struct sk_buff *skb) {}
+static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {}
 #endif
 
 static struct dst_entry *icmpv6_route_lookup(struct net *net,
@@ -446,7 +445,8 @@ static int icmp6_iif(const struct sk_buff *skb)
  *	Send an ICMP message in response to a packet in error
  */
 void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
-		const struct in6_addr *force_saddr)
+		const struct in6_addr *force_saddr,
+		const struct inet6_skb_parm *parm)
 {
 	struct inet6_dev *idev = NULL;
 	struct ipv6hdr *hdr = ipv6_hdr(skb);
@@ -542,7 +542,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type))
 		goto out_bh_enable;
 
-	mip6_addr_swap(skb);
+	mip6_addr_swap(skb, parm);
 
 	sk = icmpv6_xmit_lock(net);
 	if (!sk)
@@ -559,7 +559,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 		/* select a more meaningful saddr from input if */
 		struct net_device *in_netdev;
 
-		in_netdev = dev_get_by_index(net, IP6CB(skb)->iif);
+		in_netdev = dev_get_by_index(net, parm->iif);
 		if (in_netdev) {
 			ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr,
 					   inet6_sk(sk)->srcprefs,
@@ -640,7 +640,7 @@ EXPORT_SYMBOL(icmp6_send);
  */
 void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
 {
-	icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL);
+	icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb));
 	kfree_skb(skb);
 }
 
@@ -697,10 +697,10 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
 	}
 	if (type == ICMP_TIME_EXCEEDED)
 		icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
-			   info, &temp_saddr);
+			   info, &temp_saddr, IP6CB(skb2));
 	else
 		icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH,
-			   info, &temp_saddr);
+			   info, &temp_saddr, IP6CB(skb2));
 	if (rt)
 		ip6_rt_put(rt);
 
diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c
index 70c8c2f36c98..9e3574880cb0 100644
--- a/net/ipv6/ip6_icmp.c
+++ b/net/ipv6/ip6_icmp.c
@@ -33,23 +33,25 @@ int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn)
 }
 EXPORT_SYMBOL(inet6_unregister_icmp_sender);
 
-void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+		   const struct inet6_skb_parm *parm)
 {
 	ip6_icmp_send_t *send;
 
 	rcu_read_lock();
 	send = rcu_dereference(ip6_icmp_send);
 	if (send)
-		send(skb, type, code, info, NULL);
+		send(skb, type, code, info, NULL, parm);
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL(icmpv6_send);
+EXPORT_SYMBOL(__icmpv6_send);
 #endif
 
 #if IS_ENABLED(CONFIG_NF_NAT)
 #include <net/netfilter/nf_conntrack.h>
 void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
 {
+	struct inet6_skb_parm parm = { 0 };
 	struct sk_buff *cloned_skb = NULL;
 	enum ip_conntrack_info ctinfo;
 	struct in6_addr orig_ip;
@@ -57,7 +59,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
 
 	ct = nf_ct_get(skb_in, &ctinfo);
 	if (!ct || !(ct->status & IPS_SRC_NAT)) {
-		icmpv6_send(skb_in, type, code, info);
+		__icmpv6_send(skb_in, type, code, info, &parm);
 		return;
 	}
 
@@ -72,7 +74,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
 
 	orig_ip = ipv6_hdr(skb_in)->saddr;
 	ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6;
-	icmpv6_send(skb_in, type, code, info);
+	__icmpv6_send(skb_in, type, code, info, &parm);
 	ipv6_hdr(skb_in)->saddr = orig_ip;
 out:
 	consume_skb(cloned_skb);
-- 
cgit v1.2.3


From fa8fef0e104a23efe568b835d9e7e188d1d97610 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:27:55 +0530
Subject: PCI: endpoint: Add helper API to get the 'next' unreserved BAR

Add an API to get the next unreserved BAR starting from a given BAR number
that can be used by the endpoint function.

Link: https://lore.kernel.org/r/20210201195809.7342-4-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/pci-epc-core.c | 26 ++++++++++++++++++++++----
 include/linux/pci-epc.h             |  2 ++
 2 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 25e57672e1a1..1afe5d9afb0d 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -87,17 +87,36 @@ EXPORT_SYMBOL_GPL(pci_epc_get);
  * pci_epc_get_first_free_bar() - helper to get first unreserved BAR
  * @epc_features: pci_epc_features structure that holds the reserved bar bitmap
  *
- * Invoke to get the first unreserved BAR that can be used for endpoint
+ * Invoke to get the first unreserved BAR that can be used by the endpoint
  * function. For any incorrect value in reserved_bar return '0'.
  */
 unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features
 					*epc_features)
+{
+	return pci_epc_get_next_free_bar(epc_features, BAR_0);
+}
+EXPORT_SYMBOL_GPL(pci_epc_get_first_free_bar);
+
+/**
+ * pci_epc_get_next_free_bar() - helper to get unreserved BAR starting from @bar
+ * @epc_features: pci_epc_features structure that holds the reserved bar bitmap
+ * @bar: the starting BAR number from where unreserved BAR should be searched
+ *
+ * Invoke to get the next unreserved BAR starting from @bar that can be used
+ * for endpoint function. For any incorrect value in reserved_bar return '0'.
+ */
+unsigned int pci_epc_get_next_free_bar(const struct pci_epc_features
+				       *epc_features, enum pci_barno bar)
 {
 	unsigned long free_bar;
 
 	if (!epc_features)
 		return 0;
 
+	/* If 'bar - 1' is a 64-bit BAR, move to the next BAR */
+	if ((epc_features->bar_fixed_64bit << 1) & 1 << bar)
+		bar++;
+
 	/* Find if the reserved BAR is also a 64-bit BAR */
 	free_bar = epc_features->reserved_bar & epc_features->bar_fixed_64bit;
 
@@ -105,14 +124,13 @@ unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features
 	free_bar <<= 1;
 	free_bar |= epc_features->reserved_bar;
 
-	/* Now find the free BAR */
-	free_bar = ffz(free_bar);
+	free_bar = find_next_zero_bit(&free_bar, 6, bar);
 	if (free_bar > 5)
 		return 0;
 
 	return free_bar;
 }
-EXPORT_SYMBOL_GPL(pci_epc_get_first_free_bar);
+EXPORT_SYMBOL_GPL(pci_epc_get_next_free_bar);
 
 /**
  * pci_epc_get_features() - get the features supported by EPC
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index cc66bec8be90..cfe9b427e6b7 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -203,6 +203,8 @@ const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc,
 						    u8 func_no);
 unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features
 					*epc_features);
+unsigned int pci_epc_get_next_free_bar(const struct pci_epc_features
+				       *epc_features, enum pci_barno bar);
 struct pci_epc *pci_epc_get(const char *epc_name);
 void pci_epc_put(struct pci_epc *epc);
 
-- 
cgit v1.2.3


From 0e27aeccfa3d1bab7c6a29fb8e6fcedbad7b09a8 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:27:56 +0530
Subject: PCI: endpoint: Make *_free_bar() to return error codes on failure

Modify pci_epc_get_next_free_bar() and pci_epc_get_first_free_bar() to
return error values if there are no free BARs available.

Link: https://lore.kernel.org/r/20210201195809.7342-5-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/functions/pci-epf-test.c |  2 ++
 drivers/pci/endpoint/pci-epc-core.c           | 12 ++++++------
 include/linux/pci-epc.h                       |  8 ++++----
 include/linux/pci-epf.h                       |  1 +
 4 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index e4e51d884553..7a1f3abfde48 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -834,6 +834,8 @@ static int pci_epf_test_bind(struct pci_epf *epf)
 		linkup_notifier = epc_features->linkup_notifier;
 		core_init_notifier = epc_features->core_init_notifier;
 		test_reg_bar = pci_epc_get_first_free_bar(epc_features);
+		if (test_reg_bar < 0)
+			return -EINVAL;
 		pci_epf_configure_bar(epf, epc_features);
 	}
 
diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 1afe5d9afb0d..ea7e7465ce7a 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -90,8 +90,8 @@ EXPORT_SYMBOL_GPL(pci_epc_get);
  * Invoke to get the first unreserved BAR that can be used by the endpoint
  * function. For any incorrect value in reserved_bar return '0'.
  */
-unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features
-					*epc_features)
+enum pci_barno
+pci_epc_get_first_free_bar(const struct pci_epc_features *epc_features)
 {
 	return pci_epc_get_next_free_bar(epc_features, BAR_0);
 }
@@ -105,13 +105,13 @@ EXPORT_SYMBOL_GPL(pci_epc_get_first_free_bar);
  * Invoke to get the next unreserved BAR starting from @bar that can be used
  * for endpoint function. For any incorrect value in reserved_bar return '0'.
  */
-unsigned int pci_epc_get_next_free_bar(const struct pci_epc_features
-				       *epc_features, enum pci_barno bar)
+enum pci_barno pci_epc_get_next_free_bar(const struct pci_epc_features
+					 *epc_features, enum pci_barno bar)
 {
 	unsigned long free_bar;
 
 	if (!epc_features)
-		return 0;
+		return BAR_0;
 
 	/* If 'bar - 1' is a 64-bit BAR, move to the next BAR */
 	if ((epc_features->bar_fixed_64bit << 1) & 1 << bar)
@@ -126,7 +126,7 @@ unsigned int pci_epc_get_next_free_bar(const struct pci_epc_features
 
 	free_bar = find_next_zero_bit(&free_bar, 6, bar);
 	if (free_bar > 5)
-		return 0;
+		return NO_BAR;
 
 	return free_bar;
 }
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index cfe9b427e6b7..88d311bad984 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -201,10 +201,10 @@ int pci_epc_start(struct pci_epc *epc);
 void pci_epc_stop(struct pci_epc *epc);
 const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc,
 						    u8 func_no);
-unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features
-					*epc_features);
-unsigned int pci_epc_get_next_free_bar(const struct pci_epc_features
-				       *epc_features, enum pci_barno bar);
+enum pci_barno
+pci_epc_get_first_free_bar(const struct pci_epc_features *epc_features);
+enum pci_barno pci_epc_get_next_free_bar(const struct pci_epc_features
+					 *epc_features, enum pci_barno bar);
 struct pci_epc *pci_epc_get(const char *epc_name);
 void pci_epc_put(struct pci_epc *epc);
 
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 6644ff3b0702..fa3aca43eb19 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -21,6 +21,7 @@ enum pci_notify_event {
 };
 
 enum pci_barno {
+	NO_BAR = -1,
 	BAR_0,
 	BAR_1,
 	BAR_2,
-- 
cgit v1.2.3


From 7e5a51ebb321537c4209cdd0c54c4c19b3ef960d Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:27:57 +0530
Subject: PCI: endpoint: Remove unused pci_epf_match_device()

Remove unused pci_epf_match_device() function added in pci-epf-core.c

Link: https://lore.kernel.org/r/20210201195809.7342-6-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/pci-epf-core.c | 16 ----------------
 include/linux/pci-epf.h             |  2 --
 2 files changed, 18 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c
index c977cf9dce56..e44a317a2a2a 100644
--- a/drivers/pci/endpoint/pci-epf-core.c
+++ b/drivers/pci/endpoint/pci-epf-core.c
@@ -282,22 +282,6 @@ struct pci_epf *pci_epf_create(const char *name)
 }
 EXPORT_SYMBOL_GPL(pci_epf_create);
 
-const struct pci_epf_device_id *
-pci_epf_match_device(const struct pci_epf_device_id *id, struct pci_epf *epf)
-{
-	if (!id || !epf)
-		return NULL;
-
-	while (*id->name) {
-		if (strcmp(epf->name, id->name) == 0)
-			return id;
-		id++;
-	}
-
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(pci_epf_match_device);
-
 static void pci_epf_dev_release(struct device *dev)
 {
 	struct pci_epf *epf = to_pci_epf(dev);
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index fa3aca43eb19..f373a134ac04 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -165,8 +165,6 @@ static inline void *epf_get_drvdata(struct pci_epf *epf)
 	return dev_get_drvdata(&epf->dev);
 }
 
-const struct pci_epf_device_id *
-pci_epf_match_device(const struct pci_epf_device_id *id, struct pci_epf *epf);
 struct pci_epf *pci_epf_create(const char *name);
 void pci_epf_destroy(struct pci_epf *epf);
 int __pci_epf_register_driver(struct pci_epf_driver *driver,
-- 
cgit v1.2.3


From 63840ff5322373d665b2b9c59cd64233d5f0691e Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:27:58 +0530
Subject: PCI: endpoint: Add support to associate secondary EPC with EPF

In the case of standard endpoint functions, only one endpoint controller
(EPC) will be associated with an endpoint function (EPF). However for
providing NTB (non transparent bridge) functionality, two EPCs should be
associated with a single EPF.  Add support to associate secondary EPC with
EPF. This is in preparation for adding NTB endpoint function driver.

Link: https://lore.kernel.org/r/20210201195809.7342-7-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/functions/pci-epf-test.c | 11 ++++--
 drivers/pci/endpoint/pci-ep-cfs.c             |  6 +--
 drivers/pci/endpoint/pci-epc-core.c           | 47 ++++++++++++++++------
 drivers/pci/endpoint/pci-epf-core.c           | 57 +++++++++++++++++++--------
 include/linux/pci-epc.h                       | 25 +++++++++++-
 include/linux/pci-epf.h                       | 17 +++++++-
 6 files changed, 125 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index 7a1f3abfde48..c0ac4e9cbe72 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -619,7 +619,8 @@ static void pci_epf_test_unbind(struct pci_epf *epf)
 
 		if (epf_test->reg[bar]) {
 			pci_epc_clear_bar(epc, epf->func_no, epf_bar);
-			pci_epf_free_space(epf, epf_test->reg[bar], bar);
+			pci_epf_free_space(epf, epf_test->reg[bar], bar,
+					   PRIMARY_INTERFACE);
 		}
 	}
 }
@@ -651,7 +652,8 @@ static int pci_epf_test_set_bar(struct pci_epf *epf)
 
 		ret = pci_epc_set_bar(epc, epf->func_no, epf_bar);
 		if (ret) {
-			pci_epf_free_space(epf, epf_test->reg[bar], bar);
+			pci_epf_free_space(epf, epf_test->reg[bar], bar,
+					   PRIMARY_INTERFACE);
 			dev_err(dev, "Failed to set BAR%d\n", bar);
 			if (bar == test_reg_bar)
 				return ret;
@@ -771,7 +773,7 @@ static int pci_epf_test_alloc_space(struct pci_epf *epf)
 	}
 
 	base = pci_epf_alloc_space(epf, test_reg_size, test_reg_bar,
-				   epc_features->align);
+				   epc_features->align, PRIMARY_INTERFACE);
 	if (!base) {
 		dev_err(dev, "Failed to allocated register space\n");
 		return -ENOMEM;
@@ -789,7 +791,8 @@ static int pci_epf_test_alloc_space(struct pci_epf *epf)
 			continue;
 
 		base = pci_epf_alloc_space(epf, bar_size[bar], bar,
-					   epc_features->align);
+					   epc_features->align,
+					   PRIMARY_INTERFACE);
 		if (!base)
 			dev_err(dev, "Failed to allocate space for BAR%d\n",
 				bar);
diff --git a/drivers/pci/endpoint/pci-ep-cfs.c b/drivers/pci/endpoint/pci-ep-cfs.c
index 3710adf51912..6ca9e2f92460 100644
--- a/drivers/pci/endpoint/pci-ep-cfs.c
+++ b/drivers/pci/endpoint/pci-ep-cfs.c
@@ -94,13 +94,13 @@ static int pci_epc_epf_link(struct config_item *epc_item,
 	struct pci_epc *epc = epc_group->epc;
 	struct pci_epf *epf = epf_group->epf;
 
-	ret = pci_epc_add_epf(epc, epf);
+	ret = pci_epc_add_epf(epc, epf, PRIMARY_INTERFACE);
 	if (ret)
 		return ret;
 
 	ret = pci_epf_bind(epf);
 	if (ret) {
-		pci_epc_remove_epf(epc, epf);
+		pci_epc_remove_epf(epc, epf, PRIMARY_INTERFACE);
 		return ret;
 	}
 
@@ -120,7 +120,7 @@ static void pci_epc_epf_unlink(struct config_item *epc_item,
 	epc = epc_group->epc;
 	epf = epf_group->epf;
 	pci_epf_unbind(epf);
-	pci_epc_remove_epf(epc, epf);
+	pci_epc_remove_epf(epc, epf, PRIMARY_INTERFACE);
 }
 
 static struct configfs_item_operations pci_epc_item_ops = {
diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index ea7e7465ce7a..3693eca5b030 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -493,21 +493,28 @@ EXPORT_SYMBOL_GPL(pci_epc_write_header);
  * pci_epc_add_epf() - bind PCI endpoint function to an endpoint controller
  * @epc: the EPC device to which the endpoint function should be added
  * @epf: the endpoint function to be added
+ * @type: Identifies if the EPC is connected to the primary or secondary
+ *        interface of EPF
  *
  * A PCI endpoint device can have one or more functions. In the case of PCIe,
  * the specification allows up to 8 PCIe endpoint functions. Invoke
  * pci_epc_add_epf() to add a PCI endpoint function to an endpoint controller.
  */
-int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf)
+int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf,
+		    enum pci_epc_interface_type type)
 {
+	struct list_head *list;
 	u32 func_no;
 	int ret = 0;
 
-	if (epf->epc)
+	if (IS_ERR_OR_NULL(epc))
+		return -EINVAL;
+
+	if (type == PRIMARY_INTERFACE && epf->epc)
 		return -EBUSY;
 
-	if (IS_ERR(epc))
-		return -EINVAL;
+	if (type == SECONDARY_INTERFACE && epf->sec_epc)
+		return -EBUSY;
 
 	mutex_lock(&epc->lock);
 	func_no = find_first_zero_bit(&epc->function_num_map,
@@ -524,11 +531,17 @@ int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf)
 	}
 
 	set_bit(func_no, &epc->function_num_map);
-	epf->func_no = func_no;
-	epf->epc = epc;
-
-	list_add_tail(&epf->list, &epc->pci_epf);
+	if (type == PRIMARY_INTERFACE) {
+		epf->func_no = func_no;
+		epf->epc = epc;
+		list = &epf->list;
+	} else {
+		epf->sec_epc_func_no = func_no;
+		epf->sec_epc = epc;
+		list = &epf->sec_epc_list;
+	}
 
+	list_add_tail(list, &epc->pci_epf);
 ret:
 	mutex_unlock(&epc->lock);
 
@@ -543,14 +556,26 @@ EXPORT_SYMBOL_GPL(pci_epc_add_epf);
  *
  * Invoke to remove PCI endpoint function from the endpoint controller.
  */
-void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf)
+void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf,
+			enum pci_epc_interface_type type)
 {
+	struct list_head *list;
+	u32 func_no = 0;
+
 	if (!epc || IS_ERR(epc) || !epf)
 		return;
 
+	if (type == PRIMARY_INTERFACE) {
+		func_no = epf->func_no;
+		list = &epf->list;
+	} else {
+		func_no = epf->sec_epc_func_no;
+		list = &epf->sec_epc_list;
+	}
+
 	mutex_lock(&epc->lock);
-	clear_bit(epf->func_no, &epc->function_num_map);
-	list_del(&epf->list);
+	clear_bit(func_no, &epc->function_num_map);
+	list_del(list);
 	epf->epc = NULL;
 	mutex_unlock(&epc->lock);
 }
diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c
index e44a317a2a2a..79329ec6373c 100644
--- a/drivers/pci/endpoint/pci-epf-core.c
+++ b/drivers/pci/endpoint/pci-epf-core.c
@@ -74,24 +74,37 @@ EXPORT_SYMBOL_GPL(pci_epf_bind);
  * @epf: the EPF device from whom to free the memory
  * @addr: the virtual address of the PCI EPF register space
  * @bar: the BAR number corresponding to the register space
+ * @type: Identifies if the allocated space is for primary EPC or secondary EPC
  *
  * Invoke to free the allocated PCI EPF register space.
  */
-void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar)
+void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
+			enum pci_epc_interface_type type)
 {
 	struct device *dev = epf->epc->dev.parent;
+	struct pci_epf_bar *epf_bar;
+	struct pci_epc *epc;
 
 	if (!addr)
 		return;
 
-	dma_free_coherent(dev, epf->bar[bar].size, addr,
-			  epf->bar[bar].phys_addr);
+	if (type == PRIMARY_INTERFACE) {
+		epc = epf->epc;
+		epf_bar = epf->bar;
+	} else {
+		epc = epf->sec_epc;
+		epf_bar = epf->sec_epc_bar;
+	}
+
+	dev = epc->dev.parent;
+	dma_free_coherent(dev, epf_bar[bar].size, addr,
+			  epf_bar[bar].phys_addr);
 
-	epf->bar[bar].phys_addr = 0;
-	epf->bar[bar].addr = NULL;
-	epf->bar[bar].size = 0;
-	epf->bar[bar].barno = 0;
-	epf->bar[bar].flags = 0;
+	epf_bar[bar].phys_addr = 0;
+	epf_bar[bar].addr = NULL;
+	epf_bar[bar].size = 0;
+	epf_bar[bar].barno = 0;
+	epf_bar[bar].flags = 0;
 }
 EXPORT_SYMBOL_GPL(pci_epf_free_space);
 
@@ -101,15 +114,18 @@ EXPORT_SYMBOL_GPL(pci_epf_free_space);
  * @size: the size of the memory that has to be allocated
  * @bar: the BAR number corresponding to the allocated register space
  * @align: alignment size for the allocation region
+ * @type: Identifies if the allocation is for primary EPC or secondary EPC
  *
  * Invoke to allocate memory for the PCI EPF register space.
  */
 void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
-			  size_t align)
+			  size_t align, enum pci_epc_interface_type type)
 {
-	void *space;
-	struct device *dev = epf->epc->dev.parent;
+	struct pci_epf_bar *epf_bar;
 	dma_addr_t phys_addr;
+	struct pci_epc *epc;
+	struct device *dev;
+	void *space;
 
 	if (size < 128)
 		size = 128;
@@ -119,17 +135,26 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
 	else
 		size = roundup_pow_of_two(size);
 
+	if (type == PRIMARY_INTERFACE) {
+		epc = epf->epc;
+		epf_bar = epf->bar;
+	} else {
+		epc = epf->sec_epc;
+		epf_bar = epf->sec_epc_bar;
+	}
+
+	dev = epc->dev.parent;
 	space = dma_alloc_coherent(dev, size, &phys_addr, GFP_KERNEL);
 	if (!space) {
 		dev_err(dev, "failed to allocate mem space\n");
 		return NULL;
 	}
 
-	epf->bar[bar].phys_addr = phys_addr;
-	epf->bar[bar].addr = space;
-	epf->bar[bar].size = size;
-	epf->bar[bar].barno = bar;
-	epf->bar[bar].flags |= upper_32_bits(size) ?
+	epf_bar[bar].phys_addr = phys_addr;
+	epf_bar[bar].addr = space;
+	epf_bar[bar].size = size;
+	epf_bar[bar].barno = bar;
+	epf_bar[bar].flags |= upper_32_bits(size) ?
 				PCI_BASE_ADDRESS_MEM_TYPE_64 :
 				PCI_BASE_ADDRESS_MEM_TYPE_32;
 
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 88d311bad984..d9cb3944fb87 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -13,6 +13,12 @@
 
 struct pci_epc;
 
+enum pci_epc_interface_type {
+	UNKNOWN_INTERFACE = -1,
+	PRIMARY_INTERFACE,
+	SECONDARY_INTERFACE,
+};
+
 enum pci_epc_irq_type {
 	PCI_EPC_IRQ_UNKNOWN,
 	PCI_EPC_IRQ_LEGACY,
@@ -20,6 +26,19 @@ enum pci_epc_irq_type {
 	PCI_EPC_IRQ_MSIX,
 };
 
+static inline const char *
+pci_epc_interface_string(enum pci_epc_interface_type type)
+{
+	switch (type) {
+	case PRIMARY_INTERFACE:
+		return "primary";
+	case SECONDARY_INTERFACE:
+		return "secondary";
+	default:
+		return "UNKNOWN interface";
+	}
+}
+
 /**
  * struct pci_epc_ops - set of function pointers for performing EPC operations
  * @write_header: ops to populate configuration space header
@@ -175,10 +194,12 @@ __pci_epc_create(struct device *dev, const struct pci_epc_ops *ops,
 		 struct module *owner);
 void devm_pci_epc_destroy(struct device *dev, struct pci_epc *epc);
 void pci_epc_destroy(struct pci_epc *epc);
-int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf);
+int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf,
+		    enum pci_epc_interface_type type);
 void pci_epc_linkup(struct pci_epc *epc);
 void pci_epc_init_notify(struct pci_epc *epc);
-void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf);
+void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf,
+			enum pci_epc_interface_type type);
 int pci_epc_write_header(struct pci_epc *epc, u8 func_no,
 			 struct pci_epf_header *hdr);
 int pci_epc_set_bar(struct pci_epc *epc, u8 func_no,
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index f373a134ac04..1dc66824f5a8 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -14,6 +14,7 @@
 #include <linux/pci.h>
 
 struct pci_epf;
+enum pci_epc_interface_type;
 
 enum pci_notify_event {
 	CORE_INIT,
@@ -119,6 +120,11 @@ struct pci_epf_bar {
  * @list: to add pci_epf as a list of PCI endpoint functions to pci_epc
  * @nb: notifier block to notify EPF of any EPC events (like linkup)
  * @lock: mutex to protect pci_epf_ops
+ * @sec_epc: the secondary EPC device to which this EPF device is bound
+ * @sec_epc_list: to add pci_epf as list of PCI endpoint functions to secondary
+ *   EPC device
+ * @sec_epc_bar: represents the BAR of EPF device associated with secondary EPC
+ * @sec_epc_func_no: unique (physical) function number within the secondary EPC
  */
 struct pci_epf {
 	struct device		dev;
@@ -135,6 +141,12 @@ struct pci_epf {
 	struct notifier_block   nb;
 	/* mutex to protect against concurrent access of pci_epf_ops */
 	struct mutex		lock;
+
+	/* Below members are to attach secondary EPC to an endpoint function */
+	struct pci_epc		*sec_epc;
+	struct list_head	sec_epc_list;
+	struct pci_epf_bar	sec_epc_bar[6];
+	u8			sec_epc_func_no;
 };
 
 /**
@@ -171,8 +183,9 @@ int __pci_epf_register_driver(struct pci_epf_driver *driver,
 			      struct module *owner);
 void pci_epf_unregister_driver(struct pci_epf_driver *driver);
 void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
-			  size_t align);
-void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar);
+			  size_t align, enum pci_epc_interface_type type);
+void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
+			enum pci_epc_interface_type type);
 int pci_epf_bind(struct pci_epf *epf);
 void pci_epf_unbind(struct pci_epf *epf);
 #endif /* __LINUX_PCI_EPF_H */
-- 
cgit v1.2.3


From 87d5972e476f6c4e98a0abce713c54c6f40661b0 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:28:00 +0530
Subject: PCI: endpoint: Add pci_epc_ops to map MSI IRQ

Add pci_epc_ops to map physical address to MSI address and return MSI data.
The physical address is an address in the outbound region. This is required
to implement doorbell functionality of NTB (non-transparent bridge) wherein
EPC on either side of the interface (primary and secondary) can directly
write to the physical address (in outbound region) of the other interface
to ring doorbell using MSI.

Link: https://lore.kernel.org/r/20210201195809.7342-9-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/pci-epc-core.c | 41 +++++++++++++++++++++++++++++++++++++
 include/linux/pci-epc.h             |  8 ++++++++
 2 files changed, 49 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 3693eca5b030..cc8f9eb2b177 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -230,6 +230,47 @@ int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no,
 }
 EXPORT_SYMBOL_GPL(pci_epc_raise_irq);
 
+/**
+ * pci_epc_map_msi_irq() - Map physical address to MSI address and return
+ *                         MSI data
+ * @epc: the EPC device which has the MSI capability
+ * @func_no: the physical endpoint function number in the EPC device
+ * @phys_addr: the physical address of the outbound region
+ * @interrupt_num: the MSI interrupt number
+ * @entry_size: Size of Outbound address region for each interrupt
+ * @msi_data: the data that should be written in order to raise MSI interrupt
+ *            with interrupt number as 'interrupt num'
+ * @msi_addr_offset: Offset of MSI address from the aligned outbound address
+ *                   to which the MSI address is mapped
+ *
+ * Invoke to map physical address to MSI address and return MSI data. The
+ * physical address should be an address in the outbound region. This is
+ * required to implement doorbell functionality of NTB wherein EPC on either
+ * side of the interface (primary and secondary) can directly write to the
+ * physical address (in outbound region) of the other interface to ring
+ * doorbell.
+ */
+int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, phys_addr_t phys_addr,
+			u8 interrupt_num, u32 entry_size, u32 *msi_data,
+			u32 *msi_addr_offset)
+{
+	int ret;
+
+	if (IS_ERR_OR_NULL(epc))
+		return -EINVAL;
+
+	if (!epc->ops->map_msi_irq)
+		return -EINVAL;
+
+	mutex_lock(&epc->lock);
+	ret = epc->ops->map_msi_irq(epc, func_no, phys_addr, interrupt_num,
+				    entry_size, msi_data, msi_addr_offset);
+	mutex_unlock(&epc->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pci_epc_map_msi_irq);
+
 /**
  * pci_epc_get_msi() - get the number of MSI interrupt numbers allocated
  * @epc: the EPC device to which MSI interrupts was requested
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index d9cb3944fb87..b82c9b100e97 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -55,6 +55,7 @@ pci_epc_interface_string(enum pci_epc_interface_type type)
  * @get_msix: ops to get the number of MSI-X interrupts allocated by the RC
  *	     from the MSI-X capability register
  * @raise_irq: ops to raise a legacy, MSI or MSI-X interrupt
+ * @map_msi_irq: ops to map physical address to MSI address and return MSI data
  * @start: ops to start the PCI link
  * @stop: ops to stop the PCI link
  * @owner: the module owner containing the ops
@@ -77,6 +78,10 @@ struct pci_epc_ops {
 	int	(*get_msix)(struct pci_epc *epc, u8 func_no);
 	int	(*raise_irq)(struct pci_epc *epc, u8 func_no,
 			     enum pci_epc_irq_type type, u16 interrupt_num);
+	int	(*map_msi_irq)(struct pci_epc *epc, u8 func_no,
+			       phys_addr_t phys_addr, u8 interrupt_num,
+			       u32 entry_size, u32 *msi_data,
+			       u32 *msi_addr_offset);
 	int	(*start)(struct pci_epc *epc);
 	void	(*stop)(struct pci_epc *epc);
 	const struct pci_epc_features* (*get_features)(struct pci_epc *epc,
@@ -216,6 +221,9 @@ int pci_epc_get_msi(struct pci_epc *epc, u8 func_no);
 int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts,
 		     enum pci_barno, u32 offset);
 int pci_epc_get_msix(struct pci_epc *epc, u8 func_no);
+int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no,
+			phys_addr_t phys_addr, u8 interrupt_num,
+			u32 entry_size, u32 *msi_data, u32 *msi_addr_offset);
 int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no,
 		      enum pci_epc_irq_type type, u16 interrupt_num);
 int pci_epc_start(struct pci_epc *epc);
-- 
cgit v1.2.3


From 256ae475201b16fd69e00dd6c2d14035e4ea5745 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:28:01 +0530
Subject: PCI: endpoint: Add pci_epf_ops to expose function-specific attrs

In addition to the attributes that are generic across function drivers
documented in Documentation/PCI/endpoint/pci-endpoint-cfs.rst, there could
be function-specific attributes that has to be exposed by the function
driver to be configured by the user. Add ->add_cfs() in pci_epf_ops to be
populated by the function driver if it has to expose any function-specific
attributes and pci_epf_type_add_cfs() to be invoked by pci-ep-cfs.c when
sub-directory to main function directory is created.

Link: https://lore.kernel.org/r/20210201195809.7342-10-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/pci-epf-core.c | 32 ++++++++++++++++++++++++++++++++
 include/linux/pci-epf.h             |  5 +++++
 2 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c
index 79329ec6373c..7646c8660d42 100644
--- a/drivers/pci/endpoint/pci-epf-core.c
+++ b/drivers/pci/endpoint/pci-epf-core.c
@@ -20,6 +20,38 @@ static DEFINE_MUTEX(pci_epf_mutex);
 static struct bus_type pci_epf_bus_type;
 static const struct device_type pci_epf_type;
 
+/**
+ * pci_epf_type_add_cfs() - Help function drivers to expose function specific
+ *                          attributes in configfs
+ * @epf: the EPF device that has to be configured using configfs
+ * @group: the parent configfs group (corresponding to entries in
+ *         pci_epf_device_id)
+ *
+ * Invoke to expose function specific attributes in configfs. If the function
+ * driver does not have anything to expose (attributes configured by user),
+ * return NULL.
+ */
+struct config_group *pci_epf_type_add_cfs(struct pci_epf *epf,
+					  struct config_group *group)
+{
+	struct config_group *epf_type_group;
+
+	if (!epf->driver) {
+		dev_err(&epf->dev, "epf device not bound to driver\n");
+		return NULL;
+	}
+
+	if (!epf->driver->ops->add_cfs)
+		return NULL;
+
+	mutex_lock(&epf->lock);
+	epf_type_group = epf->driver->ops->add_cfs(epf, group);
+	mutex_unlock(&epf->lock);
+
+	return epf_type_group;
+}
+EXPORT_SYMBOL_GPL(pci_epf_type_add_cfs);
+
 /**
  * pci_epf_unbind() - Notify the function driver that the binding between the
  *		      EPF device and EPC device has been lost
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 1dc66824f5a8..b241e7dd171f 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -62,10 +62,13 @@ struct pci_epf_header {
  * @bind: ops to perform when a EPC device has been bound to EPF device
  * @unbind: ops to perform when a binding has been lost between a EPC device
  *	    and EPF device
+ * @add_cfs: ops to initialize function specific configfs attributes
  */
 struct pci_epf_ops {
 	int	(*bind)(struct pci_epf *epf);
 	void	(*unbind)(struct pci_epf *epf);
+	struct config_group *(*add_cfs)(struct pci_epf *epf,
+					struct config_group *group);
 };
 
 /**
@@ -188,4 +191,6 @@ void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
 			enum pci_epc_interface_type type);
 int pci_epf_bind(struct pci_epf *epf);
 void pci_epf_unbind(struct pci_epf *epf);
+struct config_group *pci_epf_type_add_cfs(struct pci_epf *epf,
+					  struct config_group *group);
 #endif /* __LINUX_PCI_EPF_H */
-- 
cgit v1.2.3


From 38ad827e3bc0f0e94628ee1d8dc31e778d9be40f Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:28:02 +0530
Subject: PCI: endpoint: Allow user to create sub-directory of 'EPF Device'
 directory

Documentation/PCI/endpoint/pci-endpoint-cfs.rst explains how a user has to
create a directory in-order to create a 'EPF Device' that can be
configured/probed by 'EPF Driver'.

Allow user to create a sub-directory of 'EPF Device' directory for any
function specific attributes that has to be exposed to the user.

Link: https://lore.kernel.org/r/20210201195809.7342-11-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/pci-ep-cfs.c | 23 +++++++++++++++++++++++
 include/linux/pci-epf.h           |  3 +++
 2 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/endpoint/pci-ep-cfs.c b/drivers/pci/endpoint/pci-ep-cfs.c
index 8f750961d6ab..f3a8b833b479 100644
--- a/drivers/pci/endpoint/pci-ep-cfs.c
+++ b/drivers/pci/endpoint/pci-ep-cfs.c
@@ -490,7 +490,29 @@ static struct configfs_item_operations pci_epf_ops = {
 	.release		= pci_epf_release,
 };
 
+static struct config_group *pci_epf_type_make(struct config_group *group,
+					      const char *name)
+{
+	struct pci_epf_group *epf_group = to_pci_epf_group(&group->cg_item);
+	struct config_group *epf_type_group;
+
+	epf_type_group = pci_epf_type_add_cfs(epf_group->epf, group);
+	return epf_type_group;
+}
+
+static void pci_epf_type_drop(struct config_group *group,
+			      struct config_item *item)
+{
+	config_item_put(item);
+}
+
+static struct configfs_group_operations pci_epf_type_group_ops = {
+	.make_group     = &pci_epf_type_make,
+	.drop_item      = &pci_epf_type_drop,
+};
+
 static const struct config_item_type pci_epf_type = {
+	.ct_group_ops	= &pci_epf_type_group_ops,
 	.ct_item_ops	= &pci_epf_ops,
 	.ct_attrs	= pci_epf_attrs,
 	.ct_owner	= THIS_MODULE,
@@ -553,6 +575,7 @@ static struct config_group *pci_epf_make(struct config_group *group,
 		goto free_name;
 	}
 
+	epf->group = &epf_group->group;
 	epf_group->epf = epf;
 
 	kfree(epf_name);
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index b241e7dd171f..6833e2160ef1 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -9,6 +9,7 @@
 #ifndef __LINUX_PCI_EPF_H
 #define __LINUX_PCI_EPF_H
 
+#include <linux/configfs.h>
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
 #include <linux/pci.h>
@@ -128,6 +129,7 @@ struct pci_epf_bar {
  *   EPC device
  * @sec_epc_bar: represents the BAR of EPF device associated with secondary EPC
  * @sec_epc_func_no: unique (physical) function number within the secondary EPC
+ * @group: configfs group associated with the EPF device
  */
 struct pci_epf {
 	struct device		dev;
@@ -150,6 +152,7 @@ struct pci_epf {
 	struct list_head	sec_epc_list;
 	struct pci_epf_bar	sec_epc_bar[6];
 	u8			sec_epc_func_no;
+	struct config_group	*group;
 };
 
 /**
-- 
cgit v1.2.3


From 599f86872f9ce8a0a0bd111a23442b18e8ee7059 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:28:06 +0530
Subject: PCI: Add TI J721E device to PCI IDs

Add TI J721E device to the PCI ID database. Since this device has a
configurable PCIe endpoint, it could be used with different drivers.

Link: https://lore.kernel.org/r/20210201195809.7342-15-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/misc/pci_endpoint_test.c | 1 -
 include/linux/pci_ids.h          | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index eff481ce08ee..1b2868ca4f2a 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -68,7 +68,6 @@
 #define PCI_ENDPOINT_TEST_FLAGS			0x2c
 #define FLAG_USE_DMA				BIT(0)
 
-#define PCI_DEVICE_ID_TI_J721E			0xb00d
 #define PCI_DEVICE_ID_TI_AM654			0xb00c
 #define PCI_DEVICE_ID_LS1088A			0x80c0
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index d8156a5dbee8..f968fcda338e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -881,6 +881,7 @@
 #define PCI_DEVICE_ID_TI_X620		0xac8d
 #define PCI_DEVICE_ID_TI_X420		0xac8e
 #define PCI_DEVICE_ID_TI_XX20_FM	0xac8f
+#define PCI_DEVICE_ID_TI_J721E		0xb00d
 #define PCI_DEVICE_ID_TI_DRA74x		0xb500
 #define PCI_DEVICE_ID_TI_DRA72x		0xb501
 
-- 
cgit v1.2.3


From cf6acb8bdb1d829b85a4daa2944bf9e71c93f4b9 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Mon, 22 Feb 2021 18:01:54 +0100
Subject: s390/cpumf: Add support for complete counter set extraction

Add support to the CPU Measurement counter facility device driver
to extract complete counter sets per CPU and per counter set from user
space. This includes a new device named /dev/hwctr and support
for the device driver functions open, close and ioctl. Other
functions are not supported.

The ioctl command supports 3 subcommands:
S390_HWCTR_START: enables counter sets on a list of CPUs.
S390_HWCTR_STOP: disables counter sets on a list of CPUs.
S390_HWCTR_READ: reads counter sets on a list of CPUs.

The ioctl(..., S390_HWCTR_READ, ...) is the only subcommand which
returns data.  It requires member data_bytes to be positive and
indicates the maximum amount of data available to store counter set
data. The other ioctl() subcommands do not use this member and it
should be set to zero.
The S390_HWCTR_READ subcommand returns the following data:

The cpuset data is flattened using the following scheme, stored in member
data:

 0x0       0x8   0xc       0x10  0x10      0x18  0x20  0x28         0xU-1
 +---------+-----+---------+-----+---------+-----+-----+------+------+
 | no_cpus | cpu | no_sets | set | no_cnts | cv1 | cv2 | .... | cv_n |
 +---------+-----+---------+-----+---------+-----+-----+------+------+

                           0xU   0xU+4     0xU+8 0xU+10             0xV-1
                           +-----+---------+-----+-----+------+------+
                           | set | no_cnts | cv1 | cv2 | .... | cv_n |
                           +-----+---------+-----+-----+------+------+

           0xV   0xV+4     0xV+8 0xV+c
           +-----+---------+-----+---------+-----+-----+------+------+
           | cpu | no_sets | set | no_cnts | cv1 | cv2 | .... | cv_n |
           +-----+---------+-----+---------+-----+-----+------+------+

U and V denote arbitrary hexadezimal addresses.
The first integer represents the number of CPUs data was extracted
from. This is followed by CPU number and number of counter sets extracted.
Both are two integer values. This is followed by the set identifer
and number of counters extracted. Both are two integer values. This is
followed by the counter values, each element is eight bytes in size.

The S390_HWCTR_READ ioctl subcommand is also limited to one call per
minute. This ensures that an application does not read out the
counter sets too often and reduces the overall CPU performance.
The complete counter set extraction is an expensive operation.

Reviewed-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/uapi/asm/perf_cpum_cf_diag.h |  51 +++
 arch/s390/kernel/perf_cpum_cf_diag.c           | 548 +++++++++++++++++++++++--
 include/linux/cpuhotplug.h                     |   1 +
 3 files changed, 577 insertions(+), 23 deletions(-)
 create mode 100644 arch/s390/include/uapi/asm/perf_cpum_cf_diag.h

(limited to 'include')

diff --git a/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h b/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h
new file mode 100644
index 000000000000..3d8284b95f87
--- /dev/null
+++ b/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright IBM Corp. 2021
+ * Interface implementation for communication with the CPU Measurement
+ * counter facility device driver.
+ *
+ * Author(s): Thomas Richter <tmricht@linux.ibm.com>
+ *
+ * Define for ioctl() commands to communicate with the CPU Measurement
+ * counter facility device driver.
+ */
+
+#ifndef _PERF_CPUM_CF_DIAG_H
+#define _PERF_CPUM_CF_DIAG_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define S390_HWCTR_DEVICE		"hwctr"
+#define S390_HWCTR_START_VERSION	1
+
+struct s390_ctrset_start {		/* Set CPUs to operate on */
+	__u64 version;			/* Version of interface */
+	__u64 data_bytes;		/* # of bytes required */
+	__u64 cpumask_len;		/* Length of CPU mask in bytes */
+	__u64 *cpumask;			/* Pointer to CPU mask */
+	__u64 counter_sets;		/* Bit mask of counter sets to get */
+};
+
+struct s390_ctrset_setdata {		/* Counter set data */
+	__u32 set;			/* Counter set number */
+	__u32 no_cnts;			/* # of counters stored in cv[] */
+	__u64 cv[0];			/* Counter values (variable length) */
+};
+
+struct s390_ctrset_cpudata {		/* Counter set data per CPU */
+	__u32 cpu_nr;			/* CPU number */
+	__u32 no_sets;			/* # of counters sets in data[] */
+	struct s390_ctrset_setdata data[0];
+};
+
+struct s390_ctrset_read {		/* Structure to get all ctr sets */
+	__u64 no_cpus;			/* Total # of CPUs data taken from */
+	struct s390_ctrset_cpudata data[0];
+};
+
+#define S390_HWCTR_MAGIC	'C'	/* Random magic # for ioctls */
+#define	S390_HWCTR_START	_IOWR(S390_HWCTR_MAGIC, 1, struct s390_ctrset_start)
+#define	S390_HWCTR_STOP		_IO(S390_HWCTR_MAGIC, 2)
+#define	S390_HWCTR_READ		_IOWR(S390_HWCTR_MAGIC, 3, struct s390_ctrset_read)
+#endif
diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c
index b5c86fb70d63..db4877bbb9aa 100644
--- a/arch/s390/kernel/perf_cpum_cf_diag.c
+++ b/arch/s390/kernel/perf_cpum_cf_diag.c
@@ -2,7 +2,7 @@
 /*
  * Performance event support for s390x - CPU-measurement Counter Sets
  *
- *  Copyright IBM Corp. 2019
+ *  Copyright IBM Corp. 2019, 2021
  *  Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
  *	       Thomas Richer <tmricht@linux.ibm.com>
  */
@@ -17,6 +17,8 @@
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/processor.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
 
 #include <asm/ctl_reg.h>
 #include <asm/irq.h>
@@ -24,15 +26,20 @@
 #include <asm/timex.h>
 #include <asm/debug.h>
 
-#define	CF_DIAG_CTRSET_DEF		0xfeef	/* Counter set header mark */
+#include <asm/perf_cpum_cf_diag.h>
 
+#define	CF_DIAG_CTRSET_DEF		0xfeef	/* Counter set header mark */
+#define CF_DIAG_MIN_INTERVAL		60	/* Minimum counter set read */
+						/* interval in seconds */
+static unsigned long cf_diag_interval = CF_DIAG_MIN_INTERVAL;
 static unsigned int cf_diag_cpu_speed;
 static debug_info_t *cf_diag_dbg;
 
-struct cf_diag_csd {		/* Counter set data per CPU */
+struct cf_diag_csd {			/* Counter set data per CPU */
 	size_t used;			/* Bytes used in data/start */
 	unsigned char start[PAGE_SIZE];	/* Counter set at event start */
 	unsigned char data[PAGE_SIZE];	/* Counter set at event delete */
+	unsigned int sets;		/* # Counter set saved in data */
 };
 static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd);
 
@@ -178,18 +185,35 @@ static void cf_diag_disable(struct pmu *pmu)
 
 /* Number of perf events counting hardware events */
 static atomic_t cf_diag_events = ATOMIC_INIT(0);
+/* Used to avoid races in calling reserve/release_cpumf_hardware */
+static DEFINE_MUTEX(cf_diag_reserve_mutex);
 
 /* Release the PMU if event is the last perf event */
 static void cf_diag_perf_event_destroy(struct perf_event *event)
 {
 	debug_sprintf_event(cf_diag_dbg, 5,
 			    "%s event %p cpu %d cf_diag_events %d\n",
-			    __func__, event, event->cpu,
+			    __func__, event, smp_processor_id(),
 			    atomic_read(&cf_diag_events));
 	if (atomic_dec_return(&cf_diag_events) == 0)
 		__kernel_cpumcf_end();
 }
 
+static int get_authctrsets(void)
+{
+	struct cpu_cf_events *cpuhw;
+	unsigned long auth = 0;
+	enum cpumf_ctr_set i;
+
+	cpuhw = &get_cpu_var(cpu_cf_events);
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
+			auth |= cpumf_ctr_ctl[i];
+	}
+	put_cpu_var(cpu_cf_events);
+	return auth;
+}
+
 /* Setup the event. Test for authorized counter sets and only include counter
  * sets which are authorized at the time of the setup. Including unauthorized
  * counter sets result in specification exception (and panic).
@@ -197,15 +221,12 @@ static void cf_diag_perf_event_destroy(struct perf_event *event)
 static int __hw_perf_event_init(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
-	struct cpu_cf_events *cpuhw;
-	enum cpumf_ctr_set i;
 	int err = 0;
 
 	debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__,
 			    event, event->cpu);
 
 	event->hw.config = attr->config;
-	event->hw.config_base = 0;
 
 	/* Add all authorized counter sets to config_base. The
 	 * the hardware init function is either called per-cpu or just once
@@ -215,11 +236,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	 * Checking the authorization on any CPU is fine as the hardware
 	 * applies the same authorization settings to all CPUs.
 	 */
-	cpuhw = &get_cpu_var(cpu_cf_events);
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
-		if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
-			event->hw.config_base |= cpumf_ctr_ctl[i];
-	put_cpu_var(cpu_cf_events);
+	event->hw.config_base = get_authctrsets();
 
 	/* No authorized counter sets, nothing to count/sample */
 	if (!event->hw.config_base) {
@@ -237,6 +254,25 @@ out:
 	return err;
 }
 
+/* Return 0 if the CPU-measurement counter facility is currently free
+ * and an error otherwise.
+ */
+static int cf_diag_perf_event_inuse(void)
+{
+	int err = 0;
+
+	if (!atomic_inc_not_zero(&cf_diag_events)) {
+		mutex_lock(&cf_diag_reserve_mutex);
+		if (atomic_read(&cf_diag_events) == 0 &&
+		    __kernel_cpumcf_begin())
+			err = -EBUSY;
+		else
+			err = atomic_inc_return(&cf_diag_events);
+		mutex_unlock(&cf_diag_reserve_mutex);
+	}
+	return err;
+}
+
 static int cf_diag_event_init(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
@@ -264,13 +300,9 @@ static int cf_diag_event_init(struct perf_event *event)
 	}
 
 	/* Initialize for using the CPU-measurement counter facility */
-	if (atomic_inc_return(&cf_diag_events) == 1) {
-		if (__kernel_cpumcf_begin()) {
-			atomic_dec(&cf_diag_events);
-			err = -EBUSY;
-			goto out;
-		}
-	}
+	err = cf_diag_perf_event_inuse();
+	if (err < 0)
+		goto out;
 	event->destroy = cf_diag_perf_event_destroy;
 
 	err = __hw_perf_event_init(event);
@@ -599,6 +631,8 @@ static void cf_diag_del(struct perf_event *event, int flags)
 	cpuhw->flags &= ~PMU_F_IN_USE;
 }
 
+/* Default counter set events and format attribute groups */
+
 CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
 
 static struct attribute *cf_diag_events_attr[] = {
@@ -663,6 +697,452 @@ static void cf_diag_get_cpu_speed(void)
 	}
 }
 
+/* Code to create device and file I/O operations */
+static atomic_t ctrset_opencnt = ATOMIC_INIT(0);	/* Excl. access */
+
+static int cf_diag_open(struct inode *inode, struct file *file)
+{
+	int err = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (atomic_xchg(&ctrset_opencnt, 1))
+		return -EBUSY;
+
+	/* Avoid concurrent access with perf_event_open() system call */
+	mutex_lock(&cf_diag_reserve_mutex);
+	if (atomic_read(&cf_diag_events) || __kernel_cpumcf_begin())
+		err = -EBUSY;
+	mutex_unlock(&cf_diag_reserve_mutex);
+	if (err) {
+		atomic_set(&ctrset_opencnt, 0);
+		return err;
+	}
+	file->private_data = NULL;
+	debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__);
+	/* nonseekable_open() never fails */
+	return nonseekable_open(inode, file);
+}
+
+/* Variables for ioctl() interface support */
+static DEFINE_MUTEX(cf_diag_ctrset_mutex);
+static struct cf_diag_ctrset {
+	unsigned long ctrset;		/* Bit mask of counter set to read */
+	cpumask_t mask;			/* CPU mask to read from */
+	time64_t lastread;		/* Epoch counter set last read */
+} cf_diag_ctrset;
+
+static void cf_diag_ctrset_clear(void)
+{
+	cpumask_clear(&cf_diag_ctrset.mask);
+	cf_diag_ctrset.ctrset = 0;
+}
+
+static void cf_diag_release_cpu(void *p)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+
+	debug_sprintf_event(cf_diag_dbg, 3, "%s cpu %d\n", __func__,
+			    smp_processor_id());
+	lcctl(0);		/* Reset counter sets */
+	cpuhw->state = 0;	/* Save state in CPU hardware state */
+}
+
+/* Release function is also called when application gets terminated without
+ * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
+ * Since only one application is allowed to open the device, simple stop all
+ * CPU counter sets.
+ */
+static int cf_diag_release(struct inode *inode, struct file *file)
+{
+	on_each_cpu(cf_diag_release_cpu, NULL, 1);
+	cf_diag_ctrset_clear();
+	atomic_set(&ctrset_opencnt, 0);
+	__kernel_cpumcf_end();
+	debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__);
+	return 0;
+}
+
+struct cf_diag_call_on_cpu_parm {	/* Parm struct for smp_call_on_cpu */
+	unsigned int sets;		/* Counter set bit mask */
+	atomic_t cpus_ack;		/* # CPUs successfully executed func */
+};
+
+static int cf_diag_all_copy(unsigned long arg, cpumask_t *mask)
+{
+	struct s390_ctrset_read __user *ctrset_read;
+	unsigned int cpu, cpus, rc;
+	void __user *uptr;
+
+	ctrset_read = (struct s390_ctrset_read __user *)arg;
+	uptr = ctrset_read->data;
+	for_each_cpu(cpu, mask) {
+		struct cf_diag_csd *csd = per_cpu_ptr(&cf_diag_csd, cpu);
+		struct s390_ctrset_cpudata __user *ctrset_cpudata;
+
+		ctrset_cpudata = uptr;
+		debug_sprintf_event(cf_diag_dbg, 5, "%s cpu %d used %zd\n",
+				    __func__, cpu, csd->used);
+		rc  = put_user(cpu, &ctrset_cpudata->cpu_nr);
+		rc |= put_user(csd->sets, &ctrset_cpudata->no_sets);
+		rc |= copy_to_user(ctrset_cpudata->data, csd->data, csd->used);
+		if (rc)
+			return -EFAULT;
+		uptr += sizeof(struct s390_ctrset_cpudata) + csd->used;
+		cond_resched();
+	}
+	cpus = cpumask_weight(mask);
+	if (put_user(cpus, &ctrset_read->no_cpus))
+		return -EFAULT;
+	debug_sprintf_event(cf_diag_dbg, 5, "%s copied %ld\n",
+			    __func__, uptr - (void __user *)ctrset_read->data);
+	return 0;
+}
+
+static size_t cf_diag_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
+				  int ctrset_size, size_t room)
+{
+	size_t need = 0;
+	int rc = -1;
+
+	need = sizeof(*p) + sizeof(u64) * ctrset_size;
+	debug_sprintf_event(cf_diag_dbg, 5,
+			    "%s room %zd need %zd set %#x set_size %d\n",
+			    __func__, room, need, ctrset, ctrset_size);
+	if (need <= room) {
+		p->set = cpumf_ctr_ctl[ctrset];
+		p->no_cnts = ctrset_size;
+		rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
+		if (rc == 3)		/* Nothing stored */
+			need = 0;
+	}
+	debug_sprintf_event(cf_diag_dbg, 5, "%s need %zd rc %d\n", __func__,
+			    need, rc);
+	return need;
+}
+
+/* Read all counter sets. Since the perf_event_open() system call with
+ * event cpum_cf_diag/.../ is blocked when this interface is active, reuse
+ * the perf_event_open() data buffer to store the counter sets.
+ */
+static void cf_diag_cpu_read(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
+	struct cf_diag_call_on_cpu_parm *p = parm;
+	int set, set_size;
+	size_t space;
+
+	debug_sprintf_event(cf_diag_dbg, 5,
+			    "%s new %#x flags %#x state %#llx\n",
+			    __func__, p->sets, cpuhw->flags,
+			    cpuhw->state);
+	/* No data saved yet */
+	csd->used = 0;
+	csd->sets = 0;
+	memset(csd->data, 0, sizeof(csd->data));
+
+	/* Scan the counter sets */
+	for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
+		struct s390_ctrset_setdata *sp = (void *)csd->data + csd->used;
+
+		if (!(p->sets & cpumf_ctr_ctl[set]))
+			continue;	/* Counter set not in list */
+		set_size = cf_diag_ctrset_size(set, &cpuhw->info);
+		space = sizeof(csd->data) - csd->used;
+		space = cf_diag_cpuset_read(sp, set, set_size, space);
+		if (space) {
+			csd->used += space;
+			csd->sets += 1;
+		}
+		debug_sprintf_event(cf_diag_dbg, 5, "%s sp %px space %zd\n",
+				    __func__, sp, space);
+	}
+	debug_sprintf_event(cf_diag_dbg, 5, "%s sets %d used %zd\n", __func__,
+			    csd->sets, csd->used);
+}
+
+static int cf_diag_all_read(unsigned long arg)
+{
+	struct cf_diag_call_on_cpu_parm p;
+	cpumask_var_t mask;
+	time64_t now;
+	int rc = 0;
+
+	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	now = ktime_get_seconds();
+	if (cf_diag_ctrset.lastread + cf_diag_interval > now) {
+		debug_sprintf_event(cf_diag_dbg, 5, "%s now %lld "
+				    " lastread %lld\n", __func__, now,
+				    cf_diag_ctrset.lastread);
+		rc = -EAGAIN;
+		goto out;
+	} else {
+		cf_diag_ctrset.lastread = now;
+	}
+	p.sets = cf_diag_ctrset.ctrset;
+	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cf_diag_cpu_read, &p, 1);
+	rc = cf_diag_all_copy(arg, mask);
+out:
+	free_cpumask_var(mask);
+	debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d\n", __func__, rc);
+	return rc;
+}
+
+/* Stop all counter sets via ioctl interface */
+static void cf_diag_ioctl_off(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cf_diag_call_on_cpu_parm *p = parm;
+	int rc;
+
+	debug_sprintf_event(cf_diag_dbg, 5,
+			    "%s new %#x flags %#x state %#llx\n",
+			    __func__, p->sets, cpuhw->flags,
+			    cpuhw->state);
+
+	ctr_set_multiple_disable(&cpuhw->state, p->sets);
+	ctr_set_multiple_stop(&cpuhw->state, p->sets);
+	rc = lcctl(cpuhw->state);		/* Stop counter sets */
+	if (!cpuhw->state)
+		cpuhw->flags &= ~PMU_F_IN_USE;
+	debug_sprintf_event(cf_diag_dbg, 5,
+			    "%s rc %d flags %#x state %#llx\n", __func__,
+			     rc, cpuhw->flags, cpuhw->state);
+}
+
+/* Start counter sets on particular CPU */
+static void cf_diag_ioctl_on(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cf_diag_call_on_cpu_parm *p = parm;
+	int rc;
+
+	debug_sprintf_event(cf_diag_dbg, 5,
+			    "%s new %#x flags %#x state %#llx\n",
+			    __func__, p->sets, cpuhw->flags,
+			    cpuhw->state);
+
+	if (!(cpuhw->flags & PMU_F_IN_USE))
+		cpuhw->state = 0;
+	cpuhw->flags |= PMU_F_IN_USE;
+	rc = lcctl(cpuhw->state);		/* Reset unused counter sets */
+	ctr_set_multiple_enable(&cpuhw->state, p->sets);
+	ctr_set_multiple_start(&cpuhw->state, p->sets);
+	rc |= lcctl(cpuhw->state);		/* Start counter sets */
+	if (!rc)
+		atomic_inc(&p->cpus_ack);
+	debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d state %#llx\n",
+			    __func__, rc, cpuhw->state);
+}
+
+static int cf_diag_all_stop(void)
+{
+	struct cf_diag_call_on_cpu_parm p = {
+		.sets = cf_diag_ctrset.ctrset,
+	};
+	cpumask_var_t mask;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1);
+	free_cpumask_var(mask);
+	return 0;
+}
+
+static int cf_diag_all_start(void)
+{
+	struct cf_diag_call_on_cpu_parm p = {
+		.sets = cf_diag_ctrset.ctrset,
+		.cpus_ack = ATOMIC_INIT(0),
+	};
+	cpumask_var_t mask;
+	int rc = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cf_diag_ioctl_on, &p, 1);
+	if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
+		on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1);
+		rc = -EIO;
+	}
+	free_cpumask_var(mask);
+	return rc;
+}
+
+/* Return the maximum required space for all possible CPUs in case one
+ * CPU will be onlined during the START, READ, STOP cycles.
+ * To find out the size of the counter sets, any one CPU will do. They
+ * all have the same counter sets.
+ */
+static size_t cf_diag_needspace(unsigned int sets)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	size_t bytes = 0;
+	int i;
+
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		if (!(sets & cpumf_ctr_ctl[i]))
+			continue;
+		bytes += cf_diag_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
+			 sizeof(((struct s390_ctrset_setdata *)0)->set) +
+			 sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
+	}
+	bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
+		(bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
+		     sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
+	debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__,
+			    bytes);
+	return bytes;
+}
+
+static long cf_diag_ioctl_read(unsigned long arg)
+{
+	struct s390_ctrset_read read;
+	int ret = 0;
+
+	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
+	if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
+		return -EFAULT;
+	ret = cf_diag_all_read(arg);
+	debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret);
+	return ret;
+}
+
+static long cf_diag_ioctl_stop(void)
+{
+	int ret;
+
+	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
+	ret = cf_diag_all_stop();
+	cf_diag_ctrset_clear();
+	debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret);
+	return ret;
+}
+
+static long cf_diag_ioctl_start(unsigned long arg)
+{
+	struct s390_ctrset_start __user *ustart;
+	struct s390_ctrset_start start;
+	void __user *umask;
+	unsigned int len;
+	int ret = 0;
+	size_t need;
+
+	if (cf_diag_ctrset.ctrset)
+		return -EBUSY;
+	ustart = (struct s390_ctrset_start __user *)arg;
+	if (copy_from_user(&start, ustart, sizeof(start)))
+		return -EFAULT;
+	if (start.version != S390_HWCTR_START_VERSION)
+		return -EINVAL;
+	if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
+		return -EINVAL;		/* Invalid counter set */
+	if (!start.counter_sets)
+		return -EINVAL;		/* No counter set at all? */
+	cpumask_clear(&cf_diag_ctrset.mask);
+	len = min_t(u64, start.cpumask_len, cpumask_size());
+	umask = (void __user *)start.cpumask;
+	if (copy_from_user(&cf_diag_ctrset.mask, umask, len))
+		return -EFAULT;
+	if (cpumask_empty(&cf_diag_ctrset.mask))
+		return -EINVAL;
+	need = cf_diag_needspace(start.counter_sets);
+	if (put_user(need, &ustart->data_bytes))
+		ret = -EFAULT;
+	if (ret)
+		goto out;
+	cf_diag_ctrset.ctrset = start.counter_sets;
+	ret = cf_diag_all_start();
+out:
+	if (ret)
+		cf_diag_ctrset_clear();
+	debug_sprintf_event(cf_diag_dbg, 2, "%s sets %#lx need %ld ret %d\n",
+			    __func__, cf_diag_ctrset.ctrset, need, ret);
+	return ret;
+}
+
+static long cf_diag_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	int ret;
+
+	debug_sprintf_event(cf_diag_dbg, 2, "%s cmd %#x arg %lx\n", __func__,
+			    cmd, arg);
+	get_online_cpus();
+	mutex_lock(&cf_diag_ctrset_mutex);
+	switch (cmd) {
+	case S390_HWCTR_START:
+		ret = cf_diag_ioctl_start(arg);
+		break;
+	case S390_HWCTR_STOP:
+		ret = cf_diag_ioctl_stop();
+		break;
+	case S390_HWCTR_READ:
+		ret = cf_diag_ioctl_read(arg);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+	mutex_unlock(&cf_diag_ctrset_mutex);
+	put_online_cpus();
+	debug_sprintf_event(cf_diag_dbg, 2, "%s ret %d\n", __func__, ret);
+	return ret;
+}
+
+static const struct file_operations cf_diag_fops = {
+	.owner = THIS_MODULE,
+	.open = cf_diag_open,
+	.release = cf_diag_release,
+	.unlocked_ioctl	= cf_diag_ioctl,
+	.compat_ioctl = cf_diag_ioctl,
+	.llseek = no_llseek
+};
+
+static struct miscdevice cf_diag_dev = {
+	.name	= S390_HWCTR_DEVICE,
+	.minor	= MISC_DYNAMIC_MINOR,
+	.fops	= &cf_diag_fops,
+};
+
+static int cf_diag_online_cpu(unsigned int cpu)
+{
+	struct cf_diag_call_on_cpu_parm p;
+
+	mutex_lock(&cf_diag_ctrset_mutex);
+	if (!cf_diag_ctrset.ctrset)
+		goto out;
+	p.sets = cf_diag_ctrset.ctrset;
+	cf_diag_ioctl_on(&p);
+out:
+	mutex_unlock(&cf_diag_ctrset_mutex);
+	return 0;
+}
+
+static int cf_diag_offline_cpu(unsigned int cpu)
+{
+	struct cf_diag_call_on_cpu_parm p;
+
+	mutex_lock(&cf_diag_ctrset_mutex);
+	if (!cf_diag_ctrset.ctrset)
+		goto out;
+	p.sets = cf_diag_ctrset.ctrset;
+	cf_diag_ioctl_off(&p);
+out:
+	mutex_unlock(&cf_diag_ctrset_mutex);
+	return 0;
+}
+
 /* Initialize the counter set PMU to generate complete counter set data as
  * event raw data. This relies on the CPU Measurement Counter Facility device
  * already being loaded and initialized.
@@ -685,21 +1165,43 @@ static int __init cf_diag_init(void)
 		return -ENOMEM;
 	}
 
+	rc = misc_register(&cf_diag_dev);
+	if (rc) {
+		pr_err("Registration of /dev/" S390_HWCTR_DEVICE
+		       "failed rc=%d\n", rc);
+		goto out;
+	}
+
 	/* Setup s390dbf facility */
 	cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
 	if (!cf_diag_dbg) {
 		pr_err("Registration of s390dbf(cpum_cf_diag) failed\n");
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto out_dbf;
 	}
 	debug_register_view(cf_diag_dbg, &debug_sprintf_view);
 
 	rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
 	if (rc) {
-		debug_unregister_view(cf_diag_dbg, &debug_sprintf_view);
-		debug_unregister(cf_diag_dbg);
 		pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
 		       rc);
+		goto out_perf;
 	}
+	rc = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_S390_CFD_ONLINE,
+				       "perf/s390/cfd:online",
+				       cf_diag_online_cpu, cf_diag_offline_cpu);
+	if (!rc)
+		goto out;
+
+	pr_err("Registration of CPUHP_AP_PERF_S390_CFD_ONLINE failed rc=%i\n",
+	       rc);
+	perf_pmu_unregister(&cf_diag);
+out_perf:
+	debug_unregister_view(cf_diag_dbg, &debug_sprintf_view);
+	debug_unregister(cf_diag_dbg);
+out_dbf:
+	misc_deregister(&cf_diag_dev);
+out:
 	return rc;
 }
-arch_initcall(cf_diag_init);
+device_initcall(cf_diag_init);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index ee09a39627d6..9129ba231423 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -168,6 +168,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_X86_CQM_ONLINE,
 	CPUHP_AP_PERF_X86_CSTATE_ONLINE,
 	CPUHP_AP_PERF_S390_CF_ONLINE,
+	CPUHP_AP_PERF_S390_CFD_ONLINE,
 	CPUHP_AP_PERF_S390_SF_ONLINE,
 	CPUHP_AP_PERF_ARM_CCI_ONLINE,
 	CPUHP_AP_PERF_ARM_CCN_ONLINE,
-- 
cgit v1.2.3


From e54937963fa249595824439dc839c948188dea83 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 17 Feb 2021 10:14:21 -0700
Subject: net: remove cmsg restriction from io_uring based send/recvmsg calls

No need to restrict these anymore, as the worker threads are direct
clones of the original task. Hence we know for a fact that we can
support anything that the regular task can.

Since the only user of proto_ops->flags was to flag PROTO_CMSG_DATA_ONLY,
kill the member and the flag definition too.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/net.h |  3 ---
 net/ipv4/af_inet.c  |  1 -
 net/ipv6/af_inet6.c |  1 -
 net/socket.c        | 10 ----------
 4 files changed, 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index 9e2324efc26a..ba736b457a06 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -42,8 +42,6 @@ struct net;
 #define SOCK_PASSCRED		3
 #define SOCK_PASSSEC		4
 
-#define PROTO_CMSG_DATA_ONLY	0x0001
-
 #ifndef ARCH_HAS_SOCKET_TYPES
 /**
  * enum sock_type - Socket types
@@ -138,7 +136,6 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
 
 struct proto_ops {
 	int		family;
-	unsigned int	flags;
 	struct module	*owner;
 	int		(*release)   (struct socket *sock);
 	int		(*bind)	     (struct socket *sock,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a02ce89b56b5..1355e6c0d567 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1021,7 +1021,6 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
 
 const struct proto_ops inet_stream_ops = {
 	.family		   = PF_INET,
-	.flags		   = PROTO_CMSG_DATA_ONLY,
 	.owner		   = THIS_MODULE,
 	.release	   = inet_release,
 	.bind		   = inet_bind,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 1fb75f01756c..802f5111805a 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -665,7 +665,6 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 
 const struct proto_ops inet6_stream_ops = {
 	.family		   = PF_INET6,
-	.flags		   = PROTO_CMSG_DATA_ONLY,
 	.owner		   = THIS_MODULE,
 	.release	   = inet6_release,
 	.bind		   = inet6_bind,
diff --git a/net/socket.c b/net/socket.c
index 7f0617ab5437..90a60899aae5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2411,10 +2411,6 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
 			unsigned int flags)
 {
-	/* disallow ancillary data requests from this path */
-	if (msg->msg_control || msg->msg_controllen)
-		return -EINVAL;
-
 	return ____sys_sendmsg(sock, msg, flags, NULL, 0);
 }
 
@@ -2623,12 +2619,6 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
 			struct user_msghdr __user *umsg,
 			struct sockaddr __user *uaddr, unsigned int flags)
 {
-	if (msg->msg_control || msg->msg_controllen) {
-		/* disallow ancillary data reqs unless cmsg is plain data */
-		if (!(sock->ops->flags & PROTO_CMSG_DATA_ONLY))
-			return -EINVAL;
-	}
-
 	return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0);
 }
 
-- 
cgit v1.2.3


From 1c0aa1fae1acb77c5f9917adb0e4cb4500b9f3a6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 20 Feb 2021 11:55:28 -0700
Subject: io_uring: flag new native workers with IORING_FEAT_NATIVE_WORKERS

A few reasons to do this:

- The naming of the manager and worker have changed. That's a user visible
  change, so makes sense to flag it.

- Opening certain files that use ->signal (like /proc/self or /dev/tty)
  now works, and the flag tells the application upfront that this is the
  case.

- Related to the above, using signalfd will now work as well.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 2 +-
 include/uapi/linux/io_uring.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 378cf79e66c9..cf9a5fa1ad03 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9467,7 +9467,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
 			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
 			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
-			IORING_FEAT_EXT_ARG;
+			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ac4e1738a9af..2514eb6b1cf2 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -262,6 +262,7 @@ struct io_uring_params {
 #define IORING_FEAT_POLL_32BITS 	(1U << 6)
 #define IORING_FEAT_SQPOLL_NONFIXED	(1U << 7)
 #define IORING_FEAT_EXT_ARG		(1U << 8)
+#define IORING_FEAT_NATIVE_WORKERS	(1U << 9)
 
 /*
  * io_uring_register(2) opcodes and arguments
-- 
cgit v1.2.3


From 8a378fb096a7f02943c72a428bbfd0029260efb6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 23 Feb 2021 12:27:49 -0700
Subject: io_uring: ensure io-wq context is always destroyed for tasks

If the task ends up doing no IO, the context list is empty and we don't
call into __io_uring_files_cancel() when the task exits. This can cause
a leak of the io-wq structures.

Ensure we always call __io_uring_files_cancel(), even if the task
context list is empty.

Fixes: 5aa75ed5b93f ("io_uring: tie async worker side to the task context")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c            | 7 ++++---
 include/linux/io_uring.h | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e62ad6bde569..0a435a6f265a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8800,9 +8800,10 @@ void __io_uring_files_cancel(struct files_struct *files)
 
 	if (files) {
 		io_uring_remove_task_files(tctx);
-	} else if (tctx->io_wq && current->flags & PF_EXITING) {
-		io_wq_destroy(tctx->io_wq);
-		tctx->io_wq = NULL;
+		if (tctx->io_wq) {
+			io_wq_destroy(tctx->io_wq);
+			tctx->io_wq = NULL;
+		}
 	}
 }
 
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index c48fcbdc2ea8..51ede771cd99 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -43,7 +43,7 @@ static inline void io_uring_task_cancel(void)
 }
 static inline void io_uring_files_cancel(struct files_struct *files)
 {
-	if (current->io_uring && !xa_empty(&current->io_uring->xa))
+	if (current->io_uring)
 		__io_uring_files_cancel(files);
 }
 static inline void io_uring_free(struct task_struct *tsk)
-- 
cgit v1.2.3


From 6120484ef2bd4ffea7d2f11d2f06167b8f848349 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Thu, 11 Feb 2021 21:17:01 +0100
Subject: ACPI: platform: Fix file references in comment

The referenced files are named slightly different. Replace '-' with '_'
and drop the .rst ending.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/platform_profile.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h
index a26542d53058..b4794027e759 100644
--- a/include/linux/platform_profile.h
+++ b/include/linux/platform_profile.h
@@ -12,9 +12,8 @@
 #include <linux/bitops.h>
 
 /*
- * If more options are added please update profile_names
- * array in platform-profile.c and sysfs-platform-profile.rst
- * documentation.
+ * If more options are added please update profile_names array in
+ * platform_profile.c and sysfs-platform_profile documentation.
  */
 
 enum platform_profile_option {
-- 
cgit v1.2.3


From 6c0b5e3fc6b536b125a66dfee103f3bc26d386f6 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Thu, 11 Feb 2021 21:17:02 +0100
Subject: ACPI: platform: Add balanced-performance platform profile

Some devices, including most Microsoft Surface devices, have a platform
profile somewhere inbetween balanced and performance. More specifically,
adding this profile allows the following mapping on Surface devices:

  Vendor Name           Platform Profile
  ------------------------------------------
  Battery Saver         low-power
  Recommended           balanced
  Better Performance    balanced-performance
  Best Performance      performance

Suggested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/ABI/testing/sysfs-platform_profile | 18 +++++++++++-------
 drivers/acpi/platform_profile.c                  |  1 +
 include/linux/platform_profile.h                 |  1 +
 3 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-platform_profile b/Documentation/ABI/testing/sysfs-platform_profile
index 9d6b89b66cca..dae9c8941905 100644
--- a/Documentation/ABI/testing/sysfs-platform_profile
+++ b/Documentation/ABI/testing/sysfs-platform_profile
@@ -5,13 +5,17 @@ Description:	This file contains a space-separated list of profiles supported for
 
 		Drivers must use the following standard profile-names:
 
-		============	============================================
-		low-power	Low power consumption
-		cool		Cooler operation
-		quiet		Quieter operation
-		balanced	Balance between low power consumption and performance
-		performance	High performance operation
-		============	============================================
+		====================	========================================
+		low-power		Low power consumption
+		cool			Cooler operation
+		quiet			Quieter operation
+		balanced		Balance between low power consumption
+					and performance
+		balanced-performance	Balance between performance and low
+					power consumption with a slight bias
+					towards performance
+		performance		High performance operation
+		====================	========================================
 
 		Userspace may expect drivers to offer more than one of these
 		standard profile names.
diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c
index 4a59c5993bde..dd2fbf38e414 100644
--- a/drivers/acpi/platform_profile.c
+++ b/drivers/acpi/platform_profile.c
@@ -17,6 +17,7 @@ static const char * const profile_names[] = {
 	[PLATFORM_PROFILE_COOL] = "cool",
 	[PLATFORM_PROFILE_QUIET] = "quiet",
 	[PLATFORM_PROFILE_BALANCED] = "balanced",
+	[PLATFORM_PROFILE_BALANCED_PERFORMANCE] = "balanced-performance",
 	[PLATFORM_PROFILE_PERFORMANCE] = "performance",
 };
 static_assert(ARRAY_SIZE(profile_names) == PLATFORM_PROFILE_LAST);
diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h
index b4794027e759..a6329003aee7 100644
--- a/include/linux/platform_profile.h
+++ b/include/linux/platform_profile.h
@@ -21,6 +21,7 @@ enum platform_profile_option {
 	PLATFORM_PROFILE_COOL,
 	PLATFORM_PROFILE_QUIET,
 	PLATFORM_PROFILE_BALANCED,
+	PLATFORM_PROFILE_BALANCED_PERFORMANCE,
 	PLATFORM_PROFILE_PERFORMANCE,
 	PLATFORM_PROFILE_LAST, /*must always be last */
 };
-- 
cgit v1.2.3


From abf4451b340b09f797c87341b3010f95af9215c0 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Tue, 19 Jan 2021 20:45:08 +0000
Subject: dma-buf: heaps: Rework heap allocation hooks to return struct dma_buf
 instead of fd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every heap needs to create a dmabuf and then export it to a fd
via dma_buf_fd(), so to consolidate things a bit, have the heaps
just return a struct dmabuf * and let the top level
dma_heap_buffer_alloc() call handle creating the fd via
dma_buf_fd().

Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Liam Mark <lmark@codeaurora.org>
Cc: Laura Abbott <labbott@kernel.org>
Cc: Brian Starkey <Brian.Starkey@arm.com>
Cc: Hridya Valsaraju <hridya@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sandeep Patil <sspatil@google.com>
Cc: Daniel Mentz <danielmentz@google.com>
Cc: Chris Goldsworthy <cgoldswo@codeaurora.org>
Cc: Ørjan Eide <orjan.eide@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Ezequiel Garcia <ezequiel@collabora.com>
Cc: Simon Ser <contact@emersion.fr>
Cc: James Jones <jajones@nvidia.com>
Cc: linux-media@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
 [sumits: minor reword of commit message]

Link: https://patchwork.freedesktop.org/patch/msgid/20210119204508.9256-3-john.stultz@linaro.org
(cherry picked from commit c7f59e3dd60313071a989227dcb69094f499d310)
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
---
 drivers/dma-buf/dma-heap.c          | 14 +++++++++++++-
 drivers/dma-buf/heaps/cma_heap.c    | 22 +++++++---------------
 drivers/dma-buf/heaps/system_heap.c | 21 +++++++--------------
 include/linux/dma-heap.h            | 12 ++++++------
 4 files changed, 33 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
index afd22c9dbdcf..6b5db954569f 100644
--- a/drivers/dma-buf/dma-heap.c
+++ b/drivers/dma-buf/dma-heap.c
@@ -52,6 +52,9 @@ static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
 				 unsigned int fd_flags,
 				 unsigned int heap_flags)
 {
+	struct dma_buf *dmabuf;
+	int fd;
+
 	/*
 	 * Allocations from all heaps have to begin
 	 * and end on page boundaries.
@@ -60,7 +63,16 @@ static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
 	if (!len)
 		return -EINVAL;
 
-	return heap->ops->allocate(heap, len, fd_flags, heap_flags);
+	dmabuf = heap->ops->allocate(heap, len, fd_flags, heap_flags);
+	if (IS_ERR(dmabuf))
+		return PTR_ERR(dmabuf);
+
+	fd = dma_buf_fd(dmabuf, fd_flags);
+	if (fd < 0) {
+		dma_buf_put(dmabuf);
+		/* just return, as put will call release and that will free */
+	}
+	return fd;
 }
 
 static int dma_heap_open(struct inode *inode, struct file *file)
diff --git a/drivers/dma-buf/heaps/cma_heap.c b/drivers/dma-buf/heaps/cma_heap.c
index 364fc2f3e499..5d64eccd21d6 100644
--- a/drivers/dma-buf/heaps/cma_heap.c
+++ b/drivers/dma-buf/heaps/cma_heap.c
@@ -271,10 +271,10 @@ static const struct dma_buf_ops cma_heap_buf_ops = {
 	.release = cma_heap_dma_buf_release,
 };
 
-static int cma_heap_allocate(struct dma_heap *heap,
-				  unsigned long len,
-				  unsigned long fd_flags,
-				  unsigned long heap_flags)
+static struct dma_buf *cma_heap_allocate(struct dma_heap *heap,
+					 unsigned long len,
+					 unsigned long fd_flags,
+					 unsigned long heap_flags)
 {
 	struct cma_heap *cma_heap = dma_heap_get_drvdata(heap);
 	struct cma_heap_buffer *buffer;
@@ -289,7 +289,7 @@ static int cma_heap_allocate(struct dma_heap *heap,
 
 	buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
 	if (!buffer)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&buffer->attachments);
 	mutex_init(&buffer->lock);
@@ -348,15 +348,7 @@ static int cma_heap_allocate(struct dma_heap *heap,
 		ret = PTR_ERR(dmabuf);
 		goto free_pages;
 	}
-
-	ret = dma_buf_fd(dmabuf, fd_flags);
-	if (ret < 0) {
-		dma_buf_put(dmabuf);
-		/* just return, as put will call release and that will free */
-		return ret;
-	}
-
-	return ret;
+	return dmabuf;
 
 free_pages:
 	kfree(buffer->pages);
@@ -365,7 +357,7 @@ free_cma:
 free_buffer:
 	kfree(buffer);
 
-	return ret;
+	return ERR_PTR(ret);
 }
 
 static const struct dma_heap_ops cma_heap_ops = {
diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
index 405351aad2a8..29e49ac17251 100644
--- a/drivers/dma-buf/heaps/system_heap.c
+++ b/drivers/dma-buf/heaps/system_heap.c
@@ -331,10 +331,10 @@ static struct page *alloc_largest_available(unsigned long size,
 	return NULL;
 }
 
-static int system_heap_allocate(struct dma_heap *heap,
-				unsigned long len,
-				unsigned long fd_flags,
-				unsigned long heap_flags)
+static struct dma_buf *system_heap_allocate(struct dma_heap *heap,
+					    unsigned long len,
+					    unsigned long fd_flags,
+					    unsigned long heap_flags)
 {
 	struct system_heap_buffer *buffer;
 	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
@@ -349,7 +349,7 @@ static int system_heap_allocate(struct dma_heap *heap,
 
 	buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
 	if (!buffer)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&buffer->attachments);
 	mutex_init(&buffer->lock);
@@ -399,14 +399,7 @@ static int system_heap_allocate(struct dma_heap *heap,
 		ret = PTR_ERR(dmabuf);
 		goto free_pages;
 	}
-
-	ret = dma_buf_fd(dmabuf, fd_flags);
-	if (ret < 0) {
-		dma_buf_put(dmabuf);
-		/* just return, as put will call release and that will free */
-		return ret;
-	}
-	return ret;
+	return dmabuf;
 
 free_pages:
 	for_each_sgtable_sg(table, sg, i) {
@@ -420,7 +413,7 @@ free_buffer:
 		__free_pages(page, compound_order(page));
 	kfree(buffer);
 
-	return ret;
+	return ERR_PTR(ret);
 }
 
 static const struct dma_heap_ops system_heap_ops = {
diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
index 454e354d1ffb..5bc5c946af58 100644
--- a/include/linux/dma-heap.h
+++ b/include/linux/dma-heap.h
@@ -16,15 +16,15 @@ struct dma_heap;
 
 /**
  * struct dma_heap_ops - ops to operate on a given heap
- * @allocate:		allocate dmabuf and return fd
+ * @allocate:		allocate dmabuf and return struct dma_buf ptr
  *
- * allocate returns dmabuf fd  on success, -errno on error.
+ * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
  */
 struct dma_heap_ops {
-	int (*allocate)(struct dma_heap *heap,
-			unsigned long len,
-			unsigned long fd_flags,
-			unsigned long heap_flags);
+	struct dma_buf *(*allocate)(struct dma_heap *heap,
+				    unsigned long len,
+				    unsigned long fd_flags,
+				    unsigned long heap_flags);
 };
 
 /**
-- 
cgit v1.2.3


From f588f0c69e0e645225e4ebc1aff8f9677583a056 Mon Sep 17 00:00:00 2001
From: Veera Sundaram Sankaran <veeras@codeaurora.org>
Date: Fri, 15 Jan 2021 16:31:46 -0800
Subject: dma-fence: allow signaling drivers to set fence timestamp

Some drivers have hardware capability to get the precise HW timestamp
of certain events based on which the fences are triggered. The delta
between the event HW timestamp & current HW reference timestamp can
be used to calculate the timestamp in kernel's CLOCK_MONOTONIC time
domain. This allows it to set accurate timestamp factoring out any
software and IRQ latencies. Add a timestamp variant of fence signal
function, dma_fence_signal_timestamp to allow drivers to update the
precise timestamp for fences.

Changes in v2:
- Add a new fence signal variant instead of modifying fence struct

Changes in v3:
- Add timestamp domain information to commit-text and
dma_fence_signal_timestamp documentation

Signed-off-by: Veera Sundaram Sankaran <veeras@codeaurora.org>
Reviewed-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
 [sumits: minor parenthesis alignment]
Link: https://patchwork.freedesktop.org/patch/msgid/1610757107-11892-1-git-send-email-veeras@codeaurora.org
(cherry picked from commit 5a164ac4dbd21b82bcdc03186d40e455ff467fdc)
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
---
 drivers/dma-buf/dma-fence.c | 70 ++++++++++++++++++++++++++++++++++++++++-----
 include/linux/dma-fence.h   |  3 ++
 2 files changed, 66 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index 7475e09b0680..d64fc03929be 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -312,22 +312,25 @@ void __dma_fence_might_wait(void)
 
 
 /**
- * dma_fence_signal_locked - signal completion of a fence
+ * dma_fence_signal_timestamp_locked - signal completion of a fence
  * @fence: the fence to signal
+ * @timestamp: fence signal timestamp in kernel's CLOCK_MONOTONIC time domain
  *
  * Signal completion for software callbacks on a fence, this will unblock
  * dma_fence_wait() calls and run all the callbacks added with
  * dma_fence_add_callback(). Can be called multiple times, but since a fence
  * can only go from the unsignaled to the signaled state and not back, it will
- * only be effective the first time.
+ * only be effective the first time. Set the timestamp provided as the fence
+ * signal timestamp.
  *
- * Unlike dma_fence_signal(), this function must be called with &dma_fence.lock
- * held.
+ * Unlike dma_fence_signal_timestamp(), this function must be called with
+ * &dma_fence.lock held.
  *
  * Returns 0 on success and a negative error value when @fence has been
  * signalled already.
  */
-int dma_fence_signal_locked(struct dma_fence *fence)
+int dma_fence_signal_timestamp_locked(struct dma_fence *fence,
+				      ktime_t timestamp)
 {
 	struct dma_fence_cb *cur, *tmp;
 	struct list_head cb_list;
@@ -341,7 +344,7 @@ int dma_fence_signal_locked(struct dma_fence *fence)
 	/* Stash the cb_list before replacing it with the timestamp */
 	list_replace(&fence->cb_list, &cb_list);
 
-	fence->timestamp = ktime_get();
+	fence->timestamp = timestamp;
 	set_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags);
 	trace_dma_fence_signaled(fence);
 
@@ -352,6 +355,59 @@ int dma_fence_signal_locked(struct dma_fence *fence)
 
 	return 0;
 }
+EXPORT_SYMBOL(dma_fence_signal_timestamp_locked);
+
+/**
+ * dma_fence_signal_timestamp - signal completion of a fence
+ * @fence: the fence to signal
+ * @timestamp: fence signal timestamp in kernel's CLOCK_MONOTONIC time domain
+ *
+ * Signal completion for software callbacks on a fence, this will unblock
+ * dma_fence_wait() calls and run all the callbacks added with
+ * dma_fence_add_callback(). Can be called multiple times, but since a fence
+ * can only go from the unsignaled to the signaled state and not back, it will
+ * only be effective the first time. Set the timestamp provided as the fence
+ * signal timestamp.
+ *
+ * Returns 0 on success and a negative error value when @fence has been
+ * signalled already.
+ */
+int dma_fence_signal_timestamp(struct dma_fence *fence, ktime_t timestamp)
+{
+	unsigned long flags;
+	int ret;
+
+	if (!fence)
+		return -EINVAL;
+
+	spin_lock_irqsave(fence->lock, flags);
+	ret = dma_fence_signal_timestamp_locked(fence, timestamp);
+	spin_unlock_irqrestore(fence->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(dma_fence_signal_timestamp);
+
+/**
+ * dma_fence_signal_locked - signal completion of a fence
+ * @fence: the fence to signal
+ *
+ * Signal completion for software callbacks on a fence, this will unblock
+ * dma_fence_wait() calls and run all the callbacks added with
+ * dma_fence_add_callback(). Can be called multiple times, but since a fence
+ * can only go from the unsignaled to the signaled state and not back, it will
+ * only be effective the first time.
+ *
+ * Unlike dma_fence_signal(), this function must be called with &dma_fence.lock
+ * held.
+ *
+ * Returns 0 on success and a negative error value when @fence has been
+ * signalled already.
+ */
+int dma_fence_signal_locked(struct dma_fence *fence)
+{
+	return dma_fence_signal_timestamp_locked(fence, ktime_get());
+}
 EXPORT_SYMBOL(dma_fence_signal_locked);
 
 /**
@@ -379,7 +435,7 @@ int dma_fence_signal(struct dma_fence *fence)
 	tmp = dma_fence_begin_signalling();
 
 	spin_lock_irqsave(fence->lock, flags);
-	ret = dma_fence_signal_locked(fence);
+	ret = dma_fence_signal_timestamp_locked(fence, ktime_get());
 	spin_unlock_irqrestore(fence->lock, flags);
 
 	dma_fence_end_signalling(tmp);
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 09e23adb351d..9f12efaaa93a 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -372,6 +372,9 @@ static inline void __dma_fence_might_wait(void) {}
 
 int dma_fence_signal(struct dma_fence *fence);
 int dma_fence_signal_locked(struct dma_fence *fence);
+int dma_fence_signal_timestamp(struct dma_fence *fence, ktime_t timestamp);
+int dma_fence_signal_timestamp_locked(struct dma_fence *fence,
+				      ktime_t timestamp);
 signed long dma_fence_default_wait(struct dma_fence *fence,
 				   bool intr, signed long timeout);
 int dma_fence_add_callback(struct dma_fence *fence,
-- 
cgit v1.2.3


From 583065c7aa23d4bb0c298222c1128353a2007c9c Mon Sep 17 00:00:00 2001
From: Veera Sundaram Sankaran <veeras@codeaurora.org>
Date: Fri, 15 Jan 2021 16:31:47 -0800
Subject: drm/drm_vblank: set the dma-fence timestamp during send_vblank_event

The explicit out-fences in crtc are signaled as part of vblank event,
indicating all framebuffers present on the Atomic Commit request are
scanned out on the screen. Though the fence signal and the vblank event
notification happens at the same time, triggered by the same hardware
vsync event, the timestamp set in both are different. With drivers
supporting precise vblank timestamp the difference between the two
timestamps would be even higher. This might have an impact on use-mode
frameworks using these fence timestamps for purposes other than simple
buffer usage. For instance, the Android framework [1] uses the
retire-fences as an alternative to vblank when frame-updates are in
progress. Set the fence timestamp during send vblank event using a new
drm_send_event_timestamp_locked variant to avoid discrepancies.

[1] https://android.googlesource.com/platform/frameworks/native/+/master/
services/surfaceflinger/Scheduler/Scheduler.cpp#397

Changes in v2:
- Use drm_send_event_timestamp_locked to update fence timestamp
- add more information to commit text

Changes in v3:
- use same backend helper function for variants of drm_send_event to
avoid code duplications

Changes in v4:
- remove WARN_ON from drm_send_event_timestamp_locked

Signed-off-by: Veera Sundaram Sankaran <veeras@codeaurora.org>
Reviewed-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
  [sumits: minor parenthesis alignment correction]
Link: https://patchwork.freedesktop.org/patch/msgid/1610757107-11892-2-git-send-email-veeras@codeaurora.org
(cherry picked from commit a78e7a51d2fa9d2f482b462be4299784c884d988)
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
---
 drivers/gpu/drm/drm_file.c   | 68 ++++++++++++++++++++++++++++++++++++--------
 drivers/gpu/drm/drm_vblank.c |  9 +++++-
 include/drm/drm_file.h       |  3 ++
 3 files changed, 67 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index 6b116bfd747c..7efbccffc2ea 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -775,20 +775,19 @@ void drm_event_cancel_free(struct drm_device *dev,
 EXPORT_SYMBOL(drm_event_cancel_free);
 
 /**
- * drm_send_event_locked - send DRM event to file descriptor
+ * drm_send_event_helper - send DRM event to file descriptor
  * @dev: DRM device
  * @e: DRM event to deliver
+ * @timestamp: timestamp to set for the fence event in kernel's CLOCK_MONOTONIC
+ * time domain
  *
- * This function sends the event @e, initialized with drm_event_reserve_init(),
- * to its associated userspace DRM file. Callers must already hold
- * &drm_device.event_lock, see drm_send_event() for the unlocked version.
- *
- * Note that the core will take care of unlinking and disarming events when the
- * corresponding DRM file is closed. Drivers need not worry about whether the
- * DRM file for this event still exists and can call this function upon
- * completion of the asynchronous work unconditionally.
+ * This helper function sends the event @e, initialized with
+ * drm_event_reserve_init(), to its associated userspace DRM file.
+ * The timestamp variant of dma_fence_signal is used when the caller
+ * sends a valid timestamp.
  */
-void drm_send_event_locked(struct drm_device *dev, struct drm_pending_event *e)
+void drm_send_event_helper(struct drm_device *dev,
+			   struct drm_pending_event *e, ktime_t timestamp)
 {
 	assert_spin_locked(&dev->event_lock);
 
@@ -799,7 +798,10 @@ void drm_send_event_locked(struct drm_device *dev, struct drm_pending_event *e)
 	}
 
 	if (e->fence) {
-		dma_fence_signal(e->fence);
+		if (timestamp)
+			dma_fence_signal_timestamp(e->fence, timestamp);
+		else
+			dma_fence_signal(e->fence);
 		dma_fence_put(e->fence);
 	}
 
@@ -814,6 +816,48 @@ void drm_send_event_locked(struct drm_device *dev, struct drm_pending_event *e)
 	wake_up_interruptible_poll(&e->file_priv->event_wait,
 		EPOLLIN | EPOLLRDNORM);
 }
+
+/**
+ * drm_send_event_timestamp_locked - send DRM event to file descriptor
+ * @dev: DRM device
+ * @e: DRM event to deliver
+ * @timestamp: timestamp to set for the fence event in kernel's CLOCK_MONOTONIC
+ * time domain
+ *
+ * This function sends the event @e, initialized with drm_event_reserve_init(),
+ * to its associated userspace DRM file. Callers must already hold
+ * &drm_device.event_lock.
+ *
+ * Note that the core will take care of unlinking and disarming events when the
+ * corresponding DRM file is closed. Drivers need not worry about whether the
+ * DRM file for this event still exists and can call this function upon
+ * completion of the asynchronous work unconditionally.
+ */
+void drm_send_event_timestamp_locked(struct drm_device *dev,
+				     struct drm_pending_event *e, ktime_t timestamp)
+{
+	drm_send_event_helper(dev, e, timestamp);
+}
+EXPORT_SYMBOL(drm_send_event_timestamp_locked);
+
+/**
+ * drm_send_event_locked - send DRM event to file descriptor
+ * @dev: DRM device
+ * @e: DRM event to deliver
+ *
+ * This function sends the event @e, initialized with drm_event_reserve_init(),
+ * to its associated userspace DRM file. Callers must already hold
+ * &drm_device.event_lock, see drm_send_event() for the unlocked version.
+ *
+ * Note that the core will take care of unlinking and disarming events when the
+ * corresponding DRM file is closed. Drivers need not worry about whether the
+ * DRM file for this event still exists and can call this function upon
+ * completion of the asynchronous work unconditionally.
+ */
+void drm_send_event_locked(struct drm_device *dev, struct drm_pending_event *e)
+{
+	drm_send_event_helper(dev, e, 0);
+}
 EXPORT_SYMBOL(drm_send_event_locked);
 
 /**
@@ -836,7 +880,7 @@ void drm_send_event(struct drm_device *dev, struct drm_pending_event *e)
 	unsigned long irqflags;
 
 	spin_lock_irqsave(&dev->event_lock, irqflags);
-	drm_send_event_locked(dev, e);
+	drm_send_event_helper(dev, e, 0);
 	spin_unlock_irqrestore(&dev->event_lock, irqflags);
 }
 EXPORT_SYMBOL(drm_send_event);
diff --git a/drivers/gpu/drm/drm_vblank.c b/drivers/gpu/drm/drm_vblank.c
index 30912d8f82a5..893165eeddf3 100644
--- a/drivers/gpu/drm/drm_vblank.c
+++ b/drivers/gpu/drm/drm_vblank.c
@@ -1006,7 +1006,14 @@ static void send_vblank_event(struct drm_device *dev,
 		break;
 	}
 	trace_drm_vblank_event_delivered(e->base.file_priv, e->pipe, seq);
-	drm_send_event_locked(dev, &e->base);
+	/*
+	 * Use the same timestamp for any associated fence signal to avoid
+	 * mismatch in timestamps for vsync & fence events triggered by the
+	 * same HW event. Frameworks like SurfaceFlinger in Android expects the
+	 * retire-fence timestamp to match exactly with HW vsync as it uses it
+	 * for its software vsync modeling.
+	 */
+	drm_send_event_timestamp_locked(dev, &e->base, now);
 }
 
 /**
diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h
index 716990bace10..b81b3bfb08c8 100644
--- a/include/drm/drm_file.h
+++ b/include/drm/drm_file.h
@@ -399,6 +399,9 @@ void drm_event_cancel_free(struct drm_device *dev,
 			   struct drm_pending_event *p);
 void drm_send_event_locked(struct drm_device *dev, struct drm_pending_event *e);
 void drm_send_event(struct drm_device *dev, struct drm_pending_event *e);
+void drm_send_event_timestamp_locked(struct drm_device *dev,
+				     struct drm_pending_event *e,
+				     ktime_t timestamp);
 
 struct file *mock_drm_getfile(struct drm_minor *minor, unsigned int flags);
 
-- 
cgit v1.2.3


From 3544de8ee6e4817278b15fe08658de49abf58954 Mon Sep 17 00:00:00 2001
From: Jacob Wen <jian.w.wen@oracle.com>
Date: Wed, 24 Feb 2021 12:00:55 -0800
Subject: mm, tracing: record slab name for kmem_cache_free()

Currently, a trace record generated by the RCU core is as below.

... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=00000000f3b49a66

It doesn't tell us what the RCU core has freed.

This patch adds the slab name to trace_kmem_cache_free().
The new format is as follows.

... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=0000000037f79c8d name=dentry
... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=00000000f78cb7b5 name=sock_inode_cache
... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=0000000018768985 name=pool_workqueue
... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=000000006a6cb484 name=radix_tree_node

We can use it to understand what the RCU core is going to free. For
example, some users maybe interested in when the RCU core starts
freeing reclaimable slabs like dentry to reduce memory pressure.

Link: https://lkml.kernel.org/r/20201216072804.8838-1-jian.w.wen@oracle.com
Signed-off-by: Jacob Wen <jian.w.wen@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/kmem.h | 24 ++++++++++++++++--------
 mm/slab.c                   |  2 +-
 mm/slob.c                   |  2 +-
 mm/slub.c                   |  2 +-
 4 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index f65b1f6db22d..40845b0d5dad 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -115,7 +115,7 @@ DEFINE_EVENT(kmem_alloc_node, kmem_cache_alloc_node,
 	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
 );
 
-DECLARE_EVENT_CLASS(kmem_free,
+TRACE_EVENT(kfree,
 
 	TP_PROTO(unsigned long call_site, const void *ptr),
 
@@ -135,18 +135,26 @@ DECLARE_EVENT_CLASS(kmem_free,
 		  (void *)__entry->call_site, __entry->ptr)
 );
 
-DEFINE_EVENT(kmem_free, kfree,
+TRACE_EVENT(kmem_cache_free,
 
-	TP_PROTO(unsigned long call_site, const void *ptr),
+	TP_PROTO(unsigned long call_site, const void *ptr, const char *name),
 
-	TP_ARGS(call_site, ptr)
-);
+	TP_ARGS(call_site, ptr, name),
 
-DEFINE_EVENT(kmem_free, kmem_cache_free,
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	const char *,	name		)
+	),
 
-	TP_PROTO(unsigned long call_site, const void *ptr),
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->name		= name;
+	),
 
-	TP_ARGS(call_site, ptr)
+	TP_printk("call_site=%pS ptr=%p name=%s",
+		  (void *)__entry->call_site, __entry->ptr, __entry->name)
 );
 
 TRACE_EVENT(mm_page_free,
diff --git a/mm/slab.c b/mm/slab.c
index dcc55e78f353..2ed27849d1f2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3717,7 +3717,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
 	__cache_free(cachep, objp, _RET_IP_);
 	local_irq_restore(flags);
 
-	trace_kmem_cache_free(_RET_IP_, objp);
+	trace_kmem_cache_free(_RET_IP_, objp, cachep->name);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
diff --git a/mm/slob.c b/mm/slob.c
index ef87ada8705d..0578429b991b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -673,7 +673,7 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
 		__kmem_cache_free(b, c->size);
 	}
 
-	trace_kmem_cache_free(_RET_IP_, b);
+	trace_kmem_cache_free(_RET_IP_, b, c->name);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
diff --git a/mm/slub.c b/mm/slub.c
index f5baf429654f..f4dc70228861 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3157,7 +3157,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
 	if (!s)
 		return;
 	slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
-	trace_kmem_cache_free(_RET_IP_, x);
+	trace_kmem_cache_free(_RET_IP_, x, s->name);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
-- 
cgit v1.2.3


From 1f7ef657740344541645349a8bece90cbff898f5 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Wed, 24 Feb 2021 12:01:42 -0800
Subject: mm/filemap: remove unused parameter and change to void type for
 replace_page_cache_page()

Since commit 74d609585d8b ("page cache: Add and replace pages using the
XArray") was merged, the replace_page_cache_page() can not fail and always
return 0, we can remove the redundant return value and void it.  Moreover
remove the unused gfp_mask.

Link: https://lkml.kernel.org/r/609c30e5274ba15d8b90c872fd0d8ac437a9b2bb.1610071401.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/dev.c           | 6 +-----
 include/linux/pagemap.h | 2 +-
 mm/filemap.c            | 7 +------
 3 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 588f8d1240aa..c6636b4c4ccf 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -844,11 +844,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 	if (WARN_ON(PageMlocked(oldpage)))
 		goto out_fallback_unlock;
 
-	err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
-	if (err) {
-		unlock_page(newpage);
-		goto out_put_old;
-	}
+	replace_page_cache_page(oldpage, newpage);
 
 	get_page(newpage);
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d5570deff400..74e466e5a2ba 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -757,7 +757,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 extern void delete_from_page_cache(struct page *page);
 extern void __delete_from_page_cache(struct page *page, void *shadow);
-int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
+void replace_page_cache_page(struct page *old, struct page *new);
 void delete_from_page_cache_batch(struct address_space *mapping,
 				  struct pagevec *pvec);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 6ff2a3fb0dc7..7dfed3454a2e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -777,7 +777,6 @@ EXPORT_SYMBOL(file_write_and_wait_range);
  * replace_page_cache_page - replace a pagecache page with a new one
  * @old:	page to be replaced
  * @new:	page to replace with
- * @gfp_mask:	allocation mode
  *
  * This function replaces a page in the pagecache with a new one.  On
  * success it acquires the pagecache reference for the new page and
@@ -786,10 +785,8 @@ EXPORT_SYMBOL(file_write_and_wait_range);
  * caller must do that.
  *
  * The remove + add is atomic.  This function cannot fail.
- *
- * Return: %0
  */
-int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+void replace_page_cache_page(struct page *old, struct page *new)
 {
 	struct address_space *mapping = old->mapping;
 	void (*freepage)(struct page *) = mapping->a_ops->freepage;
@@ -824,8 +821,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 	if (freepage)
 		freepage(old);
 	put_page(old);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
-- 
cgit v1.2.3


From 4805462598113f350838d612d0895db2dbb3992b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 24 Feb 2021 12:02:02 -0800
Subject: mm/filemap: pass a sleep state to put_and_wait_on_page_locked

This is prep work for the next patch, but I think at least one of the
current callers would prefer a killable sleep to an uninterruptible one.

Link: https://lkml.kernel.org/r/20210122160140.223228-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 3 +--
 mm/filemap.c            | 7 +++++--
 mm/huge_memory.c        | 4 ++--
 mm/migrate.c            | 4 ++--
 4 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 74e466e5a2ba..bd629d676a27 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -681,8 +681,7 @@ static inline int wait_on_page_locked_killable(struct page *page)
 	return wait_on_page_bit_killable(compound_head(page), PG_locked);
 }
 
-extern void put_and_wait_on_page_locked(struct page *page);
-
+int put_and_wait_on_page_locked(struct page *page, int state);
 void wait_on_page_writeback(struct page *page);
 extern void end_page_writeback(struct page *page);
 void wait_for_stable_page(struct page *page);
diff --git a/mm/filemap.c b/mm/filemap.c
index 4fce9735e172..389a20eec9b8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1384,20 +1384,23 @@ static int wait_on_page_locked_async(struct page *page,
 /**
  * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
  * @page: The page to wait for.
+ * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
  *
  * The caller should hold a reference on @page.  They expect the page to
  * become unlocked relatively soon, but do not wish to hold up migration
  * (for example) by holding the reference while waiting for the page to
  * come unlocked.  After this function returns, the caller should not
  * dereference @page.
+ *
+ * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal.
  */
-void put_and_wait_on_page_locked(struct page *page)
+int put_and_wait_on_page_locked(struct page *page, int state)
 {
 	wait_queue_head_t *q;
 
 	page = compound_head(page);
 	q = page_waitqueue(page);
-	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
+	return wait_on_page_bit_common(q, page, PG_locked, state, DROP);
 }
 
 /**
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 91ca9b103ee5..f655137536eb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1439,7 +1439,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 		if (!get_page_unless_zero(page))
 			goto out_unlock;
 		spin_unlock(vmf->ptl);
-		put_and_wait_on_page_locked(page);
+		put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 		goto out;
 	}
 
@@ -1475,7 +1475,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 		if (!get_page_unless_zero(page))
 			goto out_unlock;
 		spin_unlock(vmf->ptl);
-		put_and_wait_on_page_locked(page);
+		put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 		goto out;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 20ca887ea769..4ec727adfaa2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -331,7 +331,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 	if (!get_page_unless_zero(page))
 		goto out;
 	pte_unmap_unlock(ptep, ptl);
-	put_and_wait_on_page_locked(page);
+	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 	return;
 out:
 	pte_unmap_unlock(ptep, ptl);
@@ -365,7 +365,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
 	if (!get_page_unless_zero(page))
 		goto unlock;
 	spin_unlock(ptl);
-	put_and_wait_on_page_locked(page);
+	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 	return;
 unlock:
 	spin_unlock(ptl);
-- 
cgit v1.2.3


From 87fa0f3eb267eed966ee194907bc15376c1b758f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 24 Feb 2021 12:02:42 -0800
Subject: mm/filemap: rename generic_file_buffered_read to filemap_read

Rename generic_file_buffered_read to match the naming of filemap_fault,
also update the written parameter to a more descriptive name and improve
the kerneldoc comment.

Link: https://lkml.kernel.org/r/20210122160140.223228-18-willy@infradead.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/file.c    |  2 +-
 include/linux/fs.h |  4 ++--
 mm/filemap.c       | 35 ++++++++++++++++-------------------
 3 files changed, 19 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index be9e3900cce8..bf2c51a9607a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3634,7 +3634,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			return ret;
 	}
 
-	return generic_file_buffered_read(iocb, to, ret);
+	return filemap_read(iocb, to, ret);
 }
 
 const struct file_operations btrfs_file_operations = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 418b772d6eca..ec8f3ddf4a6a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3080,8 +3080,8 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
 extern int generic_write_check_limits(struct file *file, loff_t pos,
 		loff_t *count);
 extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
-extern ssize_t generic_file_buffered_read(struct kiocb *iocb,
-		struct iov_iter *to, ssize_t already_read);
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to,
+		ssize_t already_read);
 extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 1358cb061fd6..28c290da22c1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2394,23 +2394,20 @@ err:
 }
 
 /**
- * generic_file_buffered_read - generic file read routine
- * @iocb:	the iocb to read
- * @iter:	data destination
- * @written:	already copied
+ * filemap_read - Read data from the page cache.
+ * @iocb: The iocb to read.
+ * @iter: Destination for the data.
+ * @already_read: Number of bytes already read by the caller.
  *
- * This is a generic file read routine, and uses the
- * mapping->a_ops->readpage() function for the actual low-level stuff.
+ * Copies data from the page cache.  If the data is not currently present,
+ * uses the readahead and readpage address_space operations to fetch it.
  *
- * This is really ugly. But the goto's actually try to clarify some
- * of the logic when it comes to error handling etc.
- *
- * Return:
- * * total number of bytes copied, including those the were already @written
- * * negative error code if nothing was copied
+ * Return: Total number of bytes copied, including those already read by
+ * the caller.  If an error happens before any bytes are copied, returns
+ * a negative error number.
  */
-ssize_t generic_file_buffered_read(struct kiocb *iocb,
-		struct iov_iter *iter, ssize_t written)
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
+		ssize_t already_read)
 {
 	struct file *filp = iocb->ki_filp;
 	struct file_ra_state *ra = &filp->f_ra;
@@ -2437,7 +2434,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		 * can no longer safely return -EIOCBQUEUED. Hence mark
 		 * an async read NOWAIT at that point.
 		 */
-		if ((iocb->ki_flags & IOCB_WAITQ) && written)
+		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
 			iocb->ki_flags |= IOCB_NOWAIT;
 
 		error = filemap_get_pages(iocb, iter, &pvec);
@@ -2497,7 +2494,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 
 			copied = copy_page_to_iter(page, offset, bytes, iter);
 
-			written += copied;
+			already_read += copied;
 			iocb->ki_pos += copied;
 			ra->prev_pos = iocb->ki_pos;
 
@@ -2514,9 +2511,9 @@ put_pages:
 
 	file_accessed(filp);
 
-	return written ? written : error;
+	return already_read ? already_read : error;
 }
-EXPORT_SYMBOL_GPL(generic_file_buffered_read);
+EXPORT_SYMBOL_GPL(filemap_read);
 
 /**
  * generic_file_read_iter - generic filesystem read routine
@@ -2591,7 +2588,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 			goto out;
 	}
 
-	retval = generic_file_buffered_read(iocb, iter, retval);
+	retval = filemap_read(iocb, iter, retval);
 out:
 	return retval;
 }
-- 
cgit v1.2.3


From 2e9bd483159939ed2c0704b914294653c8341d25 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 24 Feb 2021 12:03:11 -0800
Subject: mm: memcg/slab: pre-allocate obj_cgroups for slab caches with
 SLAB_ACCOUNT

In general it's unknown in advance if a slab page will contain accounted
objects or not.  In order to avoid memory waste, an obj_cgroup vector is
allocated dynamically when a need to account of a new object arises.  Such
approach is memory efficient, but requires an expensive cmpxchg() to set
up the memcg/objcgs pointer, because an allocation can race with a
different allocation on another cpu.

But in some common cases it's known for sure that a slab page will contain
accounted objects: if the page belongs to a slab cache with a SLAB_ACCOUNT
flag set.  It includes such popular objects like vm_area_struct, anon_vma,
task_struct, etc.

In such cases we can pre-allocate the objcgs vector and simple assign it
to the page without any atomic operations, because at this early stage the
page is not visible to anyone else.

A very simplistic benchmark (allocating 10000000 64-bytes objects in a
row) shows ~15% win.  In the real life it seems that most workloads are
not very sensitive to the speed of (accounted) slab allocations.

[guro@fb.com: open-code set_page_objcgs() and add some comments, by Johannes]
  Link: https://lkml.kernel.org/r/20201113001926.GA2934489@carbon.dhcp.thefacebook.com
[akpm@linux-foundation.org: fix it for mm-slub-call-account_slab_page-after-slab-page-initialization-fix.patch]

Link: https://lkml.kernel.org/r/20201110195753.530157-2-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 19 -------------------
 mm/memcontrol.c            | 23 +++++++++++++++++++----
 mm/slab.c                  |  2 +-
 mm/slab.h                  | 14 ++++++++++----
 mm/slub.c                  |  2 +-
 5 files changed, 31 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index eeb0b52203e9..7a4dd1cb19fe 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -475,19 +475,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 	return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
-/*
- * set_page_objcgs - associate a page with a object cgroups vector
- * @page: a pointer to the page struct
- * @objcgs: a pointer to the object cgroups vector
- *
- * Atomically associates a page with a vector of object cgroups.
- */
-static inline bool set_page_objcgs(struct page *page,
-					struct obj_cgroup **objcgs)
-{
-	return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs |
-			MEMCG_DATA_OBJCGS);
-}
 #else
 static inline struct obj_cgroup **page_objcgs(struct page *page)
 {
@@ -498,12 +485,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 {
 	return NULL;
 }
-
-static inline bool set_page_objcgs(struct page *page,
-					struct obj_cgroup **objcgs)
-{
-	return true;
-}
 #endif
 
 static __always_inline bool memcg_stat_item_in_bytes(int idx)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0b9bd354e97e..60ce452e42e6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2935,9 +2935,10 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
 
 #ifdef CONFIG_MEMCG_KMEM
 int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
-				 gfp_t gfp)
+				 gfp_t gfp, bool new_page)
 {
 	unsigned int objects = objs_per_slab_page(s, page);
+	unsigned long memcg_data;
 	void *vec;
 
 	vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
@@ -2945,11 +2946,25 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
 	if (!vec)
 		return -ENOMEM;
 
-	if (!set_page_objcgs(page, vec))
+	memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
+	if (new_page) {
+		/*
+		 * If the slab page is brand new and nobody can yet access
+		 * it's memcg_data, no synchronization is required and
+		 * memcg_data can be simply assigned.
+		 */
+		page->memcg_data = memcg_data;
+	} else if (cmpxchg(&page->memcg_data, 0, memcg_data)) {
+		/*
+		 * If the slab page is already in use, somebody can allocate
+		 * and assign obj_cgroups in parallel. In this case the existing
+		 * objcg vector should be reused.
+		 */
 		kfree(vec);
-	else
-		kmemleak_not_leak(vec);
+		return 0;
+	}
 
+	kmemleak_not_leak(vec);
 	return 0;
 }
 
diff --git a/mm/slab.c b/mm/slab.c
index cdad64b2836d..9c9e0c255c4b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1379,7 +1379,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 		return NULL;
 	}
 
-	account_slab_page(page, cachep->gfporder, cachep);
+	account_slab_page(page, cachep->gfporder, cachep, flags);
 	__SetPageSlab(page);
 	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
 	if (sk_memalloc_socks() && page_is_pfmemalloc(page))
diff --git a/mm/slab.h b/mm/slab.h
index 48dc466a3791..076582f58f68 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -238,7 +238,7 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
 
 #ifdef CONFIG_MEMCG_KMEM
 int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
-				 gfp_t gfp);
+				 gfp_t gfp, bool new_page);
 
 static inline void memcg_free_page_obj_cgroups(struct page *page)
 {
@@ -315,7 +315,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
 			page = virt_to_head_page(p[i]);
 
 			if (!page_objcgs(page) &&
-			    memcg_alloc_page_obj_cgroups(page, s, flags)) {
+			    memcg_alloc_page_obj_cgroups(page, s, flags,
+							 false)) {
 				obj_cgroup_uncharge(objcg, obj_full_size(s));
 				continue;
 			}
@@ -379,7 +380,8 @@ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
 }
 
 static inline int memcg_alloc_page_obj_cgroups(struct page *page,
-					       struct kmem_cache *s, gfp_t gfp)
+					       struct kmem_cache *s, gfp_t gfp,
+					       bool new_page)
 {
 	return 0;
 }
@@ -420,8 +422,12 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
 }
 
 static __always_inline void account_slab_page(struct page *page, int order,
-					      struct kmem_cache *s)
+					      struct kmem_cache *s,
+					      gfp_t gfp)
 {
+	if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT))
+		memcg_alloc_page_obj_cgroups(page, s, gfp, true);
+
 	mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
 			    PAGE_SIZE << order);
 }
diff --git a/mm/slub.c b/mm/slub.c
index e0fef8f8d4bf..ca566361561e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1785,7 +1785,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 
 	page->objects = oo_objects(oo);
 
-	account_slab_page(page, oo_order(oo), s);
+	account_slab_page(page, oo_order(oo), s, flags);
 
 	page->slab_cache = s;
 	__SetPageSlab(page);
-- 
cgit v1.2.3


From f3344adf38bdb3107d40483dd9501215ad40edce Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 24 Feb 2021 12:03:15 -0800
Subject: mm: memcontrol: optimize per-lruvec stats counter memory usage

The vmstat threshold is 32 (MEMCG_CHARGE_BATCH), Actually the threshold
can be as big as MEMCG_CHARGE_BATCH * PAGE_SIZE.  It still fits into s32.
So introduce struct batched_lruvec_stat to optimize memory usage.

The size of struct lruvec_stat is 304 bytes on 64 bit systems.  As it is a
per-cpu structure.  So with this patch, we can save 304 / 2 * ncpu bytes
per-memcg per-node where ncpu is the number of the possible CPU.  If there
are c memory cgroup (include dying cgroup) and n NUMA node in the system.
Finally, we can save (152 * ncpu * c * n) bytes.

[akpm@linux-foundation.org: fix typo in comment]

Link: https://lkml.kernel.org/r/20201210042121.39665-1-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Chris Down <chris@chrisdown.name>
Cc: Yafang Shao <laoar.shao@gmail.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 14 ++++++++++++--
 mm/memcontrol.c            | 10 +++++++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7a4dd1cb19fe..41bbf71edd9f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -92,6 +92,10 @@ struct lruvec_stat {
 	long count[NR_VM_NODE_STAT_ITEMS];
 };
 
+struct batched_lruvec_stat {
+	s32 count[NR_VM_NODE_STAT_ITEMS];
+};
+
 /*
  * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
  * which have elements charged to this memcg.
@@ -107,11 +111,17 @@ struct memcg_shrinker_map {
 struct mem_cgroup_per_node {
 	struct lruvec		lruvec;
 
-	/* Legacy local VM stats */
+	/*
+	 * Legacy local VM stats. This should be struct lruvec_stat and
+	 * cannot be optimized to struct batched_lruvec_stat. Because
+	 * the threshold of the lruvec_stat_cpu can be as big as
+	 * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this
+	 * filed has no upper limit.
+	 */
 	struct lruvec_stat __percpu *lruvec_stat_local;
 
 	/* Subtree VM stats (batched updates) */
-	struct lruvec_stat __percpu *lruvec_stat_cpu;
+	struct batched_lruvec_stat __percpu *lruvec_stat_cpu;
 	atomic_long_t		lruvec_stat[NR_VM_NODE_STAT_ITEMS];
 
 	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 60ce452e42e6..b259e7d8ce41 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5208,7 +5208,7 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 		return 1;
 	}
 
-	pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
+	pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
 					       GFP_KERNEL_ACCOUNT);
 	if (!pn->lruvec_stat_cpu) {
 		free_percpu(pn->lruvec_stat_local);
@@ -7093,6 +7093,14 @@ static int __init mem_cgroup_init(void)
 {
 	int cpu, node;
 
+	/*
+	 * Currently s32 type (can refer to struct batched_lruvec_stat) is
+	 * used for per-memcg-per-cpu caching of per-node statistics. In order
+	 * to work fine, we should make sure that the overfill threshold can't
+	 * exceed S32_MAX / PAGE_SIZE.
+	 */
+	BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
+
 	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
 				  memcg_hotplug_cpu_dead);
 
-- 
cgit v1.2.3


From 69473e5de87389be6c0fa4a5d574a50c8f904fb3 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 24 Feb 2021 12:03:23 -0800
Subject: mm: memcontrol: convert NR_ANON_THPS account to pages

Currently we use struct per_cpu_nodestat to cache the vmstat counters,
which leads to inaccurate statistics especially THP vmstat counters.  In
the systems with hundreds of processors it can be GBs of memory.  For
example, for a 96 CPUs system, the threshold is the maximum number of 125.
And the per cpu counters can cache 23.4375 GB in total.

The THP page is already a form of batched addition (it will add 512 worth
of memory in one go) so skipping the batching seems like sensible.
Although every THP stats update overflows the per-cpu counter, resorting
to atomic global updates.  But it can make the statistics more accuracy
for the THP vmstat counters.

So we convert the NR_ANON_THPS account to pages.  This patch is consistent
with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival").
Doing this also can make the unit of vmstat counters more unified.
Finally, the unit of the vmstat counters are pages, kB and bytes.  The
B/KB suffix can tell us that the unit is bytes or kB.  The rest which is
without suffix are pages.

Link: https://lkml.kernel.org/r/20201228164110.2838-3-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Rafael. J. Wysocki <rafael@kernel.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    | 15 +++++++++------
 fs/proc/meminfo.c      |  2 +-
 include/linux/mmzone.h | 13 +++++++++++++
 mm/huge_memory.c       |  3 ++-
 mm/memcontrol.c        | 20 ++++++--------------
 mm/page_alloc.c        |  2 +-
 mm/rmap.c              |  6 +++---
 mm/vmstat.c            | 11 +++++++++--
 8 files changed, 44 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 04f71c7bc3f8..6da0c3508bc9 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -461,8 +461,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 			     nid, K(sunreclaimable)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 			     ,
-			     nid, K(node_page_state(pgdat, NR_ANON_THPS) *
-				    HPAGE_PMD_NR),
+			     nid, K(node_page_state(pgdat, NR_ANON_THPS)),
 			     nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
 				    HPAGE_PMD_NR),
 			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
@@ -519,10 +518,14 @@ static ssize_t node_read_vmstat(struct device *dev,
 				     sum_zone_numa_state(nid, i));
 
 #endif
-	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
-		len += sysfs_emit_at(buf, len, "%s %lu\n",
-				     node_stat_name(i),
-				     node_page_state_pages(pgdat, i));
+	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+		unsigned long pages = node_page_state_pages(pgdat, i);
+
+		if (vmstat_item_print_in_thp(i))
+			pages /= HPAGE_PMD_NR;
+		len += sysfs_emit_at(buf, len, "%s %lu\n", node_stat_name(i),
+				     pages);
+	}
 
 	return len;
 }
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d6fc74619625..a635c8a84ddf 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -129,7 +129,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	show_val_kb(m, "AnonHugePages:  ",
-		    global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+		    global_node_page_state(NR_ANON_THPS));
 	show_val_kb(m, "ShmemHugePages: ",
 		    global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
 	show_val_kb(m, "ShmemPmdMapped: ",
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b593316bff3d..67d50ef5dd20 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -209,6 +209,19 @@ enum node_stat_item {
 	NR_VM_NODE_STAT_ITEMS
 };
 
+/*
+ * Returns true if the item should be printed in THPs (/proc/vmstat
+ * currently prints number of anon, file and shmem THPs. But the item
+ * is charged in pages).
+ */
+static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
+{
+	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+		return false;
+
+	return item == NR_ANON_THPS;
+}
+
 /*
  * Returns true if the value is measured in bytes (most vmstat values are
  * measured in pages). This defines the API part, the internal representation
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f655137536eb..3691e863070a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2176,7 +2176,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		lock_page_memcg(page);
 		if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
 			/* Last compound_mapcount is gone. */
-			__dec_lruvec_page_state(page, NR_ANON_THPS);
+			__mod_lruvec_page_state(page, NR_ANON_THPS,
+						-HPAGE_PMD_NR);
 			if (TestClearPageDoubleMap(page)) {
 				/* No need in mapcount reference anymore */
 				for (i = 0; i < HPAGE_PMD_NR; i++)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e31e47e7bab2..b2405f049006 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1533,7 +1533,7 @@ static struct memory_stat memory_stats[] = {
 	 * on some architectures, the macro of HPAGE_PMD_SIZE is not
 	 * constant(e.g. powerpc).
 	 */
-	{ "anon_thp", 0, NR_ANON_THPS },
+	{ "anon_thp", PAGE_SIZE, NR_ANON_THPS },
 	{ "file_thp", 0, NR_FILE_THPS },
 	{ "shmem_thp", 0, NR_SHMEM_THPS },
 #endif
@@ -1566,8 +1566,7 @@ static int __init memory_stats_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		if (memory_stats[i].idx == NR_ANON_THPS ||
-		    memory_stats[i].idx == NR_FILE_THPS ||
+		if (memory_stats[i].idx == NR_FILE_THPS ||
 		    memory_stats[i].idx == NR_SHMEM_THPS)
 			memory_stats[i].ratio = HPAGE_PMD_SIZE;
 #endif
@@ -4087,10 +4086,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		nr = memcg_page_state_local(memcg, memcg1_stats[i]);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		if (memcg1_stats[i] == NR_ANON_THPS)
-			nr *= HPAGE_PMD_NR;
-#endif
 		seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
 	}
 
@@ -4121,10 +4116,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		nr = memcg_page_state(memcg, memcg1_stats[i]);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		if (memcg1_stats[i] == NR_ANON_THPS)
-			nr *= HPAGE_PMD_NR;
-#endif
 		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
 						(u64)nr * PAGE_SIZE);
 	}
@@ -5652,10 +5643,11 @@ static int mem_cgroup_move_account(struct page *page,
 			__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
 			__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
 			if (PageTransHuge(page)) {
-				__dec_lruvec_state(from_vec, NR_ANON_THPS);
-				__inc_lruvec_state(to_vec, NR_ANON_THPS);
+				__mod_lruvec_state(from_vec, NR_ANON_THPS,
+						   -nr_pages);
+				__mod_lruvec_state(to_vec, NR_ANON_THPS,
+						   nr_pages);
 			}
-
 		}
 	} else {
 		__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef5070fed76b..2e2e47f8714b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5587,7 +5587,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
 			K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
 					* HPAGE_PMD_NR),
-			K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
+			K(node_page_state(pgdat, NR_ANON_THPS)),
 #endif
 			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
 			node_page_state(pgdat, NR_KERNEL_STACK_KB),
diff --git a/mm/rmap.c b/mm/rmap.c
index 08c56aaf72eb..c4d5c63cfd29 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1144,7 +1144,7 @@ void do_page_add_anon_rmap(struct page *page,
 		 * disabled.
 		 */
 		if (compound)
-			__inc_lruvec_page_state(page, NR_ANON_THPS);
+			__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
 		__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
 	}
 
@@ -1186,7 +1186,7 @@ void page_add_new_anon_rmap(struct page *page,
 		if (hpage_pincount_available(page))
 			atomic_set(compound_pincount_ptr(page), 0);
 
-		__inc_lruvec_page_state(page, NR_ANON_THPS);
+		__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
 	} else {
 		/* Anon THP always mapped first with PMD */
 		VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -1292,7 +1292,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 		return;
 
-	__dec_lruvec_page_state(page, NR_ANON_THPS);
+	__mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
 
 	if (TestClearPageDoubleMap(page)) {
 		/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f8942160fc95..07dc0af50cf0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1619,8 +1619,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 	if (is_zone_first_populated(pgdat, zone)) {
 		seq_printf(m, "\n  per-node stats");
 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+			unsigned long pages = node_page_state_pages(pgdat, i);
+
+			if (vmstat_item_print_in_thp(i))
+				pages /= HPAGE_PMD_NR;
 			seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
-				   node_page_state_pages(pgdat, i));
+				   pages);
 		}
 	}
 	seq_printf(m,
@@ -1740,8 +1744,11 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 	v += NR_VM_NUMA_STAT_ITEMS;
 #endif
 
-	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
 		v[i] = global_node_page_state_pages(i);
+		if (vmstat_item_print_in_thp(i))
+			v[i] /= HPAGE_PMD_NR;
+	}
 	v += NR_VM_NODE_STAT_ITEMS;
 
 	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
-- 
cgit v1.2.3


From bf9ecead53c89d3d2cf60acbc460174ebbcf0027 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 24 Feb 2021 12:03:27 -0800
Subject: mm: memcontrol: convert NR_FILE_THPS account to pages

Currently we use struct per_cpu_nodestat to cache the vmstat counters,
which leads to inaccurate statistics especially THP vmstat counters.  In
the systems with if hundreds of processors it can be GBs of memory.  For
example, for a 96 CPUs system, the threshold is the maximum number of 125.
And the per cpu counters can cache 23.4375 GB in total.

The THP page is already a form of batched addition (it will add 512 worth
of memory in one go) so skipping the batching seems like sensible.
Although every THP stats update overflows the per-cpu counter, resorting
to atomic global updates.  But it can make the statistics more accuracy
for the THP vmstat counters.

So we convert the NR_FILE_THPS account to pages.  This patch is consistent
with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival").
Doing this also can make the unit of vmstat counters more unified.
Finally, the unit of the vmstat counters are pages, kB and bytes.  The
B/KB suffix can tell us that the unit is bytes or kB.  The rest which is
without suffix are pages.

Link: https://lkml.kernel.org/r/20201228164110.2838-4-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Cc: Rafael. J. Wysocki <rafael@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    | 3 +--
 fs/proc/meminfo.c      | 2 +-
 include/linux/mmzone.h | 3 ++-
 mm/filemap.c           | 2 +-
 mm/huge_memory.c       | 5 ++++-
 mm/khugepaged.c        | 4 +++-
 mm/memcontrol.c        | 5 ++---
 7 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 6da0c3508bc9..d5952f754911 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -466,8 +466,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 				    HPAGE_PMD_NR),
 			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
 				    HPAGE_PMD_NR),
-			     nid, K(node_page_state(pgdat, NR_FILE_THPS) *
-				    HPAGE_PMD_NR),
+			     nid, K(node_page_state(pgdat, NR_FILE_THPS)),
 			     nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
 				    HPAGE_PMD_NR)
 #endif
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a635c8a84ddf..7ea4679880c8 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -135,7 +135,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "ShmemPmdMapped: ",
 		    global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
 	show_val_kb(m, "FileHugePages:  ",
-		    global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
+		    global_node_page_state(NR_FILE_THPS));
 	show_val_kb(m, "FilePmdMapped:  ",
 		    global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
 #endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 67d50ef5dd20..b751a9898bb6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -219,7 +219,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 		return false;
 
-	return item == NR_ANON_THPS;
+	return item == NR_ANON_THPS ||
+	       item == NR_FILE_THPS;
 }
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 820d81901eae..9efe059e2b58 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -208,7 +208,7 @@ static void unaccount_page_cache_page(struct address_space *mapping,
 		if (PageTransHuge(page))
 			__dec_lruvec_page_state(page, NR_SHMEM_THPS);
 	} else if (PageTransHuge(page)) {
-		__dec_lruvec_page_state(page, NR_FILE_THPS);
+		__mod_lruvec_page_state(page, NR_FILE_THPS, -nr);
 		filemap_nr_thps_dec(mapping);
 	}
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3691e863070a..77181faa2340 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2752,10 +2752,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		}
 		spin_unlock(&ds_queue->split_queue_lock);
 		if (mapping) {
+			int nr = thp_nr_pages(head);
+
 			if (PageSwapBacked(head))
 				__dec_lruvec_page_state(head, NR_SHMEM_THPS);
 			else
-				__dec_lruvec_page_state(head, NR_FILE_THPS);
+				__mod_lruvec_page_state(head, NR_FILE_THPS,
+							-nr);
 		}
 
 		__split_huge_page(page, list, end);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index fb0fdaec34d5..c427fe2ca7ff 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1643,6 +1643,7 @@ static void collapse_file(struct mm_struct *mm,
 	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
 	int nr_none = 0, result = SCAN_SUCCEED;
 	bool is_shmem = shmem_file(file);
+	int nr;
 
 	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
 	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
@@ -1854,11 +1855,12 @@ out_unlock:
 		put_page(page);
 		goto xa_unlocked;
 	}
+	nr = thp_nr_pages(new_page);
 
 	if (is_shmem)
 		__inc_lruvec_page_state(new_page, NR_SHMEM_THPS);
 	else {
-		__inc_lruvec_page_state(new_page, NR_FILE_THPS);
+		__mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
 		filemap_nr_thps_inc(mapping);
 	}
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b2405f049006..fc552f57d3fb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1534,7 +1534,7 @@ static struct memory_stat memory_stats[] = {
 	 * constant(e.g. powerpc).
 	 */
 	{ "anon_thp", PAGE_SIZE, NR_ANON_THPS },
-	{ "file_thp", 0, NR_FILE_THPS },
+	{ "file_thp", PAGE_SIZE, NR_FILE_THPS },
 	{ "shmem_thp", 0, NR_SHMEM_THPS },
 #endif
 	{ "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
@@ -1566,8 +1566,7 @@ static int __init memory_stats_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		if (memory_stats[i].idx == NR_FILE_THPS ||
-		    memory_stats[i].idx == NR_SHMEM_THPS)
+		if (memory_stats[i].idx == NR_SHMEM_THPS)
 			memory_stats[i].ratio = HPAGE_PMD_SIZE;
 #endif
 		VM_BUG_ON(!memory_stats[i].ratio);
-- 
cgit v1.2.3


From 57b2847d3c1dc154923578efb47a12302a57d700 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 24 Feb 2021 12:03:31 -0800
Subject: mm: memcontrol: convert NR_SHMEM_THPS account to pages

Currently we use struct per_cpu_nodestat to cache the vmstat counters,
which leads to inaccurate statistics especially THP vmstat counters.  In
the systems with hundreds of processors it can be GBs of memory.  For
example, for a 96 CPUs system, the threshold is the maximum number of 125.
And the per cpu counters can cache 23.4375 GB in total.

The THP page is already a form of batched addition (it will add 512 worth
of memory in one go) so skipping the batching seems like sensible.
Although every THP stats update overflows the per-cpu counter, resorting
to atomic global updates.  But it can make the statistics more accuracy
for the THP vmstat counters.

So we convert the NR_SHMEM_THPS account to pages.  This patch is
consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page
arrival").  Doing this also can make the unit of vmstat counters more
unified.  Finally, the unit of the vmstat counters are pages, kB and
bytes.  The B/KB suffix can tell us that the unit is bytes or kB.  The
rest which is without suffix are pages.

Link: https://lkml.kernel.org/r/20201228164110.2838-5-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Cc: Rafael. J. Wysocki <rafael@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    |  3 +--
 fs/proc/meminfo.c      |  2 +-
 include/linux/mmzone.h |  3 ++-
 mm/filemap.c           |  2 +-
 mm/huge_memory.c       |  3 ++-
 mm/khugepaged.c        |  2 +-
 mm/memcontrol.c        | 26 ++------------------------
 mm/page_alloc.c        |  2 +-
 mm/shmem.c             |  2 +-
 9 files changed, 12 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index d5952f754911..6d5ac6ffb6e1 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -462,8 +462,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 			     ,
 			     nid, K(node_page_state(pgdat, NR_ANON_THPS)),
-			     nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
-				    HPAGE_PMD_NR),
+			     nid, K(node_page_state(pgdat, NR_SHMEM_THPS)),
 			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
 				    HPAGE_PMD_NR),
 			     nid, K(node_page_state(pgdat, NR_FILE_THPS)),
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 7ea4679880c8..cfb107eaa3e6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -131,7 +131,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "AnonHugePages:  ",
 		    global_node_page_state(NR_ANON_THPS));
 	show_val_kb(m, "ShmemHugePages: ",
-		    global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
+		    global_node_page_state(NR_SHMEM_THPS));
 	show_val_kb(m, "ShmemPmdMapped: ",
 		    global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
 	show_val_kb(m, "FileHugePages:  ",
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b751a9898bb6..788837f40b38 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -220,7 +220,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
 		return false;
 
 	return item == NR_ANON_THPS ||
-	       item == NR_FILE_THPS;
+	       item == NR_FILE_THPS ||
+	       item == NR_SHMEM_THPS;
 }
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 9efe059e2b58..46a8b9e82434 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -206,7 +206,7 @@ static void unaccount_page_cache_page(struct address_space *mapping,
 	if (PageSwapBacked(page)) {
 		__mod_lruvec_page_state(page, NR_SHMEM, -nr);
 		if (PageTransHuge(page))
-			__dec_lruvec_page_state(page, NR_SHMEM_THPS);
+			__mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr);
 	} else if (PageTransHuge(page)) {
 		__mod_lruvec_page_state(page, NR_FILE_THPS, -nr);
 		filemap_nr_thps_dec(mapping);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 77181faa2340..86a3015ba54f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2755,7 +2755,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 			int nr = thp_nr_pages(head);
 
 			if (PageSwapBacked(head))
-				__dec_lruvec_page_state(head, NR_SHMEM_THPS);
+				__mod_lruvec_page_state(head, NR_SHMEM_THPS,
+							-nr);
 			else
 				__mod_lruvec_page_state(head, NR_FILE_THPS,
 							-nr);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c427fe2ca7ff..75e246f680f4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1858,7 +1858,7 @@ out_unlock:
 	nr = thp_nr_pages(new_page);
 
 	if (is_shmem)
-		__inc_lruvec_page_state(new_page, NR_SHMEM_THPS);
+		__mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
 	else {
 		__mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
 		filemap_nr_thps_inc(mapping);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fc552f57d3fb..d3a0c59210e7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1516,7 +1516,7 @@ struct memory_stat {
 	unsigned int idx;
 };
 
-static struct memory_stat memory_stats[] = {
+static const struct memory_stat memory_stats[] = {
 	{ "anon", PAGE_SIZE, NR_ANON_MAPPED },
 	{ "file", PAGE_SIZE, NR_FILE_PAGES },
 	{ "kernel_stack", 1024, NR_KERNEL_STACK_KB },
@@ -1528,14 +1528,9 @@ static struct memory_stat memory_stats[] = {
 	{ "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
 	{ "file_writeback", PAGE_SIZE, NR_WRITEBACK },
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	/*
-	 * The ratio will be initialized in memory_stats_init(). Because
-	 * on some architectures, the macro of HPAGE_PMD_SIZE is not
-	 * constant(e.g. powerpc).
-	 */
 	{ "anon_thp", PAGE_SIZE, NR_ANON_THPS },
 	{ "file_thp", PAGE_SIZE, NR_FILE_THPS },
-	{ "shmem_thp", 0, NR_SHMEM_THPS },
+	{ "shmem_thp", PAGE_SIZE, NR_SHMEM_THPS },
 #endif
 	{ "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
 	{ "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
@@ -1560,23 +1555,6 @@ static struct memory_stat memory_stats[] = {
 	{ "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
 };
 
-static int __init memory_stats_init(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		if (memory_stats[i].idx == NR_SHMEM_THPS)
-			memory_stats[i].ratio = HPAGE_PMD_SIZE;
-#endif
-		VM_BUG_ON(!memory_stats[i].ratio);
-		VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
-	}
-
-	return 0;
-}
-pure_initcall(memory_stats_init);
-
 static char *memory_stat_format(struct mem_cgroup *memcg)
 {
 	struct seq_buf s;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2e2e47f8714b..df292d8e659b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5584,7 +5584,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(node_page_state(pgdat, NR_WRITEBACK)),
 			K(node_page_state(pgdat, NR_SHMEM)),
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-			K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
+			K(node_page_state(pgdat, NR_SHMEM_THPS)),
 			K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
 					* HPAGE_PMD_NR),
 			K(node_page_state(pgdat, NR_ANON_THPS)),
diff --git a/mm/shmem.c b/mm/shmem.c
index 7924b3bf46fb..ff741d229701 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -713,7 +713,7 @@ next:
 		}
 		if (PageTransHuge(page)) {
 			count_vm_event(THP_FILE_ALLOC);
-			__inc_lruvec_page_state(page, NR_SHMEM_THPS);
+			__mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
 		}
 		mapping->nrpages += nr;
 		__mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
-- 
cgit v1.2.3


From a1528e21f8915e16252cda1137fe29672c918361 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 24 Feb 2021 12:03:35 -0800
Subject: mm: memcontrol: convert NR_SHMEM_PMDMAPPED account to pages

Currently we use struct per_cpu_nodestat to cache the vmstat counters,
which leads to inaccurate statistics especially THP vmstat counters.  In
the systems with hundreds of processors it can be GBs of memory.  For
example, for a 96 CPUs system, the threshold is the maximum number of 125.
And the per cpu counters can cache 23.4375 GB in total.

The THP page is already a form of batched addition (it will add 512 worth
of memory in one go) so skipping the batching seems like sensible.
Although every THP stats update overflows the per-cpu counter, resorting
to atomic global updates.  But it can make the statistics more accuracy
for the THP vmstat counters.

So we convert the NR_SHMEM_PMDMAPPED account to pages.  This patch is
consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page
arrival").  Doing this also can make the unit of vmstat counters more
unified.  Finally, the unit of the vmstat counters are pages, kB and
bytes.  The B/KB suffix can tell us that the unit is bytes or kB.  The
rest which is without suffix are pages.

Link: https://lkml.kernel.org/r/20201228164110.2838-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Cc: Rafael. J. Wysocki <rafael@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    |  3 +--
 fs/proc/meminfo.c      |  2 +-
 include/linux/mmzone.h |  3 ++-
 mm/page_alloc.c        |  3 +--
 mm/rmap.c              | 14 ++++++++++----
 5 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 6d5ac6ffb6e1..7a66aefe4e46 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -463,8 +463,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 			     ,
 			     nid, K(node_page_state(pgdat, NR_ANON_THPS)),
 			     nid, K(node_page_state(pgdat, NR_SHMEM_THPS)),
-			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
-				    HPAGE_PMD_NR),
+			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
 			     nid, K(node_page_state(pgdat, NR_FILE_THPS)),
 			     nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
 				    HPAGE_PMD_NR)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index cfb107eaa3e6..c61f440570f9 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -133,7 +133,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "ShmemHugePages: ",
 		    global_node_page_state(NR_SHMEM_THPS));
 	show_val_kb(m, "ShmemPmdMapped: ",
-		    global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
+		    global_node_page_state(NR_SHMEM_PMDMAPPED));
 	show_val_kb(m, "FileHugePages:  ",
 		    global_node_page_state(NR_FILE_THPS));
 	show_val_kb(m, "FilePmdMapped:  ",
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 788837f40b38..7bdbfeeb5c8c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -221,7 +221,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
 
 	return item == NR_ANON_THPS ||
 	       item == NR_FILE_THPS ||
-	       item == NR_SHMEM_THPS;
+	       item == NR_SHMEM_THPS ||
+	       item == NR_SHMEM_PMDMAPPED;
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df292d8e659b..069561aadc7b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5585,8 +5585,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(node_page_state(pgdat, NR_SHMEM)),
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 			K(node_page_state(pgdat, NR_SHMEM_THPS)),
-			K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
-					* HPAGE_PMD_NR),
+			K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
 			K(node_page_state(pgdat, NR_ANON_THPS)),
 #endif
 			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
diff --git a/mm/rmap.c b/mm/rmap.c
index c4d5c63cfd29..1c1b576c0627 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1211,14 +1211,17 @@ void page_add_file_rmap(struct page *page, bool compound)
 	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
 	lock_page_memcg(page);
 	if (compound && PageTransHuge(page)) {
-		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+		int nr_pages = thp_nr_pages(page);
+
+		for (i = 0, nr = 0; i < nr_pages; i++) {
 			if (atomic_inc_and_test(&page[i]._mapcount))
 				nr++;
 		}
 		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
 			goto out;
 		if (PageSwapBacked(page))
-			__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+			__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
+						nr_pages);
 		else
 			__inc_node_page_state(page, NR_FILE_PMDMAPPED);
 	} else {
@@ -1252,14 +1255,17 @@ static void page_remove_file_rmap(struct page *page, bool compound)
 
 	/* page still mapped by someone else? */
 	if (compound && PageTransHuge(page)) {
-		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+		int nr_pages = thp_nr_pages(page);
+
+		for (i = 0, nr = 0; i < nr_pages; i++) {
 			if (atomic_add_negative(-1, &page[i]._mapcount))
 				nr++;
 		}
 		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
 			return;
 		if (PageSwapBacked(page))
-			__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+			__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
+						-nr_pages);
 		else
 			__dec_node_page_state(page, NR_FILE_PMDMAPPED);
 	} else {
-- 
cgit v1.2.3


From 380780e71895ae301505ffcec8f954ab3666a4c7 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 24 Feb 2021 12:03:39 -0800
Subject: mm: memcontrol: convert NR_FILE_PMDMAPPED account to pages

Currently we use struct per_cpu_nodestat to cache the vmstat counters,
which leads to inaccurate statistics especially THP vmstat counters.  In
the systems with hundreds of processors it can be GBs of memory.  For
example, for a 96 CPUs system, the threshold is the maximum number of 125.
And the per cpu counters can cache 23.4375 GB in total.

The THP page is already a form of batched addition (it will add 512 worth
of memory in one go) so skipping the batching seems like sensible.
Although every THP stats update overflows the per-cpu counter, resorting
to atomic global updates.  But it can make the statistics more accuracy
for the THP vmstat counters.

So we convert the NR_FILE_PMDMAPPED account to pages.  This patch is
consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page
arrival").  Doing this also can make the unit of vmstat counters more
unified.  Finally, the unit of the vmstat counters are pages, kB and
bytes.  The B/KB suffix can tell us that the unit is bytes or kB.  The
rest which is without suffix are pages.

Link: https://lkml.kernel.org/r/20201228164110.2838-7-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Cc: Rafael. J. Wysocki <rafael@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    | 3 +--
 fs/proc/meminfo.c      | 2 +-
 include/linux/mmzone.h | 3 ++-
 mm/rmap.c              | 6 ++++--
 4 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 7a66aefe4e46..d02d86aec19f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -465,8 +465,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 			     nid, K(node_page_state(pgdat, NR_SHMEM_THPS)),
 			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
 			     nid, K(node_page_state(pgdat, NR_FILE_THPS)),
-			     nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
-				    HPAGE_PMD_NR)
+			     nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED))
 #endif
 			    );
 	len += hugetlb_report_node_meminfo(buf, len, nid);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index c61f440570f9..6fa761c9cc78 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -137,7 +137,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "FileHugePages:  ",
 		    global_node_page_state(NR_FILE_THPS));
 	show_val_kb(m, "FilePmdMapped:  ",
-		    global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
+		    global_node_page_state(NR_FILE_PMDMAPPED));
 #endif
 
 #ifdef CONFIG_CMA
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7bdbfeeb5c8c..66d68e5d5a0f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -222,7 +222,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
 	return item == NR_ANON_THPS ||
 	       item == NR_FILE_THPS ||
 	       item == NR_SHMEM_THPS ||
-	       item == NR_SHMEM_PMDMAPPED;
+	       item == NR_SHMEM_PMDMAPPED ||
+	       item == NR_FILE_PMDMAPPED;
 }
 
 /*
diff --git a/mm/rmap.c b/mm/rmap.c
index 1c1b576c0627..5ebf16fae4b9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1223,7 +1223,8 @@ void page_add_file_rmap(struct page *page, bool compound)
 			__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
 						nr_pages);
 		else
-			__inc_node_page_state(page, NR_FILE_PMDMAPPED);
+			__mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
+						nr_pages);
 	} else {
 		if (PageTransCompound(page) && page_mapping(page)) {
 			VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1267,7 +1268,8 @@ static void page_remove_file_rmap(struct page *page, bool compound)
 			__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
 						-nr_pages);
 		else
-			__dec_node_page_state(page, NR_FILE_PMDMAPPED);
+			__mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
+						-nr_pages);
 	} else {
 		if (!atomic_add_negative(-1, &page->_mapcount))
 			return;
-- 
cgit v1.2.3


From b6038942480e574c697ea1a80019bbe586c1d654 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Wed, 24 Feb 2021 12:03:55 -0800
Subject: mm: memcg: add swapcache stat for memcg v2

This patch adds swapcache stat for the cgroup v2.  The swapcache
represents the memory that is accounted against both the memory and the
swap limit of the cgroup.  The main motivation behind exposing the
swapcache stat is for enabling users to gracefully migrate from cgroup
v1's memsw counter to cgroup v2's memory and swap counters.

Cgroup v1's memsw limit allows users to limit the memory+swap usage of a
workload but without control on the exact proportion of memory and swap.
Cgroup v2 provides separate limits for memory and swap which enables more
control on the exact usage of memory and swap individually for the
workload.

With some little subtleties, the v1's memsw limit can be switched with the
sum of the v2's memory and swap limits.  However the alternative for memsw
usage is not yet available in cgroup v2.  Exposing per-cgroup swapcache
stat enables that alternative.  Adding the memory usage and swap usage and
subtracting the swapcache will approximate the memsw usage.  This will
help in the transparent migration of the workloads depending on memsw
usage and limit to v2' memory and swap counters.

The reasons these applications are still interested in this approximate
memsw usage are: (1) these applications are not really interested in two
separate memory and swap usage metrics.  A single usage metric is more
simple to use and reason about for them.

(2) The memsw usage metric hides the underlying system's swap setup from
the applications.  Applications with multiple instances running in a
datacenter with heterogeneous systems (some have swap and some don't) will
keep seeing a consistent view of their usage.

[akpm@linux-foundation.org: fix CONFIG_SWAP=n build]

Link: https://lkml.kernel.org/r/20210108155813.2914586-3-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst |  4 ++++
 drivers/base/node.c                     |  6 ++++++
 include/linux/mmzone.h                  |  3 +++
 include/linux/swap.h                    |  6 +++++-
 mm/memcontrol.c                         |  3 +++
 mm/migrate.c                            |  6 ++++++
 mm/swap_state.c                         | 28 ++--------------------------
 mm/vmstat.c                             |  3 +++
 8 files changed, 32 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index c513eafaddea..9acc57f68f31 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1299,6 +1299,10 @@ PAGE_SIZE multiple when read back.
 		Amount of cached filesystem data that was modified and
 		is currently being written back to disk
 
+	  swapcached
+		Amount of swap cached in memory. The swapcache is accounted
+		against both memory and swap usage.
+
 	  anon_thp
 		Amount of memory used in anonymous mappings backed by
 		transparent hugepages
diff --git a/drivers/base/node.c b/drivers/base/node.c
index d02d86aec19f..f449dbb2c746 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -372,14 +372,19 @@ static ssize_t node_read_meminfo(struct device *dev,
 	struct pglist_data *pgdat = NODE_DATA(nid);
 	struct sysinfo i;
 	unsigned long sreclaimable, sunreclaimable;
+	unsigned long swapcached = 0;
 
 	si_meminfo_node(&i, nid);
 	sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B);
 	sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B);
+#ifdef CONFIG_SWAP
+	swapcached = node_page_state_pages(pgdat, NR_SWAPCACHE);
+#endif
 	len = sysfs_emit_at(buf, len,
 			    "Node %d MemTotal:       %8lu kB\n"
 			    "Node %d MemFree:        %8lu kB\n"
 			    "Node %d MemUsed:        %8lu kB\n"
+			    "Node %d SwapCached:     %8lu kB\n"
 			    "Node %d Active:         %8lu kB\n"
 			    "Node %d Inactive:       %8lu kB\n"
 			    "Node %d Active(anon):   %8lu kB\n"
@@ -391,6 +396,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 			    nid, K(i.totalram),
 			    nid, K(i.freeram),
 			    nid, K(i.totalram - i.freeram),
+			    nid, K(swapcached),
 			    nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
 				   node_page_state(pgdat, NR_ACTIVE_FILE)),
 			    nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 66d68e5d5a0f..fc99e9241846 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -206,6 +206,9 @@ enum node_stat_item {
 	NR_KERNEL_SCS_KB,	/* measured in KiB */
 #endif
 	NR_PAGETABLE,		/* used for pagetables */
+#ifdef CONFIG_SWAP
+	NR_SWAPCACHE,
+#endif
 	NR_VM_NODE_STAT_ITEMS
 };
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3f1f7ae0fbe9..0d64a2bc7e00 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -408,7 +408,11 @@ extern struct address_space *swapper_spaces[];
 #define swap_address_space(entry)			    \
 	(&swapper_spaces[swp_type(entry)][swp_offset(entry) \
 		>> SWAP_ADDRESS_SPACE_SHIFT])
-extern unsigned long total_swapcache_pages(void);
+static inline unsigned long total_swapcache_pages(void)
+{
+	return global_node_page_state(NR_SWAPCACHE);
+}
+
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *page);
 extern void *get_shadow_from_swap_cache(swp_entry_t entry);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5435b370c929..58edfb6614b8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1521,6 +1521,9 @@ static const struct memory_stat memory_stats[] = {
 	{ "file_mapped",		NR_FILE_MAPPED			},
 	{ "file_dirty",			NR_FILE_DIRTY			},
 	{ "file_writeback",		NR_WRITEBACK			},
+#ifdef CONFIG_SWAP
+	{ "swapcached",			NR_SWAPCACHE			},
+#endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	{ "anon_thp",			NR_ANON_THPS			},
 	{ "file_thp",			NR_FILE_THPS			},
diff --git a/mm/migrate.c b/mm/migrate.c
index 4ec727adfaa2..62b81d5257aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -500,6 +500,12 @@ int migrate_page_move_mapping(struct address_space *mapping,
 			__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
 			__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
 		}
+#ifdef CONFIG_SWAP
+		if (PageSwapCache(page)) {
+			__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
+			__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
+		}
+#endif
 		if (dirty && mapping_can_writeback(mapping)) {
 			__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
 			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 807b00eae969..c1a648d9092b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -68,32 +68,6 @@ static struct {
 	unsigned long find_total;
 } swap_cache_info;
 
-unsigned long total_swapcache_pages(void)
-{
-	unsigned int i, j, nr;
-	unsigned long ret = 0;
-	struct address_space *spaces;
-	struct swap_info_struct *si;
-
-	for (i = 0; i < MAX_SWAPFILES; i++) {
-		swp_entry_t entry = swp_entry(i, 1);
-
-		/* Avoid get_swap_device() to warn for bad swap entry */
-		if (!swp_swap_info(entry))
-			continue;
-		/* Prevent swapoff to free swapper_spaces */
-		si = get_swap_device(entry);
-		if (!si)
-			continue;
-		nr = nr_swapper_spaces[i];
-		spaces = swapper_spaces[i];
-		for (j = 0; j < nr; j++)
-			ret += spaces[j].nrpages;
-		put_swap_device(si);
-	}
-	return ret;
-}
-
 static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
 
 void show_swap_cache_info(void)
@@ -163,6 +137,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
 		address_space->nrexceptional -= nr_shadows;
 		address_space->nrpages += nr;
 		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
+		__mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
 		ADD_CACHE_INFO(add_total, nr);
 unlock:
 		xas_unlock_irq(&xas);
@@ -203,6 +178,7 @@ void __delete_from_swap_cache(struct page *page,
 		address_space->nrexceptional += nr;
 	address_space->nrpages -= nr;
 	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+	__mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
 	ADD_CACHE_INFO(del_total, nr);
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 07dc0af50cf0..a0e949542204 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1215,6 +1215,9 @@ const char * const vmstat_text[] = {
 	"nr_shadow_call_stack",
 #endif
 	"nr_page_table_pages",
+#ifdef CONFIG_SWAP
+	"nr_swapcached",
+#endif
 
 	/* enum writeback_stat_item counters */
 	"nr_dirty_threshold",
-- 
cgit v1.2.3


From c1a660dea3fa616420606f1e206e6d22f7e05c30 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 24 Feb 2021 12:03:58 -0800
Subject: mm: kmem: make __memcg_kmem_(un)charge static

I've noticed that __memcg_kmem_charge() and __memcg_kmem_uncharge() are
not used anywhere except memcontrol.c.  Yet they are not declared as
non-static and are declared in memcontrol.h.

This patch makes them static.

Link: https://lkml.kernel.org/r/20210108020332.4096911-1-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  3 ---
 mm/memcontrol.c            | 11 ++++++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 41bbf71edd9f..7a38a1517a05 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1592,9 +1592,6 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
 #endif
 
 #ifdef CONFIG_MEMCG_KMEM
-int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
-			unsigned int nr_pages);
-void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
 void __memcg_kmem_uncharge_page(struct page *page, int order);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 58edfb6614b8..dce5291525a5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -255,6 +255,11 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 #ifdef CONFIG_MEMCG_KMEM
 extern spinlock_t css_set_lock;
 
+static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+			       unsigned int nr_pages);
+static void __memcg_kmem_uncharge(struct mem_cgroup *memcg,
+				  unsigned int nr_pages);
+
 static void obj_cgroup_release(struct percpu_ref *ref)
 {
 	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
@@ -3087,8 +3092,8 @@ static void memcg_free_cache_id(int id)
  *
  * Returns 0 on success, an error code on failure.
  */
-int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
-			unsigned int nr_pages)
+static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+			       unsigned int nr_pages)
 {
 	struct page_counter *counter;
 	int ret;
@@ -3120,7 +3125,7 @@ int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
  * @memcg: memcg to uncharge
  * @nr_pages: number of pages to uncharge
  */
-void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		page_counter_uncharge(&memcg->kmem, nr_pages);
-- 
cgit v1.2.3


From 802f1d522d5fdaefc2b935141bc8fe03d43a99ab Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Wed, 24 Feb 2021 12:04:01 -0800
Subject: mm: page_counter: re-layout structure to reduce false sharing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When checking a memory cgroup related performance regression [1], from the
perf c2c profiling data, we found high false sharing for accessing 'usage'
and 'parent'.

On 64 bit system, the 'usage' and 'parent' are close to each other, and
easy to be in one cacheline (for cacheline size == 64+ B).  'usage' is
usally written, while 'parent' is usually read as the cgroup's
hierarchical counting nature.

So move the 'parent' to the end of the structure to make sure they
are in different cache lines.

Following are some performance data with the patch, against v5.11-rc1.  [
In the data, A means a platform with 2 sockets 48C/96T, B is a platform of
4 sockests 72C/144T, and if a %stddev will be shown bigger than 2%,
P100/P50 means number of test tasks equals to 100%/50% of nr_cpu]

will-it-scale/malloc1
---------------------
	   v5.11-rc1			v5.11-rc1+patch

A-P100	     15782 ±  2%      -0.1%      15765 ±  3%  will-it-scale.per_process_ops
A-P50	     21511            +8.9%      23432        will-it-scale.per_process_ops
B-P100	      9155            +2.2%       9357        will-it-scale.per_process_ops
B-P50	     10967            +7.1%      11751 ±  2%  will-it-scale.per_process_ops

will-it-scale/pagefault2
------------------------
	   v5.11-rc1			v5.11-rc1+patch

A-P100	     79028            +3.0%      81411        will-it-scale.per_process_ops
A-P50	    183960 ±  2%      +4.4%     192078 ±  2%  will-it-scale.per_process_ops
B-P100	     85966            +9.9%      94467 ±  3%  will-it-scale.per_process_ops
B-P50	    198195            +9.8%     217526        will-it-scale.per_process_ops

fio (4k/1M is block size)
-------------------------
	   v5.11-rc1			v5.11-rc1+patch

A-P50-r-4k     16881 ±  2%    +1.2%      17081 ±  2%  fio.read_bw_MBps
A-P50-w-4k      3931          +4.5%       4111 ±  2%  fio.write_bw_MBps
A-P50-r-1M     15178          -0.2%      15154        fio.read_bw_MBps
A-P50-w-1M      3924          +0.1%       3929        fio.write_bw_MBps

[1].https://lore.kernel.org/lkml/20201102091543.GM31092@shao2-debian/

Link: https://lkml.kernel.org/r/1611040814-33449-1-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_counter.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 85bd413e784e..679591301994 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -12,7 +12,6 @@ struct page_counter {
 	unsigned long low;
 	unsigned long high;
 	unsigned long max;
-	struct page_counter *parent;
 
 	/* effective memory.min and memory.min usage tracking */
 	unsigned long emin;
@@ -27,6 +26,14 @@ struct page_counter {
 	/* legacy */
 	unsigned long watermark;
 	unsigned long failcnt;
+
+	/*
+	 * 'parent' is placed here to be far from 'usage' to reduce
+	 * cache false sharing, as 'usage' is written mostly while
+	 * parent is frequently read for cgroup's hierarchical
+	 * counting nature.
+	 */
+	struct page_counter *parent;
 };
 
 #if BITS_PER_LONG == 32
-- 
cgit v1.2.3


From 6eeb104e114cb6b7391c2d69ff873403858c1f35 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 24 Feb 2021 12:04:15 -0800
Subject: fs: buffer: use raw page_memcg() on locked page

alloc_page_buffers() currently uses get_mem_cgroup_from_page() for
charging the buffers to the page owner, which does an rcu-protected
page->memcg lookup and acquires a reference.  But buffer allocation has
the page lock held throughout, which pins the page to the memcg and
thereby the memcg - neither rcu nor holding an extra reference during the
allocation are necessary.  Use a raw page_memcg() instead.

This was the last user of get_mem_cgroup_from_page(), delete it.

Link: https://lkml.kernel.org/r/20210209190126.97842-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c                |  4 ++--
 include/linux/memcontrol.h |  7 -------
 mm/memcontrol.c            | 23 -----------------------
 3 files changed, 2 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index f1c3a5b27a90..0cb7ffd4977c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -847,7 +847,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 	if (retry)
 		gfp |= __GFP_NOFAIL;
 
-	memcg = get_mem_cgroup_from_page(page);
+	/* The page lock pins the memcg */
+	memcg = page_memcg(page);
 	old_memcg = set_active_memcg(memcg);
 
 	head = NULL;
@@ -868,7 +869,6 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 	}
 out:
 	set_active_memcg(old_memcg);
-	mem_cgroup_put(memcg);
 	return head;
 /*
  * In case anything failed, we just free everything we got.
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7a38a1517a05..e6dc793d587d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -680,8 +680,6 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
-struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
-
 struct lruvec *lock_page_lruvec(struct page *page);
 struct lruvec *lock_page_lruvec_irq(struct page *page);
 struct lruvec *lock_page_lruvec_irqsave(struct page *page,
@@ -1191,11 +1189,6 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return NULL;
 }
 
-static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
-{
-	return NULL;
-}
-
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c035846c7a4..7fdc001ce15f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1047,29 +1047,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
 
-/**
- * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
- * @page: page from which memcg should be extracted.
- *
- * Obtain a reference on page->memcg and returns it if successful. Otherwise
- * root_mem_cgroup is returned.
- */
-struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
-{
-	struct mem_cgroup *memcg = page_memcg(page);
-
-	if (mem_cgroup_disabled())
-		return NULL;
-
-	rcu_read_lock();
-	/* Page should not get uncharged and freed memcg under us. */
-	if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
-		memcg = root_mem_cgroup;
-	rcu_read_unlock();
-	return memcg;
-}
-EXPORT_SYMBOL(get_mem_cgroup_from_page);
-
 static __always_inline struct mem_cgroup *active_memcg(void)
 {
 	if (in_interrupt())
-- 
cgit v1.2.3


From 027b37b552f326aa94ef06c7ea77088b16c41e6e Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 24 Feb 2021 12:05:46 -0800
Subject: kasan: move _RET_IP_ to inline wrappers

Generic mm functions that call KASAN annotations that might report a bug
pass _RET_IP_ to them as an argument. This allows KASAN to include the
name of the function that called the mm function in its report's header.

Now that KASAN has inline wrappers for all of its annotations, move
_RET_IP_ to those wrappers to simplify annotation call sites.

Link: https://linux-review.googlesource.com/id/I8fb3c06d49671305ee184175a39591bc26647a67
Link: https://lkml.kernel.org/r/5c1490eddf20b436b8c4eeea83fce47687d5e4a4.1610733117.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 20 +++++++++-----------
 mm/mempool.c          |  2 +-
 mm/slab.c             |  2 +-
 mm/slub.c             |  4 ++--
 4 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 0aea9e2a2a01..a7254186558a 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -185,19 +185,18 @@ static __always_inline void * __must_check kasan_init_slab_obj(
 }
 
 bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip);
-static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object,
-						unsigned long ip)
+static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object)
 {
 	if (kasan_enabled())
-		return __kasan_slab_free(s, object, ip);
+		return __kasan_slab_free(s, object, _RET_IP_);
 	return false;
 }
 
 void __kasan_slab_free_mempool(void *ptr, unsigned long ip);
-static __always_inline void kasan_slab_free_mempool(void *ptr, unsigned long ip)
+static __always_inline void kasan_slab_free_mempool(void *ptr)
 {
 	if (kasan_enabled())
-		__kasan_slab_free_mempool(ptr, ip);
+		__kasan_slab_free_mempool(ptr, _RET_IP_);
 }
 
 void * __must_check __kasan_slab_alloc(struct kmem_cache *s,
@@ -241,10 +240,10 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
 }
 
 void __kasan_kfree_large(void *ptr, unsigned long ip);
-static __always_inline void kasan_kfree_large(void *ptr, unsigned long ip)
+static __always_inline void kasan_kfree_large(void *ptr)
 {
 	if (kasan_enabled())
-		__kasan_kfree_large(ptr, ip);
+		__kasan_kfree_large(ptr, _RET_IP_);
 }
 
 bool kasan_save_enable_multi_shot(void);
@@ -277,12 +276,11 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
 {
 	return (void *)object;
 }
-static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
-				   unsigned long ip)
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object)
 {
 	return false;
 }
-static inline void kasan_slab_free_mempool(void *ptr, unsigned long ip) {}
+static inline void kasan_slab_free_mempool(void *ptr) {}
 static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
 				   gfp_t flags)
 {
@@ -302,7 +300,7 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
 {
 	return (void *)object;
 }
-static inline void kasan_kfree_large(void *ptr, unsigned long ip) {}
+static inline void kasan_kfree_large(void *ptr) {}
 
 #endif /* CONFIG_KASAN */
 
diff --git a/mm/mempool.c b/mm/mempool.c
index 624ed51b060f..79959fac27d7 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -104,7 +104,7 @@ static inline void poison_element(mempool_t *pool, void *element)
 static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
 {
 	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
-		kasan_slab_free_mempool(element, _RET_IP_);
+		kasan_slab_free_mempool(element);
 	else if (pool->alloc == mempool_alloc_pages)
 		kasan_free_pages(element, (unsigned long)pool->pool_data);
 }
diff --git a/mm/slab.c b/mm/slab.c
index 9c9e0c255c4b..35c68d99d460 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3420,7 +3420,7 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
 		memset(objp, 0, cachep->object_size);
 
 	/* Put the object into the quarantine, don't touch it for now. */
-	if (kasan_slab_free(cachep, objp, _RET_IP_))
+	if (kasan_slab_free(cachep, objp))
 		return;
 
 	/* Use KCSAN to help debug racy use-after-free. */
diff --git a/mm/slub.c b/mm/slub.c
index 046d7194acaf..b2833ce85c92 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1528,7 +1528,7 @@ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
 static __always_inline void kfree_hook(void *x)
 {
 	kmemleak_free(x);
-	kasan_kfree_large(x, _RET_IP_);
+	kasan_kfree_large(x);
 }
 
 static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
@@ -1558,7 +1558,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
 				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
 
 	/* KASAN might put x into memory quarantine, delaying its reuse */
-	return kasan_slab_free(s, x, _RET_IP_);
+	return kasan_slab_free(s, x);
 }
 
 static inline bool slab_free_freelist_hook(struct kmem_cache *s,
-- 
cgit v1.2.3


From 611806b4bf8dd97a4f3d73f5cf3c2c7730c51eb2 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 24 Feb 2021 12:05:50 -0800
Subject: kasan: fix bug detection via ksize for HW_TAGS mode

The currently existing kasan_check_read/write() annotations are intended
to be used for kernel modules that have KASAN compiler instrumentation
disabled. Thus, they are only relevant for the software KASAN modes that
rely on compiler instrumentation.

However there's another use case for these annotations: ksize() checks
that the object passed to it is indeed accessible before unpoisoning the
whole object. This is currently done via __kasan_check_read(), which is
compiled away for the hardware tag-based mode that doesn't rely on
compiler instrumentation. This leads to KASAN missing detecting some
memory corruptions.

Provide another annotation called kasan_check_byte() that is available
for all KASAN modes. As the implementation rename and reuse
kasan_check_invalid_free(). Use this new annotation in ksize().
To avoid having ksize() as the top frame in the reported stack trace
pass _RET_IP_ to __kasan_check_byte().

Also add a new ksize_uaf() test that checks that a use-after-free is
detected via ksize() itself, and via plain accesses that happen later.

Link: https://linux-review.googlesource.com/id/Iaabf771881d0f9ce1b969f2a62938e99d3308ec5
Link: https://lkml.kernel.org/r/f32ad74a60b28d8402482a38476f02bb7600f620.1610733117.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan-checks.h |  6 ++++++
 include/linux/kasan.h        | 17 +++++++++++++++++
 lib/test_kasan.c             | 20 ++++++++++++++++++++
 mm/kasan/common.c            | 11 ++++++++++-
 mm/kasan/generic.c           |  4 ++--
 mm/kasan/kasan.h             | 10 +++++-----
 mm/kasan/sw_tags.c           |  6 +++---
 mm/slab_common.c             | 16 +++++++++-------
 8 files changed, 72 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
index ca5e89fb10d3..3d6d22a25bdc 100644
--- a/include/linux/kasan-checks.h
+++ b/include/linux/kasan-checks.h
@@ -4,6 +4,12 @@
 
 #include <linux/types.h>
 
+/*
+ * The annotations present in this file are only relevant for the software
+ * KASAN modes that rely on compiler instrumentation, and will be optimized
+ * away for the hardware tag-based KASAN mode. Use kasan_check_byte() instead.
+ */
+
 /*
  * __kasan_check_*: Always available when KASAN is enabled. This may be used
  * even in compilation units that selectively disable KASAN, but must use KASAN
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index a7254186558a..7eaf2d9effb4 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -246,6 +246,19 @@ static __always_inline void kasan_kfree_large(void *ptr)
 		__kasan_kfree_large(ptr, _RET_IP_);
 }
 
+/*
+ * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for
+ * the hardware tag-based mode that doesn't rely on compiler instrumentation.
+ */
+bool __kasan_check_byte(const void *addr, unsigned long ip);
+static __always_inline bool kasan_check_byte(const void *addr)
+{
+	if (kasan_enabled())
+		return __kasan_check_byte(addr, _RET_IP_);
+	return true;
+}
+
+
 bool kasan_save_enable_multi_shot(void);
 void kasan_restore_multi_shot(bool enabled);
 
@@ -301,6 +314,10 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
 	return (void *)object;
 }
 static inline void kasan_kfree_large(void *ptr) {}
+static inline bool kasan_check_byte(const void *address)
+{
+	return true;
+}
 
 #endif /* CONFIG_KASAN */
 
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index e59f185b8075..3f771fabd0ec 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -496,6 +496,7 @@ static void kasan_global_oob(struct kunit *test)
 	KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
 }
 
+/* Check that ksize() makes the whole object accessible. */
 static void ksize_unpoisons_memory(struct kunit *test)
 {
 	char *ptr;
@@ -514,6 +515,24 @@ static void ksize_unpoisons_memory(struct kunit *test)
 	kfree(ptr);
 }
 
+/*
+ * Check that a use-after-free is detected by ksize() and via normal accesses
+ * after it.
+ */
+static void ksize_uaf(struct kunit *test)
+{
+	char *ptr;
+	int size = 128 - KASAN_GRANULE_SIZE;
+
+	ptr = kmalloc(size, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+	kfree(ptr);
+
+	KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr));
+	KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *ptr);
+	KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *(ptr + size));
+}
+
 static void kasan_stack_oob(struct kunit *test)
 {
 	char stack_array[10];
@@ -907,6 +926,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
 	KUNIT_CASE(kasan_alloca_oob_left),
 	KUNIT_CASE(kasan_alloca_oob_right),
 	KUNIT_CASE(ksize_unpoisons_memory),
+	KUNIT_CASE(ksize_uaf),
 	KUNIT_CASE(kmem_cache_double_free),
 	KUNIT_CASE(kmem_cache_invalid_free),
 	KUNIT_CASE(kasan_memchr),
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index eedc3e0fe365..b18189ef3a92 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -345,7 +345,7 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
 	if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
 		return false;
 
-	if (kasan_check_invalid_free(tagged_object)) {
+	if (!kasan_byte_accessible(tagged_object)) {
 		kasan_report_invalid_free(tagged_object, ip);
 		return true;
 	}
@@ -490,3 +490,12 @@ void __kasan_kfree_large(void *ptr, unsigned long ip)
 		kasan_report_invalid_free(ptr, ip);
 	/* The object will be poisoned by kasan_free_pages(). */
 }
+
+bool __kasan_check_byte(const void *address, unsigned long ip)
+{
+	if (!kasan_byte_accessible(address)) {
+		kasan_report((unsigned long)address, 1, false, ip);
+		return false;
+	}
+	return true;
+}
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index acab8862dc67..3f17a1218055 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -185,11 +185,11 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write,
 	return check_region_inline(addr, size, write, ret_ip);
 }
 
-bool kasan_check_invalid_free(void *addr)
+bool kasan_byte_accessible(const void *addr)
 {
 	s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
 
-	return shadow_byte < 0 || shadow_byte >= KASAN_GRANULE_SIZE;
+	return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE;
 }
 
 void kasan_cache_shrink(struct kmem_cache *cache)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 1298b79f9518..cc14b6e6c14c 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -341,20 +341,20 @@ static inline void kasan_unpoison(const void *address, size_t size)
 			round_up(size, KASAN_GRANULE_SIZE), get_tag(address));
 }
 
-static inline bool kasan_check_invalid_free(void *addr)
+static inline bool kasan_byte_accessible(const void *addr)
 {
 	u8 ptr_tag = get_tag(addr);
-	u8 mem_tag = hw_get_mem_tag(addr);
+	u8 mem_tag = hw_get_mem_tag((void *)addr);
 
-	return (mem_tag == KASAN_TAG_INVALID) ||
-		(ptr_tag != KASAN_TAG_KERNEL && ptr_tag != mem_tag);
+	return (mem_tag != KASAN_TAG_INVALID) &&
+		(ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag);
 }
 
 #else /* CONFIG_KASAN_HW_TAGS */
 
 void kasan_poison(const void *address, size_t size, u8 value);
 void kasan_unpoison(const void *address, size_t size);
-bool kasan_check_invalid_free(void *addr);
+bool kasan_byte_accessible(const void *addr);
 
 #endif /* CONFIG_KASAN_HW_TAGS */
 
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index cc271fceb5d5..94c2d33be333 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -118,13 +118,13 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write,
 	return true;
 }
 
-bool kasan_check_invalid_free(void *addr)
+bool kasan_byte_accessible(const void *addr)
 {
 	u8 tag = get_tag(addr);
 	u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr)));
 
-	return (shadow_byte == KASAN_TAG_INVALID) ||
-		(tag != KASAN_TAG_KERNEL && tag != shadow_byte);
+	return (shadow_byte != KASAN_TAG_INVALID) &&
+		(tag == KASAN_TAG_KERNEL || tag == shadow_byte);
 }
 
 #define DEFINE_HWASAN_LOAD_STORE(size)					\
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5be7825ad3ce..7c8298c17145 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1218,19 +1218,21 @@ size_t ksize(const void *objp)
 	size_t size;
 
 	/*
-	 * We need to check that the pointed to object is valid, and only then
-	 * unpoison the shadow memory below. We use __kasan_check_read(), to
-	 * generate a more useful report at the time ksize() is called (rather
-	 * than later where behaviour is undefined due to potential
-	 * use-after-free or double-free).
+	 * We need to first check that the pointer to the object is valid, and
+	 * only then unpoison the memory. The report printed from ksize() is
+	 * more useful, then when it's printed later when the behaviour could
+	 * be undefined due to a potential use-after-free or double-free.
 	 *
-	 * If the pointed to memory is invalid we return 0, to avoid users of
+	 * We use kasan_check_byte(), which is supported for the hardware
+	 * tag-based KASAN mode, unlike kasan_check_read/write().
+	 *
+	 * If the pointed to memory is invalid, we return 0 to avoid users of
 	 * ksize() writing to and potentially corrupting the memory region.
 	 *
 	 * We want to perform the check before __ksize(), to avoid potentially
 	 * crashing in __ksize() due to accessing invalid metadata.
 	 */
-	if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1))
+	if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
 		return 0;
 
 	size = __ksize(objp);
-- 
cgit v1.2.3


From 93f503c3fcd168a43e4a6c875fe2cfafaf8439dc Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Feb 2021 12:06:10 -0800
Subject: mm: fix prototype warning from kernel test robot

Patch series "mm: clean up names and parameters of memmap_init_xxxx functions", v5.

This patchset corrects inappropriate function names of memmap_init_xxx,
and simplify parameters of functions in the code flow.  And also fix a
prototype warning reported by lkp.

This patch (of 5);

Kernel test robot calling make with 'W=1' is triggering warning like
below for memmap_init_zone() function.

  mm/page_alloc.c:6259:23: warning: no previous prototype for 'memmap_init_zone' [-Wmissing-prototypes]
   6259 | void __meminit __weak memmap_init_zone(unsigned long size, int nid,
        |                       ^~~~~~~~~~~~~~~~

Fix it by adding the function declaration in include/linux/mm.h.  Since
memmap_init_zone() has a generic version with '__weak', the declaratoin in
ia64 header file can be simply removed.

Link: https://lkml.kernel.org/r/20210122135956.5946-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20210122135956.5946-2-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/include/asm/pgtable.h | 6 ------
 include/linux/mm.h              | 2 ++
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 779b6972aa84..9b4efe89e62d 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -517,12 +517,6 @@ extern struct page *zero_page_memmap_ptr;
 	__changed;							\
 })
 #endif
-
-#  ifdef CONFIG_VIRTUAL_MEM_MAP
-  /* arch mem_map init routine is needed due to holes in a virtual mem_map */
-    extern void memmap_init (unsigned long size, int nid, unsigned long zone,
-			     unsigned long start_pfn);
-#  endif /* CONFIG_VIRTUAL_MEM_MAP */
 # endif /* !__ASSEMBLY__ */
 
 /*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7deb7168104d..1c1eabdf112c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2408,6 +2408,8 @@ extern void set_dma_reserve(unsigned long new_dma_reserve);
 extern void memmap_init_zone(unsigned long, int, unsigned long,
 		unsigned long, unsigned long, enum meminit_context,
 		struct vmem_altmap *, int migratetype);
+extern void memmap_init(unsigned long size, int nid,
+		unsigned long zone, unsigned long range_start_pfn);
 extern void setup_per_zone_wmarks(void);
 extern int __meminit init_per_zone_wmark_min(void);
 extern void mem_init(void);
-- 
cgit v1.2.3


From ab28cb6e1e5e59eb8bf3ad399133617414301d3a Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Feb 2021 12:06:14 -0800
Subject: mm: rename memmap_init() and memmap_init_zone()

The current memmap_init_zone() only handles memory region inside one zone,
actually memmap_init() does the memmap init of one zone.  So rename both
of them accordingly.

Link: https://lkml.kernel.org/r/20210122135956.5946-3-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/init.c | 6 +++---
 include/linux/mm.h  | 4 ++--
 mm/memory_hotplug.c | 2 +-
 mm/page_alloc.c     | 8 ++++----
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index b19f47a5a305..39b782f62d7d 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -536,18 +536,18 @@ virtual_memmap_init(u64 start, u64 end, void *arg)
 		    / sizeof(struct page));
 
 	if (map_start < map_end)
-		memmap_init_zone((unsigned long)(map_end - map_start),
+		memmap_init_range((unsigned long)(map_end - map_start),
 				 args->nid, args->zone, page_to_pfn(map_start), page_to_pfn(map_end),
 				 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
 	return 0;
 }
 
 void __meminit
-memmap_init (unsigned long size, int nid, unsigned long zone,
+memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	     unsigned long start_pfn)
 {
 	if (!vmem_map) {
-		memmap_init_zone(size, nid, zone, start_pfn, start_pfn + size,
+		memmap_init_range(size, nid, zone, start_pfn, start_pfn + size,
 				 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
 	} else {
 		struct page *start;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1c1eabdf112c..56d0eeae0ce3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2405,10 +2405,10 @@ extern int __meminit early_pfn_to_nid(unsigned long pfn);
 #endif
 
 extern void set_dma_reserve(unsigned long new_dma_reserve);
-extern void memmap_init_zone(unsigned long, int, unsigned long,
+extern void memmap_init_range(unsigned long, int, unsigned long,
 		unsigned long, unsigned long, enum meminit_context,
 		struct vmem_altmap *, int migratetype);
-extern void memmap_init(unsigned long size, int nid,
+extern void memmap_init_zone(unsigned long size, int nid,
 		unsigned long zone, unsigned long range_start_pfn);
 extern void setup_per_zone_wmarks(void);
 extern int __meminit init_per_zone_wmark_min(void);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f9d57b9be8c7..ddcb1cd24c60 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -713,7 +713,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 	 * expects the zone spans the pfn range. All the pages in the range
 	 * are reserved so nobody should be touching them so we should be safe
 	 */
-	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0,
+	memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
 			 MEMINIT_HOTPLUG, altmap, migratetype);
 
 	set_zone_contiguous(zone);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 069561aadc7b..519cf52963eb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6121,7 +6121,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
  * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
  * zone stats (e.g., nr_isolate_pageblock) are touched.
  */
-void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn, unsigned long zone_end_pfn,
 		enum meminit_context context,
 		struct vmem_altmap *altmap, int migratetype)
@@ -6258,7 +6258,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	}
 }
 
-void __meminit __weak memmap_init(unsigned long size, int nid,
+void __meminit __weak memmap_init_zone(unsigned long size, int nid,
 				  unsigned long zone,
 				  unsigned long range_start_pfn)
 {
@@ -6272,7 +6272,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid,
 
 		if (end_pfn > start_pfn) {
 			size = end_pfn - start_pfn;
-			memmap_init_zone(size, nid, zone, start_pfn, range_end_pfn,
+			memmap_init_range(size, nid, zone, start_pfn, range_end_pfn,
 					 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
 		}
 	}
@@ -6982,7 +6982,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
 		set_pageblock_order();
 		setup_usemap(pgdat, zone, zone_start_pfn, size);
 		init_currently_empty_zone(zone, zone_start_pfn, size);
-		memmap_init(size, nid, j, zone_start_pfn);
+		memmap_init_zone(size, nid, j, zone_start_pfn);
 	}
 }
 
-- 
cgit v1.2.3


From 3256ff83c566235e812498ee1dc806c45a5d5af7 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Feb 2021 12:06:17 -0800
Subject: mm: simplify parater of function memmap_init_zone()

As David suggested, simply passing 'struct zone *zone' is enough.  We can
get all needed information from 'struct zone*' easily.

Link: https://lkml.kernel.org/r/20210122135956.5946-4-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/init.c | 12 +++++++-----
 include/linux/mm.h  |  3 +--
 mm/page_alloc.c     | 24 +++++++++++-------------
 3 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 39b782f62d7d..16d0d7d22657 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -542,12 +542,14 @@ virtual_memmap_init(u64 start, u64 end, void *arg)
 	return 0;
 }
 
-void __meminit
-memmap_init_zone(unsigned long size, int nid, unsigned long zone,
-	     unsigned long start_pfn)
+void __meminit memmap_init_zone(struct zone *zone)
 {
+	int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
+	unsigned long start_pfn = zone->zone_start_pfn;
+	unsigned long size = zone->spanned_pages;
+
 	if (!vmem_map) {
-		memmap_init_range(size, nid, zone, start_pfn, start_pfn + size,
+		memmap_init_range(size, nid, zone_id, start_pfn, start_pfn + size,
 				 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
 	} else {
 		struct page *start;
@@ -557,7 +559,7 @@ memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		args.start = start;
 		args.end = start + size;
 		args.nid = nid;
-		args.zone = zone;
+		args.zone = zone_id;
 
 		efi_memmap_walk(virtual_memmap_init, &args);
 	}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 56d0eeae0ce3..2601a5cce0ef 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2408,8 +2408,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve);
 extern void memmap_init_range(unsigned long, int, unsigned long,
 		unsigned long, unsigned long, enum meminit_context,
 		struct vmem_altmap *, int migratetype);
-extern void memmap_init_zone(unsigned long size, int nid,
-		unsigned long zone, unsigned long range_start_pfn);
+extern void memmap_init_zone(struct zone *zone);
 extern void setup_per_zone_wmarks(void);
 extern int __meminit init_per_zone_wmark_min(void);
 extern void mem_init(void);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 519cf52963eb..aa04c54ad47c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6258,23 +6258,21 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	}
 }
 
-void __meminit __weak memmap_init_zone(unsigned long size, int nid,
-				  unsigned long zone,
-				  unsigned long range_start_pfn)
+void __meminit __weak memmap_init_zone(struct zone *zone)
 {
+	unsigned long zone_start_pfn = zone->zone_start_pfn;
+	unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+	int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone);
 	unsigned long start_pfn, end_pfn;
-	unsigned long range_end_pfn = range_start_pfn + size;
-	int i;
 
 	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
-		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
-		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+		start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
+		end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
 
-		if (end_pfn > start_pfn) {
-			size = end_pfn - start_pfn;
-			memmap_init_range(size, nid, zone, start_pfn, range_end_pfn,
-					 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
-		}
+		if (end_pfn > start_pfn)
+			memmap_init_range(end_pfn - start_pfn, nid,
+					zone_id, start_pfn, zone_end_pfn,
+					MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
 	}
 }
 
@@ -6982,7 +6980,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
 		set_pageblock_order();
 		setup_usemap(pgdat, zone, zone_start_pfn, size);
 		init_currently_empty_zone(zone, zone_start_pfn, size);
-		memmap_init_zone(size, nid, j, zone_start_pfn);
+		memmap_init_zone(zone);
 	}
 }
 
-- 
cgit v1.2.3


From a0cd7a7c4bc004587d1f4785a320f58e72d880eb Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 24 Feb 2021 12:06:32 -0800
Subject: mm: simplify free_highmem_page() and free_reserved_page()

adjust_managed_page_count() as called by free_reserved_page() properly
handles pages in a highmem zone, so we can reuse it for
free_highmem_page().

We can now get rid of totalhigh_pages_inc() and simplify
free_reserved_page().

Link: https://lkml.kernel.org/r/20210126182113.19892-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/highmem-internal.h |  5 -----
 include/linux/mm.h               | 16 ++--------------
 mm/page_alloc.c                  | 11 -----------
 3 files changed, 2 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index 1bbe96dc8be6..7902c7d8b55f 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -127,11 +127,6 @@ static inline unsigned long totalhigh_pages(void)
 	return (unsigned long)atomic_long_read(&_totalhigh_pages);
 }
 
-static inline void totalhigh_pages_inc(void)
-{
-	atomic_long_inc(&_totalhigh_pages);
-}
-
 static inline void totalhigh_pages_add(long count)
 {
 	atomic_long_add(count, &_totalhigh_pages);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2601a5cce0ef..76cab132c295 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2310,32 +2310,20 @@ extern void free_initmem(void);
 extern unsigned long free_reserved_area(void *start, void *end,
 					int poison, const char *s);
 
-#ifdef	CONFIG_HIGHMEM
-/*
- * Free a highmem page into the buddy system, adjusting totalhigh_pages
- * and totalram_pages.
- */
-extern void free_highmem_page(struct page *page);
-#endif
-
 extern void adjust_managed_page_count(struct page *page, long count);
 extern void mem_init_print_info(const char *str);
 
 extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
 
 /* Free the reserved page into the buddy system, so it gets managed. */
-static inline void __free_reserved_page(struct page *page)
+static inline void free_reserved_page(struct page *page)
 {
 	ClearPageReserved(page);
 	init_page_count(page);
 	__free_page(page);
-}
-
-static inline void free_reserved_page(struct page *page)
-{
-	__free_reserved_page(page);
 	adjust_managed_page_count(page, 1);
 }
+#define free_highmem_page(page) free_reserved_page(page)
 
 static inline void mark_page_reserved(struct page *page)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 35e230cadb94..ddccc59f2f72 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7691,17 +7691,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
 	return pages;
 }
 
-#ifdef	CONFIG_HIGHMEM
-void free_highmem_page(struct page *page)
-{
-	__free_reserved_page(page);
-	totalram_pages_inc();
-	atomic_long_inc(&page_zone(page)->managed_pages);
-	totalhigh_pages_inc();
-}
-#endif
-
-
 void __init mem_init_print_info(const char *str)
 {
 	unsigned long physpages, codesize, datasize, rosize, bss_size;
-- 
cgit v1.2.3


From 3b2ebeaf98a028d5dd4ec63095855ef507920276 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 24 Feb 2021 12:06:35 -0800
Subject: mm/gfp: add kernel-doc for gfp_t

The generated html will link to the definition of the gfp_t automatically
once we define it.  Move the one-paragraph overview of GFP flags from the
documentation directory into gfp.h and pull gfp.h into the documentation.

This generates warnings with clang
(https://lkml.kernel.org/r/20210219195509.GA59987@24bbad8f3778), so
use a #if 0 to hide it from the compiler for now.

Link: https://lkml.kernel.org/r/20210215204909.3824509-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20210220003049.GZ2858050@casper.infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/core-api/mm-api.rst |  7 ++-----
 include/linux/gfp.h               | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst
index 2adffb3f7914..201b5423303b 100644
--- a/Documentation/core-api/mm-api.rst
+++ b/Documentation/core-api/mm-api.rst
@@ -19,11 +19,8 @@ User Space Memory Access
 Memory Allocation Controls
 ==========================
 
-Functions which need to allocate memory often use GFP flags to express
-how that memory should be allocated. The GFP acronym stands for "get
-free pages", the underlying memory allocation function. Not every GFP
-flag is allowed to every function which may allocate memory. Most
-users will want to use a plain ``GFP_KERNEL``.
+.. kernel-doc:: include/linux/gfp.h
+   :internal:
 
 .. kernel-doc:: include/linux/gfp.h
    :doc: Page mobility and placement hints
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 80544d5c08e7..220cd553a9e7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -8,6 +8,20 @@
 #include <linux/linkage.h>
 #include <linux/topology.h>
 
+/* The typedef is in types.h but we want the documentation here */
+#if 0
+/**
+ * typedef gfp_t - Memory allocation flags.
+ *
+ * GFP flags are commonly used throughout Linux to indicate how memory
+ * should be allocated.  The GFP acronym stands for get_free_pages(),
+ * the underlying memory allocation function.  Not every GFP flag is
+ * supported by every function which may allocate memory.  Most users
+ * will want to use a plain ``GFP_KERNEL``.
+ */
+typedef unsigned int __bitwise gfp_t;
+#endif
+
 struct vm_area_struct;
 
 /*
-- 
cgit v1.2.3


From 0fa5bc4023c188082024833b3deffd5543b93bc9 Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Wed, 24 Feb 2021 12:07:12 -0800
Subject: mm/hugetlb: grab head page refcount once for group of subpages

Patch series "mm/hugetlb: follow_hugetlb_page() improvements", v2.

While looking at ZONE_DEVICE struct page reuse particularly the last
patch[0], I found two possible improvements for follow_hugetlb_page()
which is solely used for get_user_pages()/pin_user_pages().

The first patch batches page refcount updates while the second tidies up
storing the subpages/vmas.  Both together bring the cost of slow variant
of gup() cost from ~87.6k usecs to ~5.8k usecs.

libhugetlbfs tests seem to pass as well gup_test benchmarks with hugetlbfs
vmas.

This patch (of 2):

follow_hugetlb_page() once it locks the pmd/pud, checks all its N subpages
in a huge page and grabs a reference for each one.  Similar to gup-fast,
have follow_hugetlb_page() grab the head page refcount only after counting
all its subpages that are part of the just faulted huge page.

Consequently we reduce the number of atomics necessary to pin said huge
page, which improves non-fast gup() considerably:

  - 16G with 1G huge page size
  gup_test -f /mnt/huge/file -m 16384 -r 10 -L -S -n 512 -w

PIN_LONGTERM_BENCHMARK: ~87.6k us -> ~12.8k us

Link: https://lkml.kernel.org/r/20210128182632.24562-1-joao.m.martins@oracle.com
Link: https://lkml.kernel.org/r/20210128182632.24562-2-joao.m.martins@oracle.com
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  3 +++
 mm/gup.c           |  5 ++---
 mm/hugetlb.c       | 43 ++++++++++++++++++++++++-------------------
 3 files changed, 29 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 76cab132c295..77e64e3eac80 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1187,6 +1187,9 @@ static inline void get_page(struct page *page)
 }
 
 bool __must_check try_grab_page(struct page *page, unsigned int flags);
+__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs,
+						   unsigned int flags);
+
 
 static inline __must_check bool try_get_page(struct page *page)
 {
diff --git a/mm/gup.c b/mm/gup.c
index e4c224cd9661..e40579624f10 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -78,9 +78,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
  * considered failure, and furthermore, a likely bug in the caller, so a warning
  * is also emitted.
  */
-static __maybe_unused struct page *try_grab_compound_head(struct page *page,
-							  int refs,
-							  unsigned int flags)
+__maybe_unused struct page *try_grab_compound_head(struct page *page,
+						   int refs, unsigned int flags)
 {
 	if (flags & FOLL_GET)
 		return try_get_compound_head(page, refs);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6920c71d7e0d..d4fc5db6bc32 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4796,7 +4796,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vaddr = *position;
 	unsigned long remainder = *nr_pages;
 	struct hstate *h = hstate_vma(vma);
-	int err = -EFAULT;
+	int err = -EFAULT, refs;
 
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
@@ -4916,26 +4916,11 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			continue;
 		}
 
+		refs = 0;
+
 same_page:
-		if (pages) {
+		if (pages)
 			pages[i] = mem_map_offset(page, pfn_offset);
-			/*
-			 * try_grab_page() should always succeed here, because:
-			 * a) we hold the ptl lock, and b) we've just checked
-			 * that the huge page is present in the page tables. If
-			 * the huge page is present, then the tail pages must
-			 * also be present. The ptl prevents the head page and
-			 * tail pages from being rearranged in any way. So this
-			 * page must be available at this point, unless the page
-			 * refcount overflowed:
-			 */
-			if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
-				spin_unlock(ptl);
-				remainder = 0;
-				err = -ENOMEM;
-				break;
-			}
-		}
 
 		if (vmas)
 			vmas[i] = vma;
@@ -4944,6 +4929,7 @@ same_page:
 		++pfn_offset;
 		--remainder;
 		++i;
+		++refs;
 		if (vaddr < vma->vm_end && remainder &&
 				pfn_offset < pages_per_huge_page(h)) {
 			/*
@@ -4951,6 +4937,25 @@ same_page:
 			 * of this compound page.
 			 */
 			goto same_page;
+		} else if (pages) {
+			/*
+			 * try_grab_compound_head() should always succeed here,
+			 * because: a) we hold the ptl lock, and b) we've just
+			 * checked that the huge page is present in the page
+			 * tables. If the huge page is present, then the tail
+			 * pages must also be present. The ptl prevents the
+			 * head page and tail pages from being rearranged in
+			 * any way. So this page must be available at this
+			 * point, unless the page refcount overflowed:
+			 */
+			if (WARN_ON_ONCE(!try_grab_compound_head(pages[i-1],
+								 refs,
+								 flags))) {
+				spin_unlock(ptl);
+				remainder = 0;
+				err = -ENOMEM;
+				break;
+			}
 		}
 		spin_unlock(ptl);
 	}
-- 
cgit v1.2.3


From 6c26d3108393211ecfd44d89404cfb744027bafd Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Wed, 24 Feb 2021 12:07:19 -0800
Subject: mm/hugetlb: fix some comment typos

Fix typos sasitfy to satisfy, reservtion to reservation, hugegpage to
hugepage and uniprocesor to uniprocessor in comments.

Link: https://lkml.kernel.org/r/20210128112028.64831-1-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Souptick Joarder <jrdr.linux@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 2 +-
 mm/hugetlb.c            | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b5807f23caf8..ac3cb239a7ec 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -37,7 +37,7 @@ struct hugepage_subpool {
 	struct hstate *hstate;
 	long min_hpages;	/* Minimum huge pages or -1 if no minimum. */
 	long rsv_hpages;	/* Pages reserved against global pool to */
-				/* sasitfy minimum size. */
+				/* satisfy minimum size. */
 };
 
 struct resv_map {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cf6653879bb4..fc895e25920c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1434,7 +1434,7 @@ static void __free_huge_page(struct page *page)
 	 * reservation.  If the page was associated with a subpool, there
 	 * would have been a page reserved in the subpool before allocation
 	 * via hugepage_subpool_get_pages().  Since we are 'restoring' the
-	 * reservtion, do not call hugepage_subpool_put_pages() as this will
+	 * reservation, do not call hugepage_subpool_put_pages() as this will
 	 * remove the reserved page from the subpool.
 	 */
 	if (!restore_reserve) {
@@ -3707,7 +3707,7 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
+ * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
  * this far.
  */
 static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
@@ -4491,7 +4491,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
 }
 #else
 /*
- * For uniprocesor systems we always use a single mutex, so just
+ * For uniprocessor systems we always use a single mutex, so just
  * return 0 and avoid the hashing overhead.
  */
 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
-- 
cgit v1.2.3


From bae84953815793f68ddd8edeadd3f4e32676a2c8 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Wed, 24 Feb 2021 12:07:32 -0800
Subject: mm/pmem: avoid inserting hugepage PTE entry with fsdax if hugepage
 support is disabled

Differentiate between hardware not supporting hugepages and user disabling
THP via 'echo never > /sys/kernel/mm/transparent_hugepage/enabled'

For the devdax namespace, the kernel handles the above via the
supported_alignment attribute and failing to initialize the namespace if
the namespace align value is not supported on the platform.

For the fsdax namespace, the kernel will continue to initialize the
namespace.  This can result in the kernel creating a huge pte entry even
though the hardware don't support the same.

We do want hugepage support with pmem even if the end-user disabled THP
via sysfs file (/sys/kernel/mm/transparent_hugepage/enabled).  Hence
differentiate between hardware/firmware lacking support vs user-controlled
disable of THP and prevent a huge fault if the hardware lacks hugepage
support.

Link: https://lkml.kernel.org/r/20210205023956.417587-1-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 15 +++++++++------
 mm/huge_memory.c        |  6 +++++-
 2 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 6a19f35f836b..ba973efcd369 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -78,6 +78,7 @@ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn,
 }
 
 enum transparent_hugepage_flag {
+	TRANSPARENT_HUGEPAGE_NEVER_DAX,
 	TRANSPARENT_HUGEPAGE_FLAG,
 	TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
@@ -123,6 +124,13 @@ extern unsigned long transparent_hugepage_flags;
  */
 static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 {
+
+	/*
+	 * If the hardware/firmware marked hugepage support disabled.
+	 */
+	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
+		return false;
+
 	if (vma->vm_flags & VM_NOHUGEPAGE)
 		return false;
 
@@ -134,12 +142,7 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 
 	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
 		return true;
-	/*
-	 * For dax vmas, try to always use hugepage mappings. If the kernel does
-	 * not support hugepages, fsdax mappings will fallback to PAGE_SIZE
-	 * mappings, and device-dax namespaces, that try to guarantee a given
-	 * mapping size, will fail to enable
-	 */
+
 	if (vma_is_dax(vma))
 		return true;
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4cdcc4da36e8..d77605c30f2e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -386,7 +386,11 @@ static int __init hugepage_init(void)
 	struct kobject *hugepage_kobj;
 
 	if (!has_transparent_hugepage()) {
-		transparent_hugepage_flags = 0;
+		/*
+		 * Hardware doesn't support hugepages, hence disable
+		 * DAX PMD support.
+		 */
+		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From c2135f7c570bc274035834848d9bf46ea89ba763 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@linux.alibaba.com>
Date: Wed, 24 Feb 2021 12:08:01 -0800
Subject: mm/vmscan: __isolate_lru_page_prepare() cleanup

The function just returns 2 results, so using a 'switch' to deal with its
result is unnecessary.  Also simplify it to a bool func as Vlastimil
suggested.

Also remove 'goto' by reusing list_move(), and take Matthew Wilcox's
suggestion to update comments in function.

Link: https://lkml.kernel.org/r/728874d7-2d93-4049-68c1-dcc3b2d52ccd@linux.alibaba.com
Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  2 +-
 mm/compaction.c      |  2 +-
 mm/vmscan.c          | 68 ++++++++++++++++++++++++----------------------------
 3 files changed, 33 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0d64a2bc7e00..32f665b1ee85 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -356,7 +356,7 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page,
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
-extern int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode);
+extern bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
 						  gfp_t gfp_mask,
diff --git a/mm/compaction.c b/mm/compaction.c
index 190ccdaa6c19..8f52532675a9 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -988,7 +988,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (unlikely(!get_page_unless_zero(page)))
 			goto isolate_fail;
 
-		if (__isolate_lru_page_prepare(page, isolate_mode) != 0)
+		if (!__isolate_lru_page_prepare(page, isolate_mode))
 			goto isolate_fail_put;
 
 		/* Try isolate the page */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1b574ad199d..04509994aed4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1539,19 +1539,17 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
  * page:	page to consider
  * mode:	one of the LRU isolation modes defined above
  *
- * returns 0 on success, -ve errno on failure.
+ * returns true on success, false on failure.
  */
-int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
+bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
 {
-	int ret = -EBUSY;
-
 	/* Only take pages on the LRU. */
 	if (!PageLRU(page))
-		return ret;
+		return false;
 
 	/* Compaction should not handle unevictable pages but CMA can do so */
 	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
-		return ret;
+		return false;
 
 	/*
 	 * To minimise LRU disruption, the caller can indicate that it only
@@ -1564,7 +1562,7 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
 	if (mode & ISOLATE_ASYNC_MIGRATE) {
 		/* All the caller can do on PageWriteback is block */
 		if (PageWriteback(page))
-			return ret;
+			return false;
 
 		if (PageDirty(page)) {
 			struct address_space *mapping;
@@ -1580,20 +1578,20 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
 			 * from the page cache.
 			 */
 			if (!trylock_page(page))
-				return ret;
+				return false;
 
 			mapping = page_mapping(page);
 			migrate_dirty = !mapping || mapping->a_ops->migratepage;
 			unlock_page(page);
 			if (!migrate_dirty)
-				return ret;
+				return false;
 		}
 	}
 
 	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
-		return ret;
+		return false;
 
-	return 0;
+	return true;
 }
 
 /*
@@ -1677,35 +1675,31 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		 * only when the page is being freed somewhere else.
 		 */
 		scan += nr_pages;
-		switch (__isolate_lru_page_prepare(page, mode)) {
-		case 0:
-			/*
-			 * Be careful not to clear PageLRU until after we're
-			 * sure the page is not being freed elsewhere -- the
-			 * page release code relies on it.
-			 */
-			if (unlikely(!get_page_unless_zero(page)))
-				goto busy;
-
-			if (!TestClearPageLRU(page)) {
-				/*
-				 * This page may in other isolation path,
-				 * but we still hold lru_lock.
-				 */
-				put_page(page);
-				goto busy;
-			}
-
-			nr_taken += nr_pages;
-			nr_zone_taken[page_zonenum(page)] += nr_pages;
-			list_move(&page->lru, dst);
-			break;
+		if (!__isolate_lru_page_prepare(page, mode)) {
+			/* It is being freed elsewhere */
+			list_move(&page->lru, src);
+			continue;
+		}
+		/*
+		 * Be careful not to clear PageLRU until after we're
+		 * sure the page is not being freed elsewhere -- the
+		 * page release code relies on it.
+		 */
+		if (unlikely(!get_page_unless_zero(page))) {
+			list_move(&page->lru, src);
+			continue;
+		}
 
-		default:
-busy:
-			/* else it is being freed elsewhere */
+		if (!TestClearPageLRU(page)) {
+			/* Another thread is already isolating this page */
+			put_page(page);
 			list_move(&page->lru, src);
+			continue;
 		}
+
+		nr_taken += nr_pages;
+		nr_zone_taken[page_zonenum(page)] += nr_pages;
+		list_move(&page->lru, dst);
 	}
 
 	/*
-- 
cgit v1.2.3


From f90d8191ac864df33b1898bc7edc54eaa24e22bc Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:13 -0800
Subject: include/linux/mm_inline.h: shuffle lru list addition and deletion
 functions

These functions will call page_lru() in the following patches.  Move them
below page_lru() to avoid the forward declaration.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-3-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-3-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8fc71e9d7bb0..2889741f450a 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -45,27 +45,6 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
 #endif
 }
 
-static __always_inline void add_page_to_lru_list(struct page *page,
-				struct lruvec *lruvec, enum lru_list lru)
-{
-	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
-	list_add(&page->lru, &lruvec->lists[lru]);
-}
-
-static __always_inline void add_page_to_lru_list_tail(struct page *page,
-				struct lruvec *lruvec, enum lru_list lru)
-{
-	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
-	list_add_tail(&page->lru, &lruvec->lists[lru]);
-}
-
-static __always_inline void del_page_from_lru_list(struct page *page,
-				struct lruvec *lruvec, enum lru_list lru)
-{
-	list_del(&page->lru);
-	update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page));
-}
-
 /**
  * page_lru_base_type - which LRU list type should a page be on?
  * @page: the page to test
@@ -125,4 +104,25 @@ static __always_inline enum lru_list page_lru(struct page *page)
 	}
 	return lru;
 }
+
+static __always_inline void add_page_to_lru_list(struct page *page,
+				struct lruvec *lruvec, enum lru_list lru)
+{
+	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
+	list_add(&page->lru, &lruvec->lists[lru]);
+}
+
+static __always_inline void add_page_to_lru_list_tail(struct page *page,
+				struct lruvec *lruvec, enum lru_list lru)
+{
+	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
+	list_add_tail(&page->lru, &lruvec->lists[lru]);
+}
+
+static __always_inline void del_page_from_lru_list(struct page *page,
+				struct lruvec *lruvec, enum lru_list lru)
+{
+	list_del(&page->lru);
+	update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page));
+}
 #endif
-- 
cgit v1.2.3


From 3a9c9788a3149d9745b7eb2eae811e57ef3b127c Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:17 -0800
Subject: mm: don't pass "enum lru_list" to lru list addition functions

The "enum lru_list" parameter to add_page_to_lru_list() and
add_page_to_lru_list_tail() is redundant in the sense that it can
be extracted from the "struct page" parameter by page_lru().

A caveat is that we need to make sure PageActive() or
PageUnevictable() is correctly set or cleared before calling
these two functions. And they are indeed.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-4-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-4-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h |  8 ++++++--
 mm/swap.c                 | 15 +++++++--------
 mm/vmscan.c               |  6 ++----
 3 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 2889741f450a..130ba3201d3f 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -106,15 +106,19 @@ static __always_inline enum lru_list page_lru(struct page *page)
 }
 
 static __always_inline void add_page_to_lru_list(struct page *page,
-				struct lruvec *lruvec, enum lru_list lru)
+				struct lruvec *lruvec)
 {
+	enum lru_list lru = page_lru(page);
+
 	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
 	list_add(&page->lru, &lruvec->lists[lru]);
 }
 
 static __always_inline void add_page_to_lru_list_tail(struct page *page,
-				struct lruvec *lruvec, enum lru_list lru)
+				struct lruvec *lruvec)
 {
+	enum lru_list lru = page_lru(page);
+
 	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
 	list_add_tail(&page->lru, &lruvec->lists[lru]);
 }
diff --git a/mm/swap.c b/mm/swap.c
index 2cca7141470c..22fad34b9c18 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -231,7 +231,7 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
 	if (!PageUnevictable(page)) {
 		del_page_from_lru_list(page, lruvec, page_lru(page));
 		ClearPageActive(page);
-		add_page_to_lru_list_tail(page, lruvec, page_lru(page));
+		add_page_to_lru_list_tail(page, lruvec);
 		__count_vm_events(PGROTATED, thp_nr_pages(page));
 	}
 }
@@ -313,8 +313,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec)
 
 		del_page_from_lru_list(page, lruvec, lru);
 		SetPageActive(page);
-		lru += LRU_ACTIVE;
-		add_page_to_lru_list(page, lruvec, lru);
+		add_page_to_lru_list(page, lruvec);
 		trace_mm_lru_activate(page);
 
 		__count_vm_events(PGACTIVATE, nr_pages);
@@ -543,14 +542,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 		 * It can make readahead confusing.  But race window
 		 * is _really_ small and  it's non-critical problem.
 		 */
-		add_page_to_lru_list(page, lruvec, lru);
+		add_page_to_lru_list(page, lruvec);
 		SetPageReclaim(page);
 	} else {
 		/*
 		 * The page's writeback ends up during pagevec
 		 * We moves tha page into tail of inactive.
 		 */
-		add_page_to_lru_list_tail(page, lruvec, lru);
+		add_page_to_lru_list_tail(page, lruvec);
 		__count_vm_events(PGROTATED, nr_pages);
 	}
 
@@ -570,7 +569,7 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
 		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
 		ClearPageActive(page);
 		ClearPageReferenced(page);
-		add_page_to_lru_list(page, lruvec, lru);
+		add_page_to_lru_list(page, lruvec);
 
 		__count_vm_events(PGDEACTIVATE, nr_pages);
 		__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
@@ -595,7 +594,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
 		 * anonymous pages
 		 */
 		ClearPageSwapBacked(page);
-		add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
+		add_page_to_lru_list(page, lruvec);
 
 		__count_vm_events(PGLAZYFREE, nr_pages);
 		__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
@@ -1005,7 +1004,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
 			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
 	}
 
-	add_page_to_lru_list(page, lruvec, lru);
+	add_page_to_lru_list(page, lruvec);
 	trace_mm_lru_insertion(page, lru);
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 19875660e8f8..09e4f97488c9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1867,7 +1867,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 		 * inhibits memcg migration).
 		 */
 		VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
-		add_page_to_lru_list(page, lruvec, page_lru(page));
+		add_page_to_lru_list(page, lruvec);
 		nr_pages = thp_nr_pages(page);
 		nr_moved += nr_pages;
 		if (PageActive(page))
@@ -4282,12 +4282,10 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 
 		lruvec = relock_page_lruvec_irq(page, lruvec);
 		if (page_evictable(page) && PageUnevictable(page)) {
-			enum lru_list lru = page_lru_base_type(page);
-
 			VM_BUG_ON_PAGE(PageActive(page), page);
 			ClearPageUnevictable(page);
 			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
-			add_page_to_lru_list(page, lruvec, lru);
+			add_page_to_lru_list(page, lruvec);
 			pgrescued += nr_pages;
 		}
 		SetPageLRU(page);
-- 
cgit v1.2.3


From 861404536a3af3c39f1b10959a40def3d8efa2dd Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:21 -0800
Subject: mm/swap.c: don't pass "enum lru_list" to trace_mm_lru_insertion()

The parameter is redundant in the sense that it can be extracted
from the "struct page" parameter by page_lru() correctly.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-5-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-5-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/pagemap.h | 11 ++++-------
 mm/swap.c                      |  5 +----
 2 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h
index 8fd1babae761..e1735fe7c76a 100644
--- a/include/trace/events/pagemap.h
+++ b/include/trace/events/pagemap.h
@@ -27,24 +27,21 @@
 
 TRACE_EVENT(mm_lru_insertion,
 
-	TP_PROTO(
-		struct page *page,
-		int lru
-	),
+	TP_PROTO(struct page *page),
 
-	TP_ARGS(page, lru),
+	TP_ARGS(page),
 
 	TP_STRUCT__entry(
 		__field(struct page *,	page	)
 		__field(unsigned long,	pfn	)
-		__field(int,		lru	)
+		__field(enum lru_list,	lru	)
 		__field(unsigned long,	flags	)
 	),
 
 	TP_fast_assign(
 		__entry->page	= page;
 		__entry->pfn	= page_to_pfn(page);
-		__entry->lru	= lru;
+		__entry->lru	= page_lru(page);
 		__entry->flags	= trace_pagemap_flags(page);
 	),
 
diff --git a/mm/swap.c b/mm/swap.c
index 22fad34b9c18..a26e3a4fe17b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -957,7 +957,6 @@ EXPORT_SYMBOL(__pagevec_release);
 
 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
 {
-	enum lru_list lru;
 	int was_unevictable = TestClearPageUnevictable(page);
 	int nr_pages = thp_nr_pages(page);
 
@@ -993,11 +992,9 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
 	smp_mb__after_atomic();
 
 	if (page_evictable(page)) {
-		lru = page_lru(page);
 		if (was_unevictable)
 			__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
 	} else {
-		lru = LRU_UNEVICTABLE;
 		ClearPageActive(page);
 		SetPageUnevictable(page);
 		if (!was_unevictable)
@@ -1005,7 +1002,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
 	}
 
 	add_page_to_lru_list(page, lruvec);
-	trace_mm_lru_insertion(page, lru);
+	trace_mm_lru_insertion(page);
 }
 
 /*
-- 
cgit v1.2.3


From 46ae6b2cc2a47904a368d238425531ea91f3a2a5 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:25 -0800
Subject: mm/swap.c: don't pass "enum lru_list" to del_page_from_lru_list()

The parameter is redundant in the sense that it can be potentially
extracted from the "struct page" parameter by page_lru(). We need to
make sure that existing PageActive() or PageUnevictable() remains
until the function returns. A few places don't conform, and simple
reordering fixes them.

This patch may have left page_off_lru() seemingly odd, and we'll take
care of it in the next patch.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-6-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-6-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h |  5 +++--
 mm/compaction.c           |  2 +-
 mm/mlock.c                |  3 +--
 mm/swap.c                 | 26 ++++++++++----------------
 mm/vmscan.c               |  4 ++--
 5 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 130ba3201d3f..ffacc6273678 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -124,9 +124,10 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
 }
 
 static __always_inline void del_page_from_lru_list(struct page *page,
-				struct lruvec *lruvec, enum lru_list lru)
+				struct lruvec *lruvec)
 {
 	list_del(&page->lru);
-	update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page));
+	update_lru_size(lruvec, page_lru(page), page_zonenum(page),
+			-thp_nr_pages(page));
 }
 #endif
diff --git a/mm/compaction.c b/mm/compaction.c
index 8f52532675a9..3aef10c61d0e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1034,7 +1034,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			low_pfn += compound_nr(page) - 1;
 
 		/* Successfully isolated */
-		del_page_from_lru_list(page, lruvec, page_lru(page));
+		del_page_from_lru_list(page, lruvec);
 		mod_node_page_state(page_pgdat(page),
 				NR_ISOLATED_ANON + page_is_file_lru(page),
 				thp_nr_pages(page));
diff --git a/mm/mlock.c b/mm/mlock.c
index 55b3b3672977..73960bb3464d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -278,8 +278,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 			 */
 			if (TestClearPageLRU(page)) {
 				lruvec = relock_page_lruvec_irq(page, lruvec);
-				del_page_from_lru_list(page, lruvec,
-							page_lru(page));
+				del_page_from_lru_list(page, lruvec);
 				continue;
 			} else
 				__munlock_isolation_failed(page);
diff --git a/mm/swap.c b/mm/swap.c
index a26e3a4fe17b..ff910f636df2 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -85,7 +85,8 @@ static void __page_cache_release(struct page *page)
 		lruvec = lock_page_lruvec_irqsave(page, &flags);
 		VM_BUG_ON_PAGE(!PageLRU(page), page);
 		__ClearPageLRU(page);
-		del_page_from_lru_list(page, lruvec, page_off_lru(page));
+		del_page_from_lru_list(page, lruvec);
+		page_off_lru(page);
 		unlock_page_lruvec_irqrestore(lruvec, flags);
 	}
 	__ClearPageWaiters(page);
@@ -229,7 +230,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
 {
 	if (!PageUnevictable(page)) {
-		del_page_from_lru_list(page, lruvec, page_lru(page));
+		del_page_from_lru_list(page, lruvec);
 		ClearPageActive(page);
 		add_page_to_lru_list_tail(page, lruvec);
 		__count_vm_events(PGROTATED, thp_nr_pages(page));
@@ -308,10 +309,9 @@ void lru_note_cost_page(struct page *page)
 static void __activate_page(struct page *page, struct lruvec *lruvec)
 {
 	if (!PageActive(page) && !PageUnevictable(page)) {
-		int lru = page_lru_base_type(page);
 		int nr_pages = thp_nr_pages(page);
 
-		del_page_from_lru_list(page, lruvec, lru);
+		del_page_from_lru_list(page, lruvec);
 		SetPageActive(page);
 		add_page_to_lru_list(page, lruvec);
 		trace_mm_lru_activate(page);
@@ -518,8 +518,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
  */
 static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 {
-	int lru;
-	bool active;
+	bool active = PageActive(page);
 	int nr_pages = thp_nr_pages(page);
 
 	if (PageUnevictable(page))
@@ -529,10 +528,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 	if (page_mapped(page))
 		return;
 
-	active = PageActive(page);
-	lru = page_lru_base_type(page);
-
-	del_page_from_lru_list(page, lruvec, lru + active);
+	del_page_from_lru_list(page, lruvec);
 	ClearPageActive(page);
 	ClearPageReferenced(page);
 
@@ -563,10 +559,9 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
 {
 	if (PageActive(page) && !PageUnevictable(page)) {
-		int lru = page_lru_base_type(page);
 		int nr_pages = thp_nr_pages(page);
 
-		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+		del_page_from_lru_list(page, lruvec);
 		ClearPageActive(page);
 		ClearPageReferenced(page);
 		add_page_to_lru_list(page, lruvec);
@@ -581,11 +576,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
 {
 	if (PageAnon(page) && PageSwapBacked(page) &&
 	    !PageSwapCache(page) && !PageUnevictable(page)) {
-		bool active = PageActive(page);
 		int nr_pages = thp_nr_pages(page);
 
-		del_page_from_lru_list(page, lruvec,
-				       LRU_INACTIVE_ANON + active);
+		del_page_from_lru_list(page, lruvec);
 		ClearPageActive(page);
 		ClearPageReferenced(page);
 		/*
@@ -919,7 +912,8 @@ void release_pages(struct page **pages, int nr)
 
 			VM_BUG_ON_PAGE(!PageLRU(page), page);
 			__ClearPageLRU(page);
-			del_page_from_lru_list(page, lruvec, page_off_lru(page));
+			del_page_from_lru_list(page, lruvec);
+			page_off_lru(page);
 		}
 
 		__ClearPageWaiters(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 09e4f97488c9..7c65d47e6612 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1766,7 +1766,7 @@ int isolate_lru_page(struct page *page)
 
 		get_page(page);
 		lruvec = lock_page_lruvec_irq(page);
-		del_page_from_lru_list(page, lruvec, page_lru(page));
+		del_page_from_lru_list(page, lruvec);
 		unlock_page_lruvec_irq(lruvec);
 		ret = 0;
 	}
@@ -4283,8 +4283,8 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 		lruvec = relock_page_lruvec_irq(page, lruvec);
 		if (page_evictable(page) && PageUnevictable(page)) {
 			VM_BUG_ON_PAGE(PageActive(page), page);
+			del_page_from_lru_list(page, lruvec);
 			ClearPageUnevictable(page);
-			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
 			add_page_to_lru_list(page, lruvec);
 			pgrescued += nr_pages;
 		}
-- 
cgit v1.2.3


From 875601796267214f286d3581fe74f2805d060fe8 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:28 -0800
Subject: mm: add __clear_page_lru_flags() to replace page_off_lru()

Similar to page_off_lru(), the new function does non-atomic clearing
of PageLRU() in addition to PageActive() and PageUnevictable(), on a
page that has no references left.

If PageActive() and PageUnevictable() are both set, refuse to clear
either and leave them to bad_page(). This is a behavior change that
is meant to help debug.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-7-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 28 ++++++++++------------------
 mm/swap.c                 |  6 ++----
 mm/vmscan.c               |  3 +--
 3 files changed, 13 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index ffacc6273678..ef3fd79222e5 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -61,27 +61,19 @@ static inline enum lru_list page_lru_base_type(struct page *page)
 }
 
 /**
- * page_off_lru - which LRU list was page on? clearing its lru flags.
- * @page: the page to test
- *
- * Returns the LRU list a page was on, as an index into the array of LRU
- * lists; and clears its Unevictable or Active flags, ready for freeing.
+ * __clear_page_lru_flags - clear page lru flags before releasing a page
+ * @page: the page that was on lru and now has a zero reference
  */
-static __always_inline enum lru_list page_off_lru(struct page *page)
+static __always_inline void __clear_page_lru_flags(struct page *page)
 {
-	enum lru_list lru;
+	__ClearPageLRU(page);
 
-	if (PageUnevictable(page)) {
-		__ClearPageUnevictable(page);
-		lru = LRU_UNEVICTABLE;
-	} else {
-		lru = page_lru_base_type(page);
-		if (PageActive(page)) {
-			__ClearPageActive(page);
-			lru += LRU_ACTIVE;
-		}
-	}
-	return lru;
+	/* this shouldn't happen, so leave the flags to bad_page() */
+	if (PageActive(page) && PageUnevictable(page))
+		return;
+
+	__ClearPageActive(page);
+	__ClearPageUnevictable(page);
 }
 
 /**
diff --git a/mm/swap.c b/mm/swap.c
index ff910f636df2..1455fa56a0ee 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -84,9 +84,8 @@ static void __page_cache_release(struct page *page)
 
 		lruvec = lock_page_lruvec_irqsave(page, &flags);
 		VM_BUG_ON_PAGE(!PageLRU(page), page);
-		__ClearPageLRU(page);
 		del_page_from_lru_list(page, lruvec);
-		page_off_lru(page);
+		__clear_page_lru_flags(page);
 		unlock_page_lruvec_irqrestore(lruvec, flags);
 	}
 	__ClearPageWaiters(page);
@@ -911,9 +910,8 @@ void release_pages(struct page **pages, int nr)
 				lock_batch = 0;
 
 			VM_BUG_ON_PAGE(!PageLRU(page), page);
-			__ClearPageLRU(page);
 			del_page_from_lru_list(page, lruvec);
-			page_off_lru(page);
+			__clear_page_lru_flags(page);
 		}
 
 		__ClearPageWaiters(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7c65d47e6612..452dd3818aa3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1849,8 +1849,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 		SetPageLRU(page);
 
 		if (unlikely(put_page_testzero(page))) {
-			__ClearPageLRU(page);
-			__ClearPageActive(page);
+			__clear_page_lru_flags(page);
 
 			if (unlikely(PageCompound(page))) {
 				spin_unlock_irq(&lruvec->lru_lock);
-- 
cgit v1.2.3


From bc7112719e1e80e4208eef3fc9bd8d2b6c263e7d Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:32 -0800
Subject: mm: VM_BUG_ON lru page flags

Move scattered VM_BUG_ONs to two essential places that cover all
lru list additions and deletions.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-8-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-8-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 4 ++++
 mm/swap.c                 | 2 --
 mm/vmscan.c               | 1 -
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index ef3fd79222e5..6d907a4dd6ad 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -66,6 +66,8 @@ static inline enum lru_list page_lru_base_type(struct page *page)
  */
 static __always_inline void __clear_page_lru_flags(struct page *page)
 {
+	VM_BUG_ON_PAGE(!PageLRU(page), page);
+
 	__ClearPageLRU(page);
 
 	/* this shouldn't happen, so leave the flags to bad_page() */
@@ -87,6 +89,8 @@ static __always_inline enum lru_list page_lru(struct page *page)
 {
 	enum lru_list lru;
 
+	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+
 	if (PageUnevictable(page))
 		lru = LRU_UNEVICTABLE;
 	else {
diff --git a/mm/swap.c b/mm/swap.c
index 1455fa56a0ee..ab3258afcbeb 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -83,7 +83,6 @@ static void __page_cache_release(struct page *page)
 		unsigned long flags;
 
 		lruvec = lock_page_lruvec_irqsave(page, &flags);
-		VM_BUG_ON_PAGE(!PageLRU(page), page);
 		del_page_from_lru_list(page, lruvec);
 		__clear_page_lru_flags(page);
 		unlock_page_lruvec_irqrestore(lruvec, flags);
@@ -909,7 +908,6 @@ void release_pages(struct page **pages, int nr)
 			if (prev_lruvec != lruvec)
 				lock_batch = 0;
 
-			VM_BUG_ON_PAGE(!PageLRU(page), page);
 			del_page_from_lru_list(page, lruvec);
 			__clear_page_lru_flags(page);
 		}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 452dd3818aa3..348a90096550 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4281,7 +4281,6 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 
 		lruvec = relock_page_lruvec_irq(page, lruvec);
 		if (page_evictable(page) && PageUnevictable(page)) {
-			VM_BUG_ON_PAGE(PageActive(page), page);
 			del_page_from_lru_list(page, lruvec);
 			ClearPageUnevictable(page);
 			add_page_to_lru_list(page, lruvec);
-- 
cgit v1.2.3


From c1770e34f3e7640887d8129fc05d13fe17101301 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:36 -0800
Subject: include/linux/mm_inline.h: fold page_lru_base_type() into its sole
 caller

We've removed all other references to this function.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-9-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-9-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 6d907a4dd6ad..7183c7a03f09 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -45,21 +45,6 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
 #endif
 }
 
-/**
- * page_lru_base_type - which LRU list type should a page be on?
- * @page: the page to test
- *
- * Used for LRU list index arithmetic.
- *
- * Returns the base LRU type - file or anon - @page should be on.
- */
-static inline enum lru_list page_lru_base_type(struct page *page)
-{
-	if (page_is_file_lru(page))
-		return LRU_INACTIVE_FILE;
-	return LRU_INACTIVE_ANON;
-}
-
 /**
  * __clear_page_lru_flags - clear page lru flags before releasing a page
  * @page: the page that was on lru and now has a zero reference
@@ -92,12 +77,12 @@ static __always_inline enum lru_list page_lru(struct page *page)
 	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
 
 	if (PageUnevictable(page))
-		lru = LRU_UNEVICTABLE;
-	else {
-		lru = page_lru_base_type(page);
-		if (PageActive(page))
-			lru += LRU_ACTIVE;
-	}
+		return LRU_UNEVICTABLE;
+
+	lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
+	if (PageActive(page))
+		lru += LRU_ACTIVE;
+
 	return lru;
 }
 
-- 
cgit v1.2.3


From 289ccba18af436f2b65ec69b2be1b086ec9f24a4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:40 -0800
Subject: include/linux/mm_inline.h: fold __update_lru_size() into its sole
 caller

All other references to the function were removed after commit
a892cb6b977f ("mm/vmscan.c: use update_lru_size() in update_lru_sizes()").

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-10-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-10-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 7183c7a03f09..355ea1ee32bd 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -24,7 +24,7 @@ static inline int page_is_file_lru(struct page *page)
 	return !PageSwapBacked(page);
 }
 
-static __always_inline void __update_lru_size(struct lruvec *lruvec,
+static __always_inline void update_lru_size(struct lruvec *lruvec,
 				enum lru_list lru, enum zone_type zid,
 				int nr_pages)
 {
@@ -33,13 +33,6 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
 	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
 	__mod_zone_page_state(&pgdat->node_zones[zid],
 				NR_ZONE_LRU_BASE + lru, nr_pages);
-}
-
-static __always_inline void update_lru_size(struct lruvec *lruvec,
-				enum lru_list lru, enum zone_type zid,
-				int nr_pages)
-{
-	__update_lru_size(lruvec, lru, zid, nr_pages);
 #ifdef CONFIG_MEMCG
 	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
 #endif
-- 
cgit v1.2.3


From 2091339d59e7808e9b39a79f48e3d17ef7389b97 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:44 -0800
Subject: mm/vmscan.c: make lruvec_lru_size() static

All other references to the function were removed after
commit b910718a948a ("mm: vmscan: detect file thrashing at the reclaim
root").

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-11-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-11-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 2 --
 mm/vmscan.c            | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc99e9241846..9198b7ade85f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -892,8 +892,6 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 #endif
 }
 
-extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
-
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
 int local_memory_node(int node_id);
 #else
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 348a90096550..fea6b43bc1f9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -310,7 +310,8 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
  * @lru: lru to use
  * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
  */
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
+static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
+				     int zone_idx)
 {
 	unsigned long size = 0;
 	int zid;
-- 
cgit v1.2.3


From d6995da311221a05c8aef3bda2629e5cb14c7302 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Feb 2021 12:08:51 -0800
Subject: hugetlb: use page.private for hugetlb specific page flags

Patch series "create hugetlb flags to consolidate state", v3.

While discussing a series of hugetlb fixes in [1], it became evident that
the hugetlb specific page state information is stored in a somewhat
haphazard manner.  Code dealing with state information would be easier to
read, understand and maintain if this information was stored in a
consistent manner.

This series uses page.private of the hugetlb head page for storing a set
of hugetlb specific page flags.  Routines are priovided for test, set and
clear of the flags.

[1] https://lore.kernel.org/r/20210106084739.63318-1-songmuchun@bytedance.com

This patch (of 4):

As hugetlbfs evolved, state information about hugetlb pages was added.
One 'convenient' way of doing this was to use available fields in tail
pages.  Over time, it has become difficult to know the meaning or contents
of fields simply by looking at a small bit of code.  Sometimes, the naming
is just confusing.  For example: The PagePrivate flag indicates a huge
page reservation was consumed and needs to be restored if an error is
encountered and the page is freed before it is instantiated.  The
page.private field contains the pointer to a subpool if the page is
associated with one.

In an effort to make the code more readable, use page.private to contain
hugetlb specific page flags.  These flags will have test, set and clear
functions similar to those used for 'normal' page flags.  More
importantly, an enum of flag values will be created with names that
actually reflect their purpose.

In this patch,
- Create infrastructure for hugetlb specific page flag functions
- Move subpool pointer to page[1].private to make way for flags
  Create routines with meaningful names to modify subpool field
- Use new HPageRestoreReserve flag instead of PagePrivate

Conversion of other state information will happen in subsequent patches.

Link: https://lkml.kernel.org/r/20210122195231.324857-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20210122195231.324857-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    | 12 +++------
 include/linux/hugetlb.h | 68 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/hugetlb.c            | 48 +++++++++++++++++-----------------
 3 files changed, 96 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b7a72f577aab..907f6405e805 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -973,15 +973,9 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
 
-	/*
-	 * page_private is subpool pointer in hugetlb pages.  Transfer to
-	 * new page.  PagePrivate is not associated with page_private for
-	 * hugetlb pages and can not be set here as only page_huge_active
-	 * pages can be migrated.
-	 */
-	if (page_private(page)) {
-		set_page_private(newpage, page_private(page));
-		set_page_private(page, 0);
+	if (hugetlb_page_subpool(page)) {
+		hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page));
+		hugetlb_set_page_subpool(page, NULL);
 	}
 
 	if (mode != MIGRATE_SYNC_NO_COPY)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ac3cb239a7ec..249c65e1d8ca 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -472,6 +472,60 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long flags);
 #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
 
+/*
+ * huegtlb page specific state flags.  These flags are located in page.private
+ * of the hugetlb head page.  Functions created via the below macros should be
+ * used to manipulate these flags.
+ *
+ * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at
+ *	allocation time.  Cleared when page is fully instantiated.  Free
+ *	routine checks flag to restore a reservation on error paths.
+ */
+enum hugetlb_page_flags {
+	HPG_restore_reserve = 0,
+	__NR_HPAGEFLAGS,
+};
+
+/*
+ * Macros to create test, set and clear function definitions for
+ * hugetlb specific page flags.
+ */
+#ifdef CONFIG_HUGETLB_PAGE
+#define TESTHPAGEFLAG(uname, flname)				\
+static inline int HPage##uname(struct page *page)		\
+	{ return test_bit(HPG_##flname, &(page->private)); }
+
+#define SETHPAGEFLAG(uname, flname)				\
+static inline void SetHPage##uname(struct page *page)		\
+	{ set_bit(HPG_##flname, &(page->private)); }
+
+#define CLEARHPAGEFLAG(uname, flname)				\
+static inline void ClearHPage##uname(struct page *page)		\
+	{ clear_bit(HPG_##flname, &(page->private)); }
+#else
+#define TESTHPAGEFLAG(uname, flname)				\
+static inline int HPage##uname(struct page *page)		\
+	{ return 0; }
+
+#define SETHPAGEFLAG(uname, flname)				\
+static inline void SetHPage##uname(struct page *page)		\
+	{ }
+
+#define CLEARHPAGEFLAG(uname, flname)				\
+static inline void ClearHPage##uname(struct page *page)		\
+	{ }
+#endif
+
+#define HPAGEFLAG(uname, flname)				\
+	TESTHPAGEFLAG(uname, flname)				\
+	SETHPAGEFLAG(uname, flname)				\
+	CLEARHPAGEFLAG(uname, flname)				\
+
+/*
+ * Create functions associated with hugetlb page flags
+ */
+HPAGEFLAG(RestoreReserve, restore_reserve)
+
 #ifdef CONFIG_HUGETLB_PAGE
 
 #define HSTATE_NAME_LEN 32
@@ -531,6 +585,20 @@ extern unsigned int default_hstate_idx;
 
 #define default_hstate (hstates[default_hstate_idx])
 
+/*
+ * hugetlb page subpool pointer located in hpage[1].private
+ */
+static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage)
+{
+	return (struct hugepage_subpool *)(hpage+1)->private;
+}
+
+static inline void hugetlb_set_page_subpool(struct page *hpage,
+					struct hugepage_subpool *subpool)
+{
+	set_page_private(hpage+1, (unsigned long)subpool);
+}
+
 static inline struct hstate *hstate_file(struct file *f)
 {
 	return hstate_inode(file_inode(f));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8a1d1f85e21e..f5e85dabb7a3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1143,7 +1143,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
 	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
 	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
-		SetPagePrivate(page);
+		SetHPageRestoreReserve(page);
 		h->resv_huge_pages--;
 	}
 
@@ -1418,20 +1418,19 @@ static void __free_huge_page(struct page *page)
 	 */
 	struct hstate *h = page_hstate(page);
 	int nid = page_to_nid(page);
-	struct hugepage_subpool *spool =
-		(struct hugepage_subpool *)page_private(page);
+	struct hugepage_subpool *spool = hugetlb_page_subpool(page);
 	bool restore_reserve;
 
 	VM_BUG_ON_PAGE(page_count(page), page);
 	VM_BUG_ON_PAGE(page_mapcount(page), page);
 
-	set_page_private(page, 0);
+	hugetlb_set_page_subpool(page, NULL);
 	page->mapping = NULL;
-	restore_reserve = PagePrivate(page);
-	ClearPagePrivate(page);
+	restore_reserve = HPageRestoreReserve(page);
+	ClearHPageRestoreReserve(page);
 
 	/*
-	 * If PagePrivate() was set on page, page allocation consumed a
+	 * If HPageRestoreReserve was set on page, page allocation consumed a
 	 * reservation.  If the page was associated with a subpool, there
 	 * would have been a page reserved in the subpool before allocation
 	 * via hugepage_subpool_get_pages().  Since we are 'restoring' the
@@ -2263,24 +2262,24 @@ static long vma_add_reservation(struct hstate *h,
  * This routine is called to restore a reservation on error paths.  In the
  * specific error paths, a huge page was allocated (via alloc_huge_page)
  * and is about to be freed.  If a reservation for the page existed,
- * alloc_huge_page would have consumed the reservation and set PagePrivate
- * in the newly allocated page.  When the page is freed via free_huge_page,
- * the global reservation count will be incremented if PagePrivate is set.
- * However, free_huge_page can not adjust the reserve map.  Adjust the
- * reserve map here to be consistent with global reserve count adjustments
- * to be made by free_huge_page.
+ * alloc_huge_page would have consumed the reservation and set
+ * HPageRestoreReserve in the newly allocated page.  When the page is freed
+ * via free_huge_page, the global reservation count will be incremented if
+ * HPageRestoreReserve is set.  However, free_huge_page can not adjust the
+ * reserve map.  Adjust the reserve map here to be consistent with global
+ * reserve count adjustments to be made by free_huge_page.
  */
 static void restore_reserve_on_error(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address,
 			struct page *page)
 {
-	if (unlikely(PagePrivate(page))) {
+	if (unlikely(HPageRestoreReserve(page))) {
 		long rc = vma_needs_reservation(h, vma, address);
 
 		if (unlikely(rc < 0)) {
 			/*
 			 * Rare out of memory condition in reserve map
-			 * manipulation.  Clear PagePrivate so that
+			 * manipulation.  Clear HPageRestoreReserve so that
 			 * global reserve count will not be incremented
 			 * by free_huge_page.  This will make it appear
 			 * as though the reservation for this page was
@@ -2289,7 +2288,7 @@ static void restore_reserve_on_error(struct hstate *h,
 			 * is better than inconsistent global huge page
 			 * accounting of reserve counts.
 			 */
-			ClearPagePrivate(page);
+			ClearHPageRestoreReserve(page);
 		} else if (rc) {
 			rc = vma_add_reservation(h, vma, address);
 			if (unlikely(rc < 0))
@@ -2297,7 +2296,7 @@ static void restore_reserve_on_error(struct hstate *h,
 				 * See above comment about rare out of
 				 * memory condition.
 				 */
-				ClearPagePrivate(page);
+				ClearHPageRestoreReserve(page);
 		} else
 			vma_end_reservation(h, vma, address);
 	}
@@ -2378,7 +2377,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 		if (!page)
 			goto out_uncharge_cgroup;
 		if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
-			SetPagePrivate(page);
+			SetHPageRestoreReserve(page);
 			h->resv_huge_pages--;
 		}
 		spin_lock(&hugetlb_lock);
@@ -2396,7 +2395,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 
 	spin_unlock(&hugetlb_lock);
 
-	set_page_private(page, (unsigned long)spool);
+	hugetlb_set_page_subpool(page, spool);
 
 	map_commit = vma_commit_reservation(h, vma, addr);
 	if (unlikely(map_chg > map_commit)) {
@@ -3170,6 +3169,9 @@ static int __init hugetlb_init(void)
 {
 	int i;
 
+	BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
+			__NR_HPAGEFLAGS);
+
 	if (!hugepages_supported()) {
 		if (hugetlb_max_hstate || default_hstate_max_huge_pages)
 			pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
@@ -4207,7 +4209,7 @@ retry_avoidcopy:
 	spin_lock(ptl);
 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
-		ClearPagePrivate(new_page);
+		ClearHPageRestoreReserve(new_page);
 
 		/* Break COW */
 		huge_ptep_clear_flush(vma, haddr, ptep);
@@ -4274,7 +4276,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 
 	if (err)
 		return err;
-	ClearPagePrivate(page);
+	ClearHPageRestoreReserve(page);
 
 	/*
 	 * set page dirty so that it will not be removed from cache/file
@@ -4436,7 +4438,7 @@ retry:
 		goto backout;
 
 	if (anon_rmap) {
-		ClearPagePrivate(page);
+		ClearHPageRestoreReserve(page);
 		hugepage_add_new_anon_rmap(page, vma, haddr);
 	} else
 		page_dup_rmap(page, true);
@@ -4750,7 +4752,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	if (vm_shared) {
 		page_dup_rmap(page, true);
 	} else {
-		ClearPagePrivate(page);
+		ClearHPageRestoreReserve(page);
 		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
 	}
 
-- 
cgit v1.2.3


From 8f251a3d5ce3bdea73bd045ed35db64f32e0d0d9 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Feb 2021 12:08:56 -0800
Subject: hugetlb: convert page_huge_active() HPageMigratable flag

Use the new hugetlb page specific flag HPageMigratable to replace the
page_huge_active interfaces.  By it's name, page_huge_active implied that
a huge page was on the active list.  However, that is not really what code
checking the flag wanted to know.  It really wanted to determine if the
huge page could be migrated.  This happens when the page is actually added
to the page cache and/or task page table.  This is the reasoning behind
the name change.

The VM_BUG_ON_PAGE() calls in the *_huge_active() interfaces are not
really necessary as we KNOW the page is a hugetlb page.  Therefore, they
are removed.

The routine page_huge_active checked for PageHeadHuge before testing the
active bit.  This is unnecessary in the case where we hold a reference or
lock and know it is a hugetlb head page.  page_huge_active is also called
without holding a reference or lock (scan_movable_pages), and can race
with code freeing the page.  The extra check in page_huge_active shortened
the race window, but did not prevent the race.  Offline code calling
scan_movable_pages already deals with these races, so removing the check
is acceptable.  Add comment to racy code.

[songmuchun@bytedance.com: remove set_page_huge_active() declaration from include/linux/hugetlb.h]
  Link: https://lkml.kernel.org/r/CAMZfGtUda+KoAZscU0718TN61cSFwp4zy=y2oZ=+6Z2TAZZwng@mail.gmail.com

Link: https://lkml.kernel.org/r/20210122195231.324857-3-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c       |  2 +-
 include/linux/hugetlb.h    |  7 +++++--
 include/linux/page-flags.h |  6 ------
 mm/hugetlb.c               | 45 +++++++++++----------------------------------
 mm/memory_hotplug.c        |  9 ++++++++-
 5 files changed, 25 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 907f6405e805..5a8ed6bd4f87 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -735,7 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 
-		set_page_huge_active(page);
+		SetHPageMigratable(page);
 		/*
 		 * unlock_page because locked by add_to_page_cache()
 		 * put_page() due to reference from alloc_huge_page()
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 249c65e1d8ca..77f2a032fe32 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -480,9 +480,13 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at
  *	allocation time.  Cleared when page is fully instantiated.  Free
  *	routine checks flag to restore a reservation on error paths.
+ * HPG_migratable  - Set after a newly allocated page is added to the page
+ *	cache and/or page tables.  Indicates the page is a candidate for
+ *	migration.
  */
 enum hugetlb_page_flags {
 	HPG_restore_reserve = 0,
+	HPG_migratable,
 	__NR_HPAGEFLAGS,
 };
 
@@ -525,6 +529,7 @@ static inline void ClearHPage##uname(struct page *page)		\
  * Create functions associated with hugetlb page flags
  */
 HPAGEFLAG(RestoreReserve, restore_reserve)
+HPAGEFLAG(Migratable, migratable)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
@@ -838,8 +843,6 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 }
 #endif
 
-void set_page_huge_active(struct page *page);
-
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ec5d0290e0ee..db914477057b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -592,15 +592,9 @@ static inline void ClearPageCompound(struct page *page)
 #ifdef CONFIG_HUGETLB_PAGE
 int PageHuge(struct page *page);
 int PageHeadHuge(struct page *page);
-bool page_huge_active(struct page *page);
 #else
 TESTPAGEFLAG_FALSE(Huge)
 TESTPAGEFLAG_FALSE(HeadHuge)
-
-static inline bool page_huge_active(struct page *page)
-{
-	return 0;
-}
 #endif
 
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f5e85dabb7a3..727c09713627 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1364,30 +1364,6 @@ struct hstate *size_to_hstate(unsigned long size)
 	return NULL;
 }
 
-/*
- * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
- * to hstate->hugepage_activelist.)
- *
- * This function can be called for tail pages, but never returns true for them.
- */
-bool page_huge_active(struct page *page)
-{
-	return PageHeadHuge(page) && PagePrivate(&page[1]);
-}
-
-/* never called for tail page */
-void set_page_huge_active(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
-	SetPagePrivate(&page[1]);
-}
-
-static void clear_page_huge_active(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
-	ClearPagePrivate(&page[1]);
-}
-
 /*
  * Internal hugetlb specific page flag. Do not use outside of the hugetlb
  * code
@@ -1449,7 +1425,7 @@ static void __free_huge_page(struct page *page)
 	}
 
 	spin_lock(&hugetlb_lock);
-	clear_page_huge_active(page);
+	ClearHPageMigratable(page);
 	hugetlb_cgroup_uncharge_page(hstate_index(h),
 				     pages_per_huge_page(h), page);
 	hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
@@ -4218,7 +4194,7 @@ retry_avoidcopy:
 				make_huge_pte(vma, new_page, 1));
 		page_remove_rmap(old_page, true);
 		hugepage_add_new_anon_rmap(new_page, vma, haddr);
-		set_page_huge_active(new_page);
+		SetHPageMigratable(new_page);
 		/* Make the old page be freed below */
 		new_page = old_page;
 	}
@@ -4455,12 +4431,12 @@ retry:
 	spin_unlock(ptl);
 
 	/*
-	 * Only make newly allocated pages active.  Existing pages found
-	 * in the pagecache could be !page_huge_active() if they have been
-	 * isolated for migration.
+	 * Only set HPageMigratable in newly allocated pages.  Existing pages
+	 * found in the pagecache may not have HPageMigratableset if they have
+	 * been isolated for migration.
 	 */
 	if (new_page)
-		set_page_huge_active(page);
+		SetHPageMigratable(page);
 
 	unlock_page(page);
 out:
@@ -4771,7 +4747,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
 
 	spin_unlock(ptl);
-	set_page_huge_active(page);
+	SetHPageMigratable(page);
 	if (vm_shared)
 		unlock_page(page);
 	ret = 0;
@@ -5610,12 +5586,13 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
 	bool ret = true;
 
 	spin_lock(&hugetlb_lock);
-	if (!PageHeadHuge(page) || !page_huge_active(page) ||
+	if (!PageHeadHuge(page) ||
+	    !HPageMigratable(page) ||
 	    !get_page_unless_zero(page)) {
 		ret = false;
 		goto unlock;
 	}
-	clear_page_huge_active(page);
+	ClearHPageMigratable(page);
 	list_move_tail(&page->lru, list);
 unlock:
 	spin_unlock(&hugetlb_lock);
@@ -5625,7 +5602,7 @@ unlock:
 void putback_active_hugepage(struct page *page)
 {
 	spin_lock(&hugetlb_lock);
-	set_page_huge_active(page);
+	SetHPageMigratable(page);
 	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
 	spin_unlock(&hugetlb_lock);
 	put_page(page);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ddcb1cd24c60..abe43c1ae920 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1260,7 +1260,14 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
 		if (!PageHuge(page))
 			continue;
 		head = compound_head(page);
-		if (page_huge_active(head))
+		/*
+		 * This test is racy as we hold no reference or lock.  The
+		 * hugetlb page could have been free'ed and head is no longer
+		 * a hugetlb page before the following check.  In such unlikely
+		 * cases false positives and negatives are possible.  Calling
+		 * code must deal with these scenarios.
+		 */
+		if (HPageMigratable(head))
 			goto found;
 		skip = compound_nr(head) - (page - head);
 		pfn += skip - 1;
-- 
cgit v1.2.3


From 9157c31186c358c5750dea50ac5705d61d7fc917 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Feb 2021 12:09:00 -0800
Subject: hugetlb: convert PageHugeTemporary() to HPageTemporary flag

Use new hugetlb specific HPageTemporary flag to replace the
PageHugeTemporary() interfaces.  PageHugeTemporary does contain a
PageHuge() check.  However, this interface is only used within hugetlb
code where we know we are dealing with a hugetlb page.  Therefore, the
check can be eliminated.

Link: https://lkml.kernel.org/r/20210122195231.324857-5-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  6 ++++++
 mm/hugetlb.c            | 36 +++++++-----------------------------
 2 files changed, 13 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 77f2a032fe32..e97498a21010 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -483,10 +483,15 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  * HPG_migratable  - Set after a newly allocated page is added to the page
  *	cache and/or page tables.  Indicates the page is a candidate for
  *	migration.
+ * HPG_temporary - - Set on a page that is temporarily allocated from the buddy
+ *	allocator.  Typically used for migration target pages when no pages
+ *	are available in the pool.  The hugetlb free page path will
+ *	immediately free pages with this flag set to the buddy allocator.
  */
 enum hugetlb_page_flags {
 	HPG_restore_reserve = 0,
 	HPG_migratable,
+	HPG_temporary,
 	__NR_HPAGEFLAGS,
 };
 
@@ -530,6 +535,7 @@ static inline void ClearHPage##uname(struct page *page)		\
  */
 HPAGEFLAG(RestoreReserve, restore_reserve)
 HPAGEFLAG(Migratable, migratable)
+HPAGEFLAG(Temporary, temporary)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 727c09713627..4d7d4535a313 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1364,28 +1364,6 @@ struct hstate *size_to_hstate(unsigned long size)
 	return NULL;
 }
 
-/*
- * Internal hugetlb specific page flag. Do not use outside of the hugetlb
- * code
- */
-static inline bool PageHugeTemporary(struct page *page)
-{
-	if (!PageHuge(page))
-		return false;
-
-	return (unsigned long)page[2].mapping == -1U;
-}
-
-static inline void SetPageHugeTemporary(struct page *page)
-{
-	page[2].mapping = (void *)-1U;
-}
-
-static inline void ClearPageHugeTemporary(struct page *page)
-{
-	page[2].mapping = NULL;
-}
-
 static void __free_huge_page(struct page *page)
 {
 	/*
@@ -1433,9 +1411,9 @@ static void __free_huge_page(struct page *page)
 	if (restore_reserve)
 		h->resv_huge_pages++;
 
-	if (PageHugeTemporary(page)) {
+	if (HPageTemporary(page)) {
 		list_del(&page->lru);
-		ClearPageHugeTemporary(page);
+		ClearHPageTemporary(page);
 		update_and_free_page(h, page);
 	} else if (h->surplus_huge_pages_node[nid]) {
 		/* remove the page from active list */
@@ -1869,7 +1847,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
 	 * codeflow
 	 */
 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
-		SetPageHugeTemporary(page);
+		SetHPageTemporary(page);
 		spin_unlock(&hugetlb_lock);
 		put_page(page);
 		return NULL;
@@ -1900,7 +1878,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
 	 * We do not account these pages as surplus because they are only
 	 * temporary and will be released properly on the last reference
 	 */
-	SetPageHugeTemporary(page);
+	SetHPageTemporary(page);
 
 	return page;
 }
@@ -5625,12 +5603,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
 	 * here as well otherwise the global surplus count will not match
 	 * the per-node's.
 	 */
-	if (PageHugeTemporary(newpage)) {
+	if (HPageTemporary(newpage)) {
 		int old_nid = page_to_nid(oldpage);
 		int new_nid = page_to_nid(newpage);
 
-		SetPageHugeTemporary(oldpage);
-		ClearPageHugeTemporary(newpage);
+		SetHPageTemporary(oldpage);
+		ClearHPageTemporary(newpage);
 
 		spin_lock(&hugetlb_lock);
 		if (h->surplus_huge_pages_node[old_nid]) {
-- 
cgit v1.2.3


From 6c037149014027d50175da5be4ae4531374dcbe0 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Feb 2021 12:09:04 -0800
Subject: hugetlb: convert PageHugeFreed to HPageFreed flag

Use new hugetlb specific HPageFreed flag to replace the PageHugeFreed
interfaces.

Link: https://lkml.kernel.org/r/20210122195231.324857-6-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  3 +++
 mm/hugetlb.c            | 23 ++++-------------------
 2 files changed, 7 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e97498a21010..eb2682fd97b6 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -487,11 +487,13 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  *	allocator.  Typically used for migration target pages when no pages
  *	are available in the pool.  The hugetlb free page path will
  *	immediately free pages with this flag set to the buddy allocator.
+ * HPG_freed - Set when page is on the free lists.
  */
 enum hugetlb_page_flags {
 	HPG_restore_reserve = 0,
 	HPG_migratable,
 	HPG_temporary,
+	HPG_freed,
 	__NR_HPAGEFLAGS,
 };
 
@@ -536,6 +538,7 @@ static inline void ClearHPage##uname(struct page *page)		\
 HPAGEFLAG(RestoreReserve, restore_reserve)
 HPAGEFLAG(Migratable, migratable)
 HPAGEFLAG(Temporary, temporary)
+HPAGEFLAG(Freed, freed)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4d7d4535a313..5377e1dad044 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -79,21 +79,6 @@ DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
-static inline bool PageHugeFreed(struct page *head)
-{
-	return page_private(head + 4) == -1UL;
-}
-
-static inline void SetPageHugeFreed(struct page *head)
-{
-	set_page_private(head + 4, -1UL);
-}
-
-static inline void ClearPageHugeFreed(struct page *head)
-{
-	set_page_private(head + 4, 0);
-}
-
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
 
@@ -1053,7 +1038,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 	list_move(&page->lru, &h->hugepage_freelists[nid]);
 	h->free_huge_pages++;
 	h->free_huge_pages_node[nid]++;
-	SetPageHugeFreed(page);
+	SetHPageFreed(page);
 }
 
 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
@@ -1070,7 +1055,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 
 		list_move(&page->lru, &h->hugepage_activelist);
 		set_page_refcounted(page);
-		ClearPageHugeFreed(page);
+		ClearHPageFreed(page);
 		h->free_huge_pages--;
 		h->free_huge_pages_node[nid]--;
 		return page;
@@ -1485,7 +1470,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 	spin_lock(&hugetlb_lock);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
-	ClearPageHugeFreed(page);
+	ClearHPageFreed(page);
 	spin_unlock(&hugetlb_lock);
 }
 
@@ -1756,7 +1741,7 @@ retry:
 		 * We should make sure that the page is already on the free list
 		 * when it is dissolved.
 		 */
-		if (unlikely(!PageHugeFreed(head))) {
+		if (unlikely(!HPageFreed(head))) {
 			spin_unlock(&hugetlb_lock);
 			cond_resched();
 
-- 
cgit v1.2.3


From d95c0337774b1dc74d271e7475a96fe8838332ea Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Feb 2021 12:09:08 -0800
Subject: include/linux/hugetlb.h: add synchronization information for new
 hugetlb specific flags

Add comments, no functional change.

Link: https://lkml.kernel.org/r/62a80585-2a73-10cc-4a2d-5721540d4ad2@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index eb2682fd97b6..3a541c6cf5c3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -480,14 +480,24 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at
  *	allocation time.  Cleared when page is fully instantiated.  Free
  *	routine checks flag to restore a reservation on error paths.
+ *	Synchronization:  Examined or modified by code that knows it has
+ *	the only reference to page.  i.e. After allocation but before use
+ *	or when the page is being freed.
  * HPG_migratable  - Set after a newly allocated page is added to the page
  *	cache and/or page tables.  Indicates the page is a candidate for
  *	migration.
+ *	Synchronization:  Initially set after new page allocation with no
+ *	locking.  When examined and modified during migration processing
+ *	(isolate, migrate, putback) the hugetlb_lock is held.
  * HPG_temporary - - Set on a page that is temporarily allocated from the buddy
  *	allocator.  Typically used for migration target pages when no pages
  *	are available in the pool.  The hugetlb free page path will
  *	immediately free pages with this flag set to the buddy allocator.
+ *	Synchronization: Can be set after huge page allocation from buddy when
+ *	code knows it has only reference.  All other examinations and
+ *	modifications require hugetlb_lock.
  * HPG_freed - Set when page is on the free lists.
+ *	Synchronization: hugetlb_lock held for examination and modification.
  */
 enum hugetlb_page_flags {
 	HPG_restore_reserve = 0,
-- 
cgit v1.2.3


From bda420b985054a3badafef23807c4b4fa38a3dff Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 24 Feb 2021 12:09:43 -0800
Subject: numa balancing: migrate on fault among multiple bound nodes

Now, NUMA balancing can only optimize the page placement among the NUMA
nodes if the default memory policy is used.  Because the memory policy
specified explicitly should take precedence.  But this seems too strict in
some situations.  For example, on a system with 4 NUMA nodes, if the
memory of an application is bound to the node 0 and 1, NUMA balancing can
potentially migrate the pages between the node 0 and 1 to reduce
cross-node accessing without breaking the explicit memory binding policy.

So in this patch, we add MPOL_F_NUMA_BALANCING mode flag to
set_mempolicy() when mode is MPOL_BIND.  With the flag specified, NUMA
balancing will be enabled within the thread to optimize the page placement
within the constrains of the specified memory binding policy.  With the
newly added flag, the NUMA balancing control mechanism becomes,

 - sysctl knob numa_balancing can enable/disable the NUMA balancing
   globally.

 - even if sysctl numa_balancing is enabled, the NUMA balancing will be
   disabled for the memory areas or applications with the explicit
   memory policy by default.

 - MPOL_F_NUMA_BALANCING can be used to enable the NUMA balancing for
   the applications when specifying the explicit memory policy
   (MPOL_BIND).

Various page placement optimization based on the NUMA balancing can be
done with these flags.  As the first step, in this patch, if the memory of
the application is bound to multiple nodes (MPOL_BIND), and in the hint
page fault handler the accessing node are in the policy nodemask, the page
will be tried to be migrated to the accessing node to reduce the
cross-node accessing.

If the newly added MPOL_F_NUMA_BALANCING flag is specified by an
application on an old kernel version without its support, set_mempolicy()
will return -1 and errno will be set to EINVAL.  The application can use
this behavior to run on both old and new kernel versions.

And if the MPOL_F_NUMA_BALANCING flag is specified for the mode other than
MPOL_BIND, set_mempolicy() will return -1 and errno will be set to EINVAL
as before.  Because we don't support optimization based on the NUMA
balancing for these modes.

In the previous version of the patch, we tried to reuse MPOL_MF_LAZY for
mbind().  But that flag is tied to MPOL_MF_MOVE.*, so it seems not a good
API/ABI for the purpose of the patch.

And because it's not clear whether it's necessary to enable NUMA balancing
for a specific memory area inside an application, so we only add the flag
at the thread level (set_mempolicy()) instead of the memory area level
(mbind()).  We can do that when it become necessary.

To test the patch, we run a test case as follows on a 4-node machine with
192 GB memory (48 GB per node).

1. Change pmbench memory accessing benchmark to call set_mempolicy()
   to bind its memory to node 1 and 3 and enable NUMA balancing.  Some
   related code snippets are as follows,

     #include <numaif.h>
     #include <numa.h>

	struct bitmask *bmp;
	int ret;

	bmp = numa_parse_nodestring("1,3");
	ret = set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING,
			    bmp->maskp, bmp->size + 1);
	/* If MPOL_F_NUMA_BALANCING isn't supported, fall back to MPOL_BIND */
	if (ret < 0 && errno == EINVAL)
		ret = set_mempolicy(MPOL_BIND, bmp->maskp, bmp->size + 1);
	if (ret < 0) {
		perror("Failed to call set_mempolicy");
		exit(-1);
	}

2. Run a memory eater on node 3 to use 40 GB memory before running pmbench.

3. Run pmbench with 64 processes, the working-set size of each process
   is 640 MB, so the total working-set size is 64 * 640 MB = 40 GB.  The
   CPU and the memory (as in step 1.) of all pmbench processes is bound
   to node 1 and 3. So, after CPU usage is balanced, some pmbench
   processes run on the CPUs of the node 3 will access the memory of
   the node 1.

4. After the pmbench processes run for 100 seconds, kill the memory
   eater.  Now it's possible for some pmbench processes to migrate
   their pages from node 1 to node 3 to reduce cross-node accessing.

Test results show that, with the patch, the pages can be migrated from
node 1 to node 3 after killing the memory eater, and the pmbench score
can increase about 17.5%.

Link: https://lkml.kernel.org/r/20210120061235.148637-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/mempolicy.h |  4 +++-
 mm/mempolicy.c                 | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 3354774af61e..8948467b3992 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -28,12 +28,14 @@ enum {
 /* Flags for set_mempolicy */
 #define MPOL_F_STATIC_NODES	(1 << 15)
 #define MPOL_F_RELATIVE_NODES	(1 << 14)
+#define MPOL_F_NUMA_BALANCING	(1 << 13) /* Optimize with NUMA balancing if possible */
 
 /*
  * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
  * either set_mempolicy() or mbind().
  */
-#define MPOL_MODE_FLAGS	(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
+#define MPOL_MODE_FLAGS							\
+	(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES | MPOL_F_NUMA_BALANCING)
 
 /* Flags for get_mempolicy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2c3a86502053..6961238c7ef5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -875,6 +875,16 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 		goto out;
 	}
 
+	if (flags & MPOL_F_NUMA_BALANCING) {
+		if (new && new->mode == MPOL_BIND) {
+			new->flags |= (MPOL_F_MOF | MPOL_F_MORON);
+		} else {
+			ret = -EINVAL;
+			mpol_put(new);
+			goto out;
+		}
+	}
+
 	ret = mpol_set_nodemask(new, nodes, scratch);
 	if (ret) {
 		mpol_put(new);
@@ -2486,6 +2496,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		break;
 
 	case MPOL_BIND:
+		/* Optimize placement among multiple nodes via NUMA balancing */
+		if (pol->flags & MPOL_F_MORON) {
+			if (node_isset(thisnid, pol->v.nodes))
+				break;
+			goto out;
+		}
 
 		/*
 		 * allows binding to multiple nodes.
-- 
cgit v1.2.3


From 33b8f84a4ee78491a8f4f9e4c5520c9da4a10983 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Feb 2021 12:09:54 -0800
Subject: mm/hugetlb: change hugetlb_reserve_pages() to type bool

While reviewing a bug in hugetlb_reserve_pages, it was noticed that all
callers ignore the return value.  Any failure is considered an ENOMEM
error by the callers.

Change the function to be of type bool.  The function will return true if
the reservation was successful, false otherwise.  Callers currently assume
a zero return code indicates success.  Change the callers to look for true
to indicate success.  No functional change, only code cleanup.

Link: https://lkml.kernel.org/r/20201221192542.15732-1-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    |  4 ++--
 include/linux/hugetlb.h |  2 +-
 mm/hugetlb.c            | 37 ++++++++++++++-----------------------
 3 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5a8ed6bd4f87..3eca85a4d940 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -171,7 +171,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	file_accessed(file);
 
 	ret = -ENOMEM;
-	if (hugetlb_reserve_pages(inode,
+	if (!hugetlb_reserve_pages(inode,
 				vma->vm_pgoff >> huge_page_order(h),
 				len >> huge_page_shift(h), vma,
 				vma->vm_flags))
@@ -1493,7 +1493,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
 	inode->i_size = size;
 	clear_nlink(inode);
 
-	if (hugetlb_reserve_pages(inode, 0,
+	if (!hugetlb_reserve_pages(inode, 0,
 			size >> huge_page_shift(hstate_inode(inode)), NULL,
 			acctflag))
 		file = ERR_PTR(-ENOMEM);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3a541c6cf5c3..cccd1aab69dd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -139,7 +139,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
 				unsigned long dst_addr,
 				unsigned long src_addr,
 				struct page **pagep);
-int hugetlb_reserve_pages(struct inode *inode, long from, long to,
+bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
 						vm_flags_t vm_flags);
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8e5f494291c6..8fb42c6dd74b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5016,12 +5016,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	return pages << h->order;
 }
 
-int hugetlb_reserve_pages(struct inode *inode,
+/* Return true if reservation was successful, false otherwise.  */
+bool hugetlb_reserve_pages(struct inode *inode,
 					long from, long to,
 					struct vm_area_struct *vma,
 					vm_flags_t vm_flags)
 {
-	long ret, chg, add = -1;
+	long chg, add = -1;
 	struct hstate *h = hstate_inode(inode);
 	struct hugepage_subpool *spool = subpool_inode(inode);
 	struct resv_map *resv_map;
@@ -5031,7 +5032,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	/* This should never happen */
 	if (from > to) {
 		VM_WARN(1, "%s called with a negative range\n", __func__);
-		return -EINVAL;
+		return false;
 	}
 
 	/*
@@ -5040,7 +5041,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * without using reserves
 	 */
 	if (vm_flags & VM_NORESERVE)
-		return 0;
+		return true;
 
 	/*
 	 * Shared mappings base their reservation on the number of pages that
@@ -5062,7 +5063,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 		/* Private mapping. */
 		resv_map = resv_map_alloc();
 		if (!resv_map)
-			return -ENOMEM;
+			return false;
 
 		chg = to - from;
 
@@ -5070,18 +5071,12 @@ int hugetlb_reserve_pages(struct inode *inode,
 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
 
-	if (chg < 0) {
-		ret = chg;
+	if (chg < 0)
 		goto out_err;
-	}
 
-	ret = hugetlb_cgroup_charge_cgroup_rsvd(
-		hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
-
-	if (ret < 0) {
-		ret = -ENOMEM;
+	if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
+				chg * pages_per_huge_page(h), &h_cg) < 0)
 		goto out_err;
-	}
 
 	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
 		/* For private mappings, the hugetlb_cgroup uncharge info hangs
@@ -5096,19 +5091,15 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * reservations already in place (gbl_reserve).
 	 */
 	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
-	if (gbl_reserve < 0) {
-		ret = -ENOSPC;
+	if (gbl_reserve < 0)
 		goto out_uncharge_cgroup;
-	}
 
 	/*
 	 * Check enough hugepages are available for the reservation.
 	 * Hand the pages back to the subpool if there are not
 	 */
-	ret = hugetlb_acct_memory(h, gbl_reserve);
-	if (ret < 0) {
+	if (hugetlb_acct_memory(h, gbl_reserve) < 0)
 		goto out_put_pages;
-	}
 
 	/*
 	 * Account for the reservations made. Shared mappings record regions
@@ -5126,7 +5117,6 @@ int hugetlb_reserve_pages(struct inode *inode,
 
 		if (unlikely(add < 0)) {
 			hugetlb_acct_memory(h, -gbl_reserve);
-			ret = add;
 			goto out_put_pages;
 		} else if (unlikely(chg > add)) {
 			/*
@@ -5147,7 +5137,8 @@ int hugetlb_reserve_pages(struct inode *inode,
 			hugetlb_acct_memory(h, -rsv_adjust);
 		}
 	}
-	return 0;
+	return true;
+
 out_put_pages:
 	/* put back original number of pages, chg */
 	(void)hugepage_subpool_put_pages(spool, chg);
@@ -5163,7 +5154,7 @@ out_err:
 			region_abort(resv_map, from, to, regions_needed);
 	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
 		kref_put(&resv_map->refs, resv_map_release);
-	return ret;
+	return false;
 }
 
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
-- 
cgit v1.2.3


From a553e3cd2053501b658feec2be9a3b662eb1b22b Mon Sep 17 00:00:00 2001
From: Chengyang Fan <cy.fan@huawei.com>
Date: Wed, 24 Feb 2021 12:10:28 -0800
Subject: mm/migrate: remove unneeded semicolons

Remove superfluous semicolons after function definitions.

Link: https://lkml.kernel.org/r/20210115110131.2359683-1-cy.fan@huawei.com
Signed-off-by: Chengyang Fan <cy.fan@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4594838a0f7c..3a389633b68f 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -89,7 +89,7 @@ extern int PageMovable(struct page *page);
 extern void __SetPageMovable(struct page *page, struct address_space *mapping);
 extern void __ClearPageMovable(struct page *page);
 #else
-static inline int PageMovable(struct page *page) { return 0; };
+static inline int PageMovable(struct page *page) { return 0; }
 static inline void __SetPageMovable(struct page *page,
 				struct address_space *mapping)
 {
-- 
cgit v1.2.3


From 4e096a18867a5a989b510f6999d9c6b6622e8f7b Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Tue, 23 Feb 2021 08:01:26 +0100
Subject: net: introduce CAN specific pointer in the struct net_device

Since 20dd3850bcf8 ("can: Speed up CAN frame receiption by using
ml_priv") the CAN framework uses per device specific data in the AF_CAN
protocol. For this purpose the struct net_device->ml_priv is used. Later
the ml_priv usage in CAN was extended for other users, one of them being
CAN_J1939.

Later in the kernel ml_priv was converted to an union, used by other
drivers. E.g. the tun driver started storing it's stats pointer.

Since tun devices can claim to be a CAN device, CAN specific protocols
will wrongly interpret this pointer, which will cause system crashes.
Mostly this issue is visible in the CAN_J1939 stack.

To fix this issue, we request a dedicated CAN pointer within the
net_device struct.

Reported-by: syzbot+5138c4dd15a0401bec7b@syzkaller.appspotmail.com
Fixes: 20dd3850bcf8 ("can: Speed up CAN frame receiption by using ml_priv")
Fixes: ffd956eef69b ("can: introduce CAN midlayer private and allocate it automatically")
Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol")
Fixes: 497a5757ce4e ("tun: switch to net core provided statistics counters")
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://lore.kernel.org/r/20210223070127.4538-1-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/can/dev/dev.c  |  4 +++-
 drivers/net/can/slcan.c    |  4 +++-
 drivers/net/can/vcan.c     |  2 +-
 drivers/net/can/vxcan.c    |  6 +++++-
 include/linux/can/can-ml.h | 12 ++++++++++++
 include/linux/netdevice.h  | 34 +++++++++++++++++++++++++++++++++-
 net/can/af_can.c           | 34 ++--------------------------------
 net/can/j1939/main.c       | 22 ++++++++--------------
 net/can/j1939/socket.c     | 13 ++++---------
 net/can/proc.c             | 19 +++++++++++++------
 10 files changed, 84 insertions(+), 66 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index d9281ae853f8..311d8564d611 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -239,6 +239,7 @@ void can_setup(struct net_device *dev)
 struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
 				    unsigned int txqs, unsigned int rxqs)
 {
+	struct can_ml_priv *can_ml;
 	struct net_device *dev;
 	struct can_priv *priv;
 	int size;
@@ -270,7 +271,8 @@ struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
 	priv = netdev_priv(dev);
 	priv->dev = dev;
 
-	dev->ml_priv = (void *)priv + ALIGN(sizeof_priv, NETDEV_ALIGN);
+	can_ml = (void *)priv + ALIGN(sizeof_priv, NETDEV_ALIGN);
+	can_set_ml_priv(dev, can_ml);
 
 	if (echo_skb_max) {
 		priv->echo_skb_max = echo_skb_max;
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index a1bd1be09548..30c8d53c9745 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -516,6 +516,7 @@ static struct slcan *slc_alloc(void)
 	int i;
 	char name[IFNAMSIZ];
 	struct net_device *dev = NULL;
+	struct can_ml_priv *can_ml;
 	struct slcan       *sl;
 	int size;
 
@@ -538,7 +539,8 @@ static struct slcan *slc_alloc(void)
 
 	dev->base_addr  = i;
 	sl = netdev_priv(dev);
-	dev->ml_priv = (void *)sl + ALIGN(sizeof(*sl), NETDEV_ALIGN);
+	can_ml = (void *)sl + ALIGN(sizeof(*sl), NETDEV_ALIGN);
+	can_set_ml_priv(dev, can_ml);
 
 	/* Initialize channel control data */
 	sl->magic = SLCAN_MAGIC;
diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c
index 39ca14b0585d..067705e2850b 100644
--- a/drivers/net/can/vcan.c
+++ b/drivers/net/can/vcan.c
@@ -153,7 +153,7 @@ static void vcan_setup(struct net_device *dev)
 	dev->addr_len		= 0;
 	dev->tx_queue_len	= 0;
 	dev->flags		= IFF_NOARP;
-	dev->ml_priv		= netdev_priv(dev);
+	can_set_ml_priv(dev, netdev_priv(dev));
 
 	/* set flags according to driver capabilities */
 	if (echo)
diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c
index f9a524c5f6d6..8861a7d875e7 100644
--- a/drivers/net/can/vxcan.c
+++ b/drivers/net/can/vxcan.c
@@ -141,6 +141,8 @@ static const struct net_device_ops vxcan_netdev_ops = {
 
 static void vxcan_setup(struct net_device *dev)
 {
+	struct can_ml_priv *can_ml;
+
 	dev->type		= ARPHRD_CAN;
 	dev->mtu		= CANFD_MTU;
 	dev->hard_header_len	= 0;
@@ -149,7 +151,9 @@ static void vxcan_setup(struct net_device *dev)
 	dev->flags		= (IFF_NOARP|IFF_ECHO);
 	dev->netdev_ops		= &vxcan_netdev_ops;
 	dev->needs_free_netdev	= true;
-	dev->ml_priv		= netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN);
+
+	can_ml = netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN);
+	can_set_ml_priv(dev, can_ml);
 }
 
 /* forward declaration for rtnl_create_link() */
diff --git a/include/linux/can/can-ml.h b/include/linux/can/can-ml.h
index 2f5d731ae251..8afa92d15a66 100644
--- a/include/linux/can/can-ml.h
+++ b/include/linux/can/can-ml.h
@@ -44,6 +44,7 @@
 
 #include <linux/can.h>
 #include <linux/list.h>
+#include <linux/netdevice.h>
 
 #define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS)
 #define CAN_EFF_RCV_HASH_BITS 10
@@ -65,4 +66,15 @@ struct can_ml_priv {
 #endif
 };
 
+static inline struct can_ml_priv *can_get_ml_priv(struct net_device *dev)
+{
+	return netdev_get_ml_priv(dev, ML_PRIV_CAN);
+}
+
+static inline void can_set_ml_priv(struct net_device *dev,
+				   struct can_ml_priv *ml_priv)
+{
+	netdev_set_ml_priv(dev, ml_priv, ML_PRIV_CAN);
+}
+
 #endif /* CAN_ML_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ddf4cfc12615..f06fbee8638e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1584,6 +1584,12 @@ enum netdev_priv_flags {
 #define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
 #define IFF_LIVE_RENAME_OK		IFF_LIVE_RENAME_OK
 
+/* Specifies the type of the struct net_device::ml_priv pointer */
+enum netdev_ml_priv_type {
+	ML_PRIV_NONE,
+	ML_PRIV_CAN,
+};
+
 /**
  *	struct net_device - The DEVICE structure.
  *
@@ -1779,6 +1785,7 @@ enum netdev_priv_flags {
  * 	@nd_net:		Network namespace this network device is inside
  *
  * 	@ml_priv:	Mid-layer private
+ *	@ml_priv_type:  Mid-layer private type
  * 	@lstats:	Loopback statistics
  * 	@tstats:	Tunnel statistics
  * 	@dstats:	Dummy statistics
@@ -2094,8 +2101,10 @@ struct net_device {
 	possible_net_t			nd_net;
 
 	/* mid-layer private */
+	void				*ml_priv;
+	enum netdev_ml_priv_type	ml_priv_type;
+
 	union {
-		void					*ml_priv;
 		struct pcpu_lstats __percpu		*lstats;
 		struct pcpu_sw_netstats __percpu	*tstats;
 		struct pcpu_dstats __percpu		*dstats;
@@ -2286,6 +2295,29 @@ static inline void netdev_reset_rx_headroom(struct net_device *dev)
 	netdev_set_rx_headroom(dev, -1);
 }
 
+static inline void *netdev_get_ml_priv(struct net_device *dev,
+				       enum netdev_ml_priv_type type)
+{
+	if (dev->ml_priv_type != type)
+		return NULL;
+
+	return dev->ml_priv;
+}
+
+static inline void netdev_set_ml_priv(struct net_device *dev,
+				      void *ml_priv,
+				      enum netdev_ml_priv_type type)
+{
+	WARN(dev->ml_priv_type && dev->ml_priv_type != type,
+	     "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n",
+	     dev->ml_priv_type, type);
+	WARN(!dev->ml_priv_type && dev->ml_priv,
+	     "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n");
+
+	dev->ml_priv = ml_priv;
+	dev->ml_priv_type = type;
+}
+
 /*
  * Net namespace inlines
  */
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 837bb8af0ec3..cce2af10eb3e 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -304,8 +304,8 @@ static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net,
 							struct net_device *dev)
 {
 	if (dev) {
-		struct can_ml_priv *ml_priv = dev->ml_priv;
-		return &ml_priv->dev_rcv_lists;
+		struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+		return &can_ml->dev_rcv_lists;
 	} else {
 		return net->can.rx_alldev_list;
 	}
@@ -790,25 +790,6 @@ void can_proto_unregister(const struct can_proto *cp)
 }
 EXPORT_SYMBOL(can_proto_unregister);
 
-/* af_can notifier to create/remove CAN netdevice specific structs */
-static int can_notifier(struct notifier_block *nb, unsigned long msg,
-			void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-
-	if (dev->type != ARPHRD_CAN)
-		return NOTIFY_DONE;
-
-	switch (msg) {
-	case NETDEV_REGISTER:
-		WARN(!dev->ml_priv,
-		     "No CAN mid layer private allocated, please fix your driver and use alloc_candev()!\n");
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
 static int can_pernet_init(struct net *net)
 {
 	spin_lock_init(&net->can.rcvlists_lock);
@@ -876,11 +857,6 @@ static const struct net_proto_family can_family_ops = {
 	.owner  = THIS_MODULE,
 };
 
-/* notifier block for netdevice event */
-static struct notifier_block can_netdev_notifier __read_mostly = {
-	.notifier_call = can_notifier,
-};
-
 static struct pernet_operations can_pernet_ops __read_mostly = {
 	.init = can_pernet_init,
 	.exit = can_pernet_exit,
@@ -911,17 +887,12 @@ static __init int can_init(void)
 	err = sock_register(&can_family_ops);
 	if (err)
 		goto out_sock;
-	err = register_netdevice_notifier(&can_netdev_notifier);
-	if (err)
-		goto out_notifier;
 
 	dev_add_pack(&can_packet);
 	dev_add_pack(&canfd_packet);
 
 	return 0;
 
-out_notifier:
-	sock_unregister(PF_CAN);
 out_sock:
 	unregister_pernet_subsys(&can_pernet_ops);
 out_pernet:
@@ -935,7 +906,6 @@ static __exit void can_exit(void)
 	/* protocol unregister */
 	dev_remove_pack(&canfd_packet);
 	dev_remove_pack(&can_packet);
-	unregister_netdevice_notifier(&can_netdev_notifier);
 	sock_unregister(PF_CAN);
 
 	unregister_pernet_subsys(&can_pernet_ops);
diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c
index bb914d8b4216..da3a7a7bcff2 100644
--- a/net/can/j1939/main.c
+++ b/net/can/j1939/main.c
@@ -140,9 +140,9 @@ static struct j1939_priv *j1939_priv_create(struct net_device *ndev)
 static inline void j1939_priv_set(struct net_device *ndev,
 				  struct j1939_priv *priv)
 {
-	struct can_ml_priv *can_ml_priv = ndev->ml_priv;
+	struct can_ml_priv *can_ml = can_get_ml_priv(ndev);
 
-	can_ml_priv->j1939_priv = priv;
+	can_ml->j1939_priv = priv;
 }
 
 static void __j1939_priv_release(struct kref *kref)
@@ -211,12 +211,9 @@ static void __j1939_rx_release(struct kref *kref)
 /* get pointer to priv without increasing ref counter */
 static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev)
 {
-	struct can_ml_priv *can_ml_priv = ndev->ml_priv;
+	struct can_ml_priv *can_ml = can_get_ml_priv(ndev);
 
-	if (!can_ml_priv)
-		return NULL;
-
-	return can_ml_priv->j1939_priv;
+	return can_ml->j1939_priv;
 }
 
 static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev)
@@ -225,9 +222,6 @@ static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev)
 
 	lockdep_assert_held(&j1939_netdev_lock);
 
-	if (ndev->type != ARPHRD_CAN)
-		return NULL;
-
 	priv = j1939_ndev_to_priv(ndev);
 	if (priv)
 		j1939_priv_get(priv);
@@ -348,15 +342,16 @@ static int j1939_netdev_notify(struct notifier_block *nb,
 			       unsigned long msg, void *data)
 {
 	struct net_device *ndev = netdev_notifier_info_to_dev(data);
+	struct can_ml_priv *can_ml = can_get_ml_priv(ndev);
 	struct j1939_priv *priv;
 
+	if (!can_ml)
+		goto notify_done;
+
 	priv = j1939_priv_get_by_ndev(ndev);
 	if (!priv)
 		goto notify_done;
 
-	if (ndev->type != ARPHRD_CAN)
-		goto notify_put;
-
 	switch (msg) {
 	case NETDEV_DOWN:
 		j1939_cancel_active_session(priv, NULL);
@@ -365,7 +360,6 @@ static int j1939_netdev_notify(struct notifier_block *nb,
 		break;
 	}
 
-notify_put:
 	j1939_priv_put(priv);
 
 notify_done:
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index f23966526a88..56aa66147d5a 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -12,6 +12,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/can/can-ml.h>
 #include <linux/can/core.h>
 #include <linux/can/skb.h>
 #include <linux/errqueue.h>
@@ -453,6 +454,7 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)
 		j1939_jsk_del(priv, jsk);
 		j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa);
 	} else {
+		struct can_ml_priv *can_ml;
 		struct net_device *ndev;
 
 		ndev = dev_get_by_index(net, addr->can_ifindex);
@@ -461,15 +463,8 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)
 			goto out_release_sock;
 		}
 
-		if (ndev->type != ARPHRD_CAN) {
-			dev_put(ndev);
-			ret = -ENODEV;
-			goto out_release_sock;
-		}
-
-		if (!ndev->ml_priv) {
-			netdev_warn_once(ndev,
-					 "No CAN mid layer private allocated, please fix your driver and use alloc_candev()!\n");
+		can_ml = can_get_ml_priv(ndev);
+		if (!can_ml) {
 			dev_put(ndev);
 			ret = -ENODEV;
 			goto out_release_sock;
diff --git a/net/can/proc.c b/net/can/proc.c
index 5ea8695f507e..b15760b5c1cc 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -322,8 +322,11 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v)
 
 	/* receive list for registered CAN devices */
 	for_each_netdev_rcu(net, dev) {
-		if (dev->type == ARPHRD_CAN && dev->ml_priv)
-			can_rcvlist_proc_show_one(m, idx, dev, dev->ml_priv);
+		struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+
+		if (can_ml)
+			can_rcvlist_proc_show_one(m, idx, dev,
+						  &can_ml->dev_rcv_lists);
 	}
 
 	rcu_read_unlock();
@@ -375,8 +378,10 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
 
 	/* sff receive list for registered CAN devices */
 	for_each_netdev_rcu(net, dev) {
-		if (dev->type == ARPHRD_CAN && dev->ml_priv) {
-			dev_rcv_lists = dev->ml_priv;
+		struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+
+		if (can_ml) {
+			dev_rcv_lists = &can_ml->dev_rcv_lists;
 			can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_sff,
 						    ARRAY_SIZE(dev_rcv_lists->rx_sff));
 		}
@@ -406,8 +411,10 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v)
 
 	/* eff receive list for registered CAN devices */
 	for_each_netdev_rcu(net, dev) {
-		if (dev->type == ARPHRD_CAN && dev->ml_priv) {
-			dev_rcv_lists = dev->ml_priv;
+		struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+
+		if (can_ml) {
+			dev_rcv_lists = &can_ml->dev_rcv_lists;
 			can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_eff,
 						    ARRAY_SIZE(dev_rcv_lists->rx_eff));
 		}
-- 
cgit v1.2.3


From d814567942ff6ac73869052bdb8ca911364e5eb0 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 25 Feb 2021 12:49:58 -0500
Subject: mm, tracing: Fix kmem_cache_free trace event to not print stale
 pointers

The update to kmem_cache_free trace event added printing of the slab name in
the trace event. But it only stores the pointer of the name which will be
printed as a string when the event is read some time in the future. This is
dangerous because the name could be freed in the mean time and when reading
the trace event it would try to dereference the string name by the pointer
to the name that has been freed.

Instead, use the trace event helper macros __string(), __assign_str(), and
__get_str() that are for this very case.

Cc: Jacob Wen <jian.w.wen@oracle.com>
Fixes: 3544de8ee6e4 ("mm, tracing: record slab name for kmem_cache_free()")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/trace/events/kmem.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 40845b0d5dad..3a60b6b6db32 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -144,17 +144,17 @@ TRACE_EVENT(kmem_cache_free,
 	TP_STRUCT__entry(
 		__field(	unsigned long,	call_site	)
 		__field(	const void *,	ptr		)
-		__field(	const char *,	name		)
+		__string(	name,	name	)
 	),
 
 	TP_fast_assign(
 		__entry->call_site	= call_site;
 		__entry->ptr		= ptr;
-		__entry->name		= name;
+		__assign_str(name, name);
 	),
 
 	TP_printk("call_site=%pS ptr=%p name=%s",
-		  (void *)__entry->call_site, __entry->ptr, __entry->name)
+		  (void *)__entry->call_site, __entry->ptr, __get_str(name))
 );
 
 TRACE_EVENT(mm_page_free,
-- 
cgit v1.2.3


From f5b6a74d9c08b19740ca056876bf6584acdba582 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Fri, 29 Jan 2021 17:46:51 -0700
Subject: vmlinux.lds.h: Define SANTIZER_DISCARDS with CONFIG_GCOV_KERNEL=y

clang produces .eh_frame sections when CONFIG_GCOV_KERNEL is enabled,
even when -fno-asynchronous-unwind-tables is in KBUILD_CFLAGS:

$ make CC=clang vmlinux
...
ld: warning: orphan section `.eh_frame' from `init/main.o' being placed in section `.eh_frame'
ld: warning: orphan section `.eh_frame' from `init/version.o' being placed in section `.eh_frame'
ld: warning: orphan section `.eh_frame' from `init/do_mounts.o' being placed in section `.eh_frame'
ld: warning: orphan section `.eh_frame' from `init/do_mounts_initrd.o' being placed in section `.eh_frame'
ld: warning: orphan section `.eh_frame' from `init/initramfs.o' being placed in section `.eh_frame'
ld: warning: orphan section `.eh_frame' from `init/calibrate.o' being placed in section `.eh_frame'
ld: warning: orphan section `.eh_frame' from `init/init_task.o' being placed in section `.eh_frame'
...

$ rg "GCOV_KERNEL|GCOV_PROFILE_ALL" .config
CONFIG_GCOV_KERNEL=y
CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y
CONFIG_GCOV_PROFILE_ALL=y

This was already handled for a couple of other options in
commit d812db78288d ("vmlinux.lds.h: Avoid KASAN and KCSAN's unwanted
sections") and there is an open LLVM bug for this issue. Take advantage
of that section for this config as well so that there are no more orphan
warnings.

Link: https://bugs.llvm.org/show_bug.cgi?id=46478
Link: https://github.com/ClangBuiltLinux/linux/issues/1069
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Fangrui Song <maskray@google.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Fixes: d812db78288d ("vmlinux.lds.h: Avoid KASAN and KCSAN's unwanted sections")
Cc: stable@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210130004650.2682422-1-nathan@kernel.org
---
 include/asm-generic/vmlinux.lds.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index b97c628ad91f..ec89e657ea2f 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -988,12 +988,13 @@
 #endif
 
 /*
- * Clang's -fsanitize=kernel-address and -fsanitize=thread produce
- * unwanted sections (.eh_frame and .init_array.*), but
- * CONFIG_CONSTRUCTORS wants to keep any .init_array.* sections.
+ * Clang's -fprofile-arcs, -fsanitize=kernel-address, and
+ * -fsanitize=thread produce unwanted sections (.eh_frame
+ * and .init_array.*), but CONFIG_CONSTRUCTORS wants to
+ * keep any .init_array.* sections.
  * https://bugs.llvm.org/show_bug.cgi?id=46478
  */
-#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KCSAN)
+#if defined(CONFIG_GCOV_KERNEL) || defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KCSAN)
 # ifdef CONFIG_CONSTRUCTORS
 #  define SANITIZER_DISCARDS						\
 	*(.eh_frame)
-- 
cgit v1.2.3


From 44835d20b2a0c9b4c0c3fb96e90f4e2fd4a4e41d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:15:36 -0800
Subject: mm: add FGP_ENTRY

The functionality of find_lock_entry() and find_get_entry() can be
provided by pagecache_get_page(), which lets us delete find_lock_entry()
and make find_get_entry() static.

Link: https://lkml.kernel.org/r/20201112212641.27837-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h |  1 +
 mm/filemap.c            | 44 ++++++++------------------------------------
 mm/internal.h           |  3 ---
 mm/shmem.c              |  3 ++-
 mm/swap_state.c         |  3 ++-
 5 files changed, 13 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bd629d676a27..b379b2388202 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -315,6 +315,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_NOWAIT		0x00000020
 #define FGP_FOR_MMAP		0x00000040
 #define FGP_HEAD		0x00000080
+#define FGP_ENTRY		0x00000100
 
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
 		int fgp_flags, gfp_t cache_gfp_mask);
diff --git a/mm/filemap.c b/mm/filemap.c
index 57eae5163bce..84b7813badf1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1658,7 +1658,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 }
 EXPORT_SYMBOL(page_cache_prev_miss);
 
-/**
+/*
  * find_get_entry - find and get a page cache entry
  * @mapping: the address_space to search
  * @index: The page cache index.
@@ -1671,7 +1671,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
  *
  * Return: The head page or shadow entry, %NULL if nothing is found.
  */
-struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
+static struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
 {
 	XA_STATE(xas, &mapping->i_pages, index);
 	struct page *page;
@@ -1707,39 +1707,6 @@ out:
 	return page;
 }
 
-/**
- * find_lock_entry - Locate and lock a page cache entry.
- * @mapping: The address_space to search.
- * @index: The page cache index.
- *
- * Looks up the page at @mapping & @index.  If there is a page in the
- * cache, the head page is returned locked and with an increased refcount.
- *
- * If the slot holds a shadow entry of a previously evicted page, or a
- * swap entry from shmem/tmpfs, it is returned.
- *
- * Context: May sleep.
- * Return: The head page or shadow entry, %NULL if nothing is found.
- */
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
-{
-	struct page *page;
-
-repeat:
-	page = find_get_entry(mapping, index);
-	if (page && !xa_is_value(page)) {
-		lock_page(page);
-		/* Has the page been truncated? */
-		if (unlikely(page->mapping != mapping)) {
-			unlock_page(page);
-			put_page(page);
-			goto repeat;
-		}
-		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
-	}
-	return page;
-}
-
 /**
  * pagecache_get_page - Find and get a reference to a page.
  * @mapping: The address_space to search.
@@ -1755,6 +1722,8 @@ repeat:
  * * %FGP_LOCK - The page is returned locked.
  * * %FGP_HEAD - If the page is present and a THP, return the head page
  *   rather than the exact page specified by the index.
+ * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
+ *   instead of allocating a new page to replace it.
  * * %FGP_CREAT - If no page is present then a new page is allocated using
  *   @gfp_mask and added to the page cache and the VM's LRU list.
  *   The page is returned locked and with an increased refcount.
@@ -1779,8 +1748,11 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
 
 repeat:
 	page = find_get_entry(mapping, index);
-	if (xa_is_value(page))
+	if (xa_is_value(page)) {
+		if (fgp_flags & FGP_ENTRY)
+			return page;
 		page = NULL;
+	}
 	if (!page)
 		goto no_page;
 
diff --git a/mm/internal.h b/mm/internal.h
index 25d2b2439f19..eed74f1e6147 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -60,9 +60,6 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
 	force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
 }
 
-struct page *find_get_entry(struct address_space *mapping, pgoff_t index);
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t index);
-
 /**
  * page_evictable - test whether a page is evictable
  * @page: the page to test
diff --git a/mm/shmem.c b/mm/shmem.c
index 5ea1fa53db3f..bd5bb78128af 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1812,7 +1812,8 @@ repeat:
 	sbinfo = SHMEM_SB(inode->i_sb);
 	charge_mm = vma ? vma->vm_mm : current->mm;
 
-	page = find_lock_entry(mapping, index);
+	page = pagecache_get_page(mapping, index,
+					FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
 	if (xa_is_value(page)) {
 		error = shmem_swapin_page(inode, index, &page,
 					  sgp, gfp, vma, fault_type);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f270c30d4681..3cdee7b11da9 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -403,7 +403,8 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
 {
 	swp_entry_t swp;
 	struct swap_info_struct *si;
-	struct page *page = find_get_entry(mapping, index);
+	struct page *page = pagecache_get_page(mapping, index,
+						FGP_ENTRY | FGP_HEAD, 0);
 
 	if (!page)
 		return page;
-- 
cgit v1.2.3


From 41139aa4c3a31ee7e072fc63353c74035aade2ff Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:15:48 -0800
Subject: mm/filemap: add mapping_seek_hole_data

Rewrite shmem_seek_hole_data() and move it to filemap.c.

[willy@infradead.org: don't put an xa_is_value() page]
  Link: https://lkml.kernel.org/r/20201124041507.28996-4-willy@infradead.org

Link: https://lkml.kernel.org/r/20201112212641.27837-8-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h |  2 ++
 mm/filemap.c            | 76 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/shmem.c              | 74 +++--------------------------------------------
 3 files changed, 82 insertions(+), 70 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index b379b2388202..3608993428d9 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -760,6 +760,8 @@ extern void __delete_from_page_cache(struct page *page, void *shadow);
 void replace_page_cache_page(struct page *old, struct page *new);
 void delete_from_page_cache_batch(struct address_space *mapping,
 				  struct pagevec *pvec);
+loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
+		int whence);
 
 /*
  * Like add_to_page_cache_locked, but used to add newly allocated pages:
diff --git a/mm/filemap.c b/mm/filemap.c
index 21443850aeae..eff3006be12a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2553,6 +2553,82 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 }
 EXPORT_SYMBOL(generic_file_read_iter);
 
+static inline bool page_seek_match(struct page *page, bool seek_data)
+{
+	if (xa_is_value(page) || PageUptodate(page))
+		return seek_data;
+	return !seek_data;
+}
+
+static inline
+unsigned int seek_page_size(struct xa_state *xas, struct page *page)
+{
+	if (xa_is_value(page))
+		return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
+	return thp_size(page);
+}
+
+/**
+ * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ * @mapping: Address space to search.
+ * @start: First byte to consider.
+ * @end: Limit of search (exclusive).
+ * @whence: Either SEEK_HOLE or SEEK_DATA.
+ *
+ * If the page cache knows which blocks contain holes and which blocks
+ * contain data, your filesystem can use this function to implement
+ * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
+ * entirely memory-based such as tmpfs, and filesystems which support
+ * unwritten extents.
+ *
+ * Return: The requested offset on successs, or -ENXIO if @whence specifies
+ * SEEK_DATA and there is no data after @start.  There is an implicit hole
+ * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
+ * and @end contain data.
+ */
+loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
+		loff_t end, int whence)
+{
+	XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
+	pgoff_t max = (end - 1) / PAGE_SIZE;
+	bool seek_data = (whence == SEEK_DATA);
+	struct page *page;
+
+	if (end <= start)
+		return -ENXIO;
+
+	rcu_read_lock();
+	while ((page = find_get_entry(&xas, max, XA_PRESENT))) {
+		loff_t pos = xas.xa_index * PAGE_SIZE;
+
+		if (start < pos) {
+			if (!seek_data)
+				goto unlock;
+			start = pos;
+		}
+
+		if (page_seek_match(page, seek_data))
+			goto unlock;
+		start = pos + seek_page_size(&xas, page);
+		if (!xa_is_value(page))
+			put_page(page);
+	}
+	rcu_read_unlock();
+
+	if (seek_data)
+		return -ENXIO;
+	goto out;
+
+unlock:
+	rcu_read_unlock();
+	if (!xa_is_value(page))
+		put_page(page);
+out:
+	if (start > end)
+		return end;
+	return start;
+}
+
 #ifdef CONFIG_MMU
 #define MMAP_LOTSAMISS  (100)
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index bd5bb78128af..deb22e128435 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2668,86 +2668,20 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return retval ? retval : error;
 }
 
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the page cache.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
-				    pgoff_t index, pgoff_t end, int whence)
-{
-	struct page *page;
-	struct pagevec pvec;
-	pgoff_t indices[PAGEVEC_SIZE];
-	bool done = false;
-	int i;
-
-	pagevec_init(&pvec);
-	pvec.nr = 1;		/* start small: we may be there already */
-	while (!done) {
-		pvec.nr = find_get_entries(mapping, index,
-					pvec.nr, pvec.pages, indices);
-		if (!pvec.nr) {
-			if (whence == SEEK_DATA)
-				index = end;
-			break;
-		}
-		for (i = 0; i < pvec.nr; i++, index++) {
-			if (index < indices[i]) {
-				if (whence == SEEK_HOLE) {
-					done = true;
-					break;
-				}
-				index = indices[i];
-			}
-			page = pvec.pages[i];
-			if (page && !xa_is_value(page)) {
-				if (!PageUptodate(page))
-					page = NULL;
-			}
-			if (index >= end ||
-			    (page && whence == SEEK_DATA) ||
-			    (!page && whence == SEEK_HOLE)) {
-				done = true;
-				break;
-			}
-		}
-		pagevec_remove_exceptionals(&pvec);
-		pagevec_release(&pvec);
-		pvec.nr = PAGEVEC_SIZE;
-		cond_resched();
-	}
-	return index;
-}
-
 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	pgoff_t start, end;
-	loff_t new_offset;
 
 	if (whence != SEEK_DATA && whence != SEEK_HOLE)
 		return generic_file_llseek_size(file, offset, whence,
 					MAX_LFS_FILESIZE, i_size_read(inode));
+	if (offset < 0)
+		return -ENXIO;
+
 	inode_lock(inode);
 	/* We're holding i_mutex so we can access i_size directly */
-
-	if (offset < 0 || offset >= inode->i_size)
-		offset = -ENXIO;
-	else {
-		start = offset >> PAGE_SHIFT;
-		end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		new_offset = shmem_seek_hole_data(mapping, start, end, whence);
-		new_offset <<= PAGE_SHIFT;
-		if (new_offset > offset) {
-			if (new_offset < inode->i_size)
-				offset = new_offset;
-			else if (whence == SEEK_DATA)
-				offset = -ENXIO;
-			else
-				offset = inode->i_size;
-		}
-	}
-
+	offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
 	if (offset >= 0)
 		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
 	inode_unlock(inode);
-- 
cgit v1.2.3


From ca122fe40eb463c8c11c3bfc1914f0048ca5c268 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:16:00 -0800
Subject: mm: add an 'end' parameter to find_get_entries

This simplifies the callers and leads to a more efficient implementation
since the XArray has this functionality already.

Link: https://lkml.kernel.org/r/20201112212641.27837-11-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h |  4 ++--
 mm/filemap.c            |  9 +++++----
 mm/shmem.c              | 10 ++--------
 mm/swap.c               |  2 +-
 4 files changed, 10 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3608993428d9..fdb2c4e44851 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -451,8 +451,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
 }
 
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
-			  unsigned int nr_entries, struct page **entries,
-			  pgoff_t *indices);
+		pgoff_t end, unsigned int nr_entries, struct page **entries,
+		pgoff_t *indices);
 unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
 			pgoff_t end, unsigned int nr_pages,
 			struct page **pages);
diff --git a/mm/filemap.c b/mm/filemap.c
index 61fdcdc75275..65cfdff17ac6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1865,6 +1865,7 @@ reset:
  * find_get_entries - gang pagecache lookup
  * @mapping:	The address_space to search
  * @start:	The starting page cache index
+ * @end:	The final page index (inclusive).
  * @nr_entries:	The maximum number of entries
  * @entries:	Where the resulting entries are placed
  * @indices:	The cache indices corresponding to the entries in @entries
@@ -1888,9 +1889,9 @@ reset:
  *
  * Return: the number of pages and shadow entries which were found.
  */
-unsigned find_get_entries(struct address_space *mapping,
-			  pgoff_t start, unsigned int nr_entries,
-			  struct page **entries, pgoff_t *indices)
+unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+		pgoff_t end, unsigned int nr_entries, struct page **entries,
+		pgoff_t *indices)
 {
 	XA_STATE(xas, &mapping->i_pages, start);
 	struct page *page;
@@ -1900,7 +1901,7 @@ unsigned find_get_entries(struct address_space *mapping,
 		return 0;
 
 	rcu_read_lock();
-	while ((page = find_get_entry(&xas, ULONG_MAX, XA_PRESENT))) {
+	while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
 		/*
 		 * Terminate early on finding a THP, to allow the caller to
 		 * handle it all at once; but continue if this is hugetlbfs.
diff --git a/mm/shmem.c b/mm/shmem.c
index 86b1f5bc502c..4aac760aa2d4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -913,8 +913,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			struct page *page = pvec.pages[i];
 
 			index = indices[i];
-			if (index >= end)
-				break;
 
 			if (xa_is_value(page)) {
 				if (unfalloc)
@@ -967,9 +965,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 	while (index < end) {
 		cond_resched();
 
-		pvec.nr = find_get_entries(mapping, index,
-				min(end - index, (pgoff_t)PAGEVEC_SIZE),
-				pvec.pages, indices);
+		pvec.nr = find_get_entries(mapping, index, end - 1,
+				PAGEVEC_SIZE, pvec.pages, indices);
 		if (!pvec.nr) {
 			/* If all gone or hole-punch or unfalloc, we're done */
 			if (index == start || end != -1)
@@ -982,9 +979,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			struct page *page = pvec.pages[i];
 
 			index = indices[i];
-			if (index >= end)
-				break;
-
 			if (xa_is_value(page)) {
 				if (unfalloc)
 					continue;
diff --git a/mm/swap.c b/mm/swap.c
index ab3258afcbeb..c5773a84feab 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1046,7 +1046,7 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec,
 				pgoff_t start, unsigned nr_entries,
 				pgoff_t *indices)
 {
-	pvec->nr = find_get_entries(mapping, start, nr_entries,
+	pvec->nr = find_get_entries(mapping, start, ULONG_MAX, nr_entries,
 				    pvec->pages, indices);
 	return pagevec_count(pvec);
 }
-- 
cgit v1.2.3


From 31d270fd98d196578223e5b568a0bd3bc6028b09 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:16:03 -0800
Subject: mm: add an 'end' parameter to pagevec_lookup_entries

Simplifies the callers and uses the existing functionality in
find_get_entries().  We can also drop the final argument of
truncate_exceptional_pvec_entries() and simplify the logic in that
function.

Link: https://lkml.kernel.org/r/20201112212641.27837-12-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h |  5 ++---
 mm/swap.c               |  8 ++++----
 mm/truncate.c           | 41 ++++++++++-------------------------------
 3 files changed, 16 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index ad4ddc17d403..f70a9dc81504 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -26,9 +26,8 @@ struct pagevec {
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
 unsigned pagevec_lookup_entries(struct pagevec *pvec,
-				struct address_space *mapping,
-				pgoff_t start, unsigned nr_entries,
-				pgoff_t *indices);
+		struct address_space *mapping, pgoff_t start, pgoff_t end,
+		unsigned nr_entries, pgoff_t *indices);
 void pagevec_remove_exceptionals(struct pagevec *pvec);
 unsigned pagevec_lookup_range(struct pagevec *pvec,
 			      struct address_space *mapping,
diff --git a/mm/swap.c b/mm/swap.c
index c5773a84feab..db8c354264a5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1022,6 +1022,7 @@ void __pagevec_lru_add(struct pagevec *pvec)
  * @pvec:	Where the resulting entries are placed
  * @mapping:	The address_space to search
  * @start:	The starting entry index
+ * @end:	The highest index to return (inclusive).
  * @nr_entries:	The maximum number of pages
  * @indices:	The cache indices corresponding to the entries in @pvec
  *
@@ -1042,11 +1043,10 @@ void __pagevec_lru_add(struct pagevec *pvec)
  * found.
  */
 unsigned pagevec_lookup_entries(struct pagevec *pvec,
-				struct address_space *mapping,
-				pgoff_t start, unsigned nr_entries,
-				pgoff_t *indices)
+		struct address_space *mapping, pgoff_t start, pgoff_t end,
+		unsigned nr_entries, pgoff_t *indices)
 {
-	pvec->nr = find_get_entries(mapping, start, ULONG_MAX, nr_entries,
+	pvec->nr = find_get_entries(mapping, start, end, nr_entries,
 				    pvec->pages, indices);
 	return pagevec_count(pvec);
 }
diff --git a/mm/truncate.c b/mm/truncate.c
index de7f4f47f780..60df23890c2d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -57,11 +57,10 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
  * exceptional entries similar to what pagevec_remove_exceptionals does.
  */
 static void truncate_exceptional_pvec_entries(struct address_space *mapping,
-				struct pagevec *pvec, pgoff_t *indices,
-				pgoff_t end)
+				struct pagevec *pvec, pgoff_t *indices)
 {
 	int i, j;
-	bool dax, lock;
+	bool dax;
 
 	/* Handled by shmem itself */
 	if (shmem_mapping(mapping))
@@ -75,8 +74,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
 		return;
 
 	dax = dax_mapping(mapping);
-	lock = !dax && indices[j] < end;
-	if (lock)
+	if (!dax)
 		xa_lock_irq(&mapping->i_pages);
 
 	for (i = j; i < pagevec_count(pvec); i++) {
@@ -88,9 +86,6 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
 			continue;
 		}
 
-		if (index >= end)
-			continue;
-
 		if (unlikely(dax)) {
 			dax_delete_mapping_entry(mapping, index);
 			continue;
@@ -99,7 +94,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
 		__clear_shadow_entry(mapping, index, page);
 	}
 
-	if (lock)
+	if (!dax)
 		xa_unlock_irq(&mapping->i_pages);
 	pvec->nr = j;
 }
@@ -329,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	while (index < end && find_lock_entries(mapping, index, end - 1,
 			&pvec, indices)) {
 		index = indices[pagevec_count(&pvec) - 1] + 1;
-		truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
+		truncate_exceptional_pvec_entries(mapping, &pvec, indices);
 		for (i = 0; i < pagevec_count(&pvec); i++)
 			truncate_cleanup_page(mapping, pvec.pages[i]);
 		delete_from_page_cache_batch(mapping, &pvec);
@@ -381,8 +376,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
-		if (!pagevec_lookup_entries(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
+		if (!pagevec_lookup_entries(&pvec, mapping, index, end - 1,
+				PAGEVEC_SIZE, indices)) {
 			/* If all gone from start onwards, we're done */
 			if (index == start)
 				break;
@@ -390,23 +385,12 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			index = start;
 			continue;
 		}
-		if (index == start && indices[0] >= end) {
-			/* All gone out of hole to be punched, we're done */
-			pagevec_remove_exceptionals(&pvec);
-			pagevec_release(&pvec);
-			break;
-		}
 
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
 			/* We rely upon deletion not changing page->index */
 			index = indices[i];
-			if (index >= end) {
-				/* Restart punch to make sure all gone */
-				index = start - 1;
-				break;
-			}
 
 			if (xa_is_value(page))
 				continue;
@@ -417,7 +401,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			truncate_inode_page(mapping, page);
 			unlock_page(page);
 		}
-		truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
+		truncate_exceptional_pvec_entries(mapping, &pvec, indices);
 		pagevec_release(&pvec);
 		index++;
 	}
@@ -513,8 +497,6 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
 
 			/* We rely upon deletion not changing page->index */
 			index = indices[i];
-			if (index > end)
-				break;
 
 			if (xa_is_value(page)) {
 				invalidate_exceptional_entry(mapping, index,
@@ -656,16 +638,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 
 	pagevec_init(&pvec);
 	index = start;
-	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
-			indices)) {
+	while (pagevec_lookup_entries(&pvec, mapping, index, end,
+			PAGEVEC_SIZE, indices)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
 			/* We rely upon deletion not changing page->index */
 			index = indices[i];
-			if (index > end)
-				break;
 
 			if (xa_is_value(page)) {
 				if (!invalidate_exceptional_entry2(mapping,
-- 
cgit v1.2.3


From 38cefeb33749992ceaad6ea40e12f92aa8f8e28f Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:16:07 -0800
Subject: mm: remove nr_entries parameter from pagevec_lookup_entries

All callers want to fetch the full size of the pvec.

Link: https://lkml.kernel.org/r/20201112212641.27837-13-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h | 2 +-
 mm/swap.c               | 4 ++--
 mm/truncate.c           | 5 ++---
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index f70a9dc81504..72c5ea2e708d 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -27,7 +27,7 @@ void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
 unsigned pagevec_lookup_entries(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t start, pgoff_t end,
-		unsigned nr_entries, pgoff_t *indices);
+		pgoff_t *indices);
 void pagevec_remove_exceptionals(struct pagevec *pvec);
 unsigned pagevec_lookup_range(struct pagevec *pvec,
 			      struct address_space *mapping,
diff --git a/mm/swap.c b/mm/swap.c
index db8c354264a5..cd9e1ed7e78f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1044,9 +1044,9 @@ void __pagevec_lru_add(struct pagevec *pvec)
  */
 unsigned pagevec_lookup_entries(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t start, pgoff_t end,
-		unsigned nr_entries, pgoff_t *indices)
+		pgoff_t *indices)
 {
-	pvec->nr = find_get_entries(mapping, start, end, nr_entries,
+	pvec->nr = find_get_entries(mapping, start, end, PAGEVEC_SIZE,
 				    pvec->pages, indices);
 	return pagevec_count(pvec);
 }
diff --git a/mm/truncate.c b/mm/truncate.c
index 60df23890c2d..41e7377ad58d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -377,7 +377,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	for ( ; ; ) {
 		cond_resched();
 		if (!pagevec_lookup_entries(&pvec, mapping, index, end - 1,
-				PAGEVEC_SIZE, indices)) {
+				indices)) {
 			/* If all gone from start onwards, we're done */
 			if (index == start)
 				break;
@@ -638,8 +638,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 
 	pagevec_init(&pvec);
 	index = start;
-	while (pagevec_lookup_entries(&pvec, mapping, index, end,
-			PAGEVEC_SIZE, indices)) {
+	while (pagevec_lookup_entries(&pvec, mapping, index, end, indices)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
-- 
cgit v1.2.3


From cf2039af1a2eee58fdbfa68bc0c9123e77477645 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:16:11 -0800
Subject: mm: pass pvec directly to find_get_entries

All callers of find_get_entries() use a pvec, so pass it directly instead
of manipulating it in the caller.

Link: https://lkml.kernel.org/r/20201112212641.27837-14-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h |  3 +--
 mm/filemap.c            | 21 +++++++++------------
 mm/shmem.c              |  5 ++---
 mm/swap.c               |  4 +---
 4 files changed, 13 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index fdb2c4e44851..20225b067583 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -451,8 +451,7 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
 }
 
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
-		pgoff_t end, unsigned int nr_entries, struct page **entries,
-		pgoff_t *indices);
+		pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
 unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
 			pgoff_t end, unsigned int nr_pages,
 			struct page **pages);
diff --git a/mm/filemap.c b/mm/filemap.c
index 65cfdff17ac6..43700480d897 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1866,14 +1866,12 @@ reset:
  * @mapping:	The address_space to search
  * @start:	The starting page cache index
  * @end:	The final page index (inclusive).
- * @nr_entries:	The maximum number of entries
- * @entries:	Where the resulting entries are placed
+ * @pvec:	Where the resulting entries are placed.
  * @indices:	The cache indices corresponding to the entries in @entries
  *
- * find_get_entries() will search for and return a group of up to
- * @nr_entries entries in the mapping.  The entries are placed at
- * @entries.  find_get_entries() takes a reference against any actual
- * pages it returns.
+ * find_get_entries() will search for and return a batch of entries in
+ * the mapping.  The entries are placed in @pvec.  find_get_entries()
+ * takes a reference on any actual pages it returns.
  *
  * The search returns a group of mapping-contiguous page cache entries
  * with ascending indexes.  There may be holes in the indices due to
@@ -1890,15 +1888,12 @@ reset:
  * Return: the number of pages and shadow entries which were found.
  */
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
-		pgoff_t end, unsigned int nr_entries, struct page **entries,
-		pgoff_t *indices)
+		pgoff_t end, struct pagevec *pvec, pgoff_t *indices)
 {
 	XA_STATE(xas, &mapping->i_pages, start);
 	struct page *page;
 	unsigned int ret = 0;
-
-	if (!nr_entries)
-		return 0;
+	unsigned nr_entries = PAGEVEC_SIZE;
 
 	rcu_read_lock();
 	while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
@@ -1913,11 +1908,13 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
 		}
 
 		indices[ret] = xas.xa_index;
-		entries[ret] = page;
+		pvec->pages[ret] = page;
 		if (++ret == nr_entries)
 			break;
 	}
 	rcu_read_unlock();
+
+	pvec->nr = ret;
 	return ret;
 }
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 4aac760aa2d4..ee8f21832f98 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -965,9 +965,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 	while (index < end) {
 		cond_resched();
 
-		pvec.nr = find_get_entries(mapping, index, end - 1,
-				PAGEVEC_SIZE, pvec.pages, indices);
-		if (!pvec.nr) {
+		if (!find_get_entries(mapping, index, end - 1, &pvec,
+				indices)) {
 			/* If all gone or hole-punch or unfalloc, we're done */
 			if (index == start || end != -1)
 				break;
diff --git a/mm/swap.c b/mm/swap.c
index cd9e1ed7e78f..d20a746a831e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1046,9 +1046,7 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t start, pgoff_t end,
 		pgoff_t *indices)
 {
-	pvec->nr = find_get_entries(mapping, start, end, PAGEVEC_SIZE,
-				    pvec->pages, indices);
-	return pagevec_count(pvec);
+	return find_get_entries(mapping, start, end, pvec, indices);
 }
 
 /**
-- 
cgit v1.2.3


From a656a20241f08be532539c7d5bd82df741c2d487 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:16:14 -0800
Subject: mm: remove pagevec_lookup_entries

pagevec_lookup_entries() is now just a wrapper around find_get_entries()
so remove it and convert all its callers.

Link: https://lkml.kernel.org/r/20201112212641.27837-15-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h |  3 ---
 mm/swap.c               | 36 ++----------------------------------
 mm/truncate.c           |  4 ++--
 3 files changed, 4 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 72c5ea2e708d..7f3f19065a9f 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -25,9 +25,6 @@ struct pagevec {
 
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
-unsigned pagevec_lookup_entries(struct pagevec *pvec,
-		struct address_space *mapping, pgoff_t start, pgoff_t end,
-		pgoff_t *indices);
 void pagevec_remove_exceptionals(struct pagevec *pvec);
 unsigned pagevec_lookup_range(struct pagevec *pvec,
 			      struct address_space *mapping,
diff --git a/mm/swap.c b/mm/swap.c
index d20a746a831e..31b844d4ed94 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1017,44 +1017,12 @@ void __pagevec_lru_add(struct pagevec *pvec)
 	pagevec_reinit(pvec);
 }
 
-/**
- * pagevec_lookup_entries - gang pagecache lookup
- * @pvec:	Where the resulting entries are placed
- * @mapping:	The address_space to search
- * @start:	The starting entry index
- * @end:	The highest index to return (inclusive).
- * @nr_entries:	The maximum number of pages
- * @indices:	The cache indices corresponding to the entries in @pvec
- *
- * pagevec_lookup_entries() will search for and return a group of up
- * to @nr_pages pages and shadow entries in the mapping.  All
- * entries are placed in @pvec.  pagevec_lookup_entries() takes a
- * reference against actual pages in @pvec.
- *
- * The search returns a group of mapping-contiguous entries with
- * ascending indexes.  There may be holes in the indices due to
- * not-present entries.
- *
- * Only one subpage of a Transparent Huge Page is returned in one call:
- * allowing truncate_inode_pages_range() to evict the whole THP without
- * cycling through a pagevec of extra references.
- *
- * pagevec_lookup_entries() returns the number of entries which were
- * found.
- */
-unsigned pagevec_lookup_entries(struct pagevec *pvec,
-		struct address_space *mapping, pgoff_t start, pgoff_t end,
-		pgoff_t *indices)
-{
-	return find_get_entries(mapping, start, end, pvec, indices);
-}
-
 /**
  * pagevec_remove_exceptionals - pagevec exceptionals pruning
  * @pvec:	The pagevec to prune
  *
- * pagevec_lookup_entries() fills both pages and exceptional radix
- * tree entries into the pagevec.  This function prunes all
+ * find_get_entries() fills both pages and XArray value entries (aka
+ * exceptional entries) into the pagevec.  This function prunes all
  * exceptionals from @pvec without leaving holes, so that it can be
  * passed on to page-only pagevec operations.
  */
diff --git a/mm/truncate.c b/mm/truncate.c
index 41e7377ad58d..455944264663 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -376,7 +376,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
-		if (!pagevec_lookup_entries(&pvec, mapping, index, end - 1,
+		if (!find_get_entries(mapping, index, end - 1, &pvec,
 				indices)) {
 			/* If all gone from start onwards, we're done */
 			if (index == start)
@@ -638,7 +638,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 
 	pagevec_init(&pvec);
 	index = start;
-	while (pagevec_lookup_entries(&pvec, mapping, index, end, indices)) {
+	while (find_get_entries(mapping, index, end, &pvec, indices)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
-- 
cgit v1.2.3


From 164cc4fef4456727466f8e35bb654c3994748070 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 25 Feb 2021 17:16:18 -0800
Subject: mm,thp,shmem: limit shmem THP alloc gfp_mask

Patch series "mm,thp,shm: limit shmem THP alloc gfp_mask", v6.

The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
help the system from getting bogged down in the page reclaim and
compaction code when many THPs are getting allocated simultaneously.

However, the gfp_mask for shmem THP allocations were not limited by those
configuration settings, and some workloads ended up with all CPUs stuck on
the LRU lock in the page reclaim code, trying to allocate dozens of THPs
simultaneously.

This patch applies the same configurated limitation of THPs to shmem
hugepage allocations, to prevent that from happening.

This way a THP defrag setting of "never" or "defer+madvise" will result in
quick allocation failures without direct reclaim when no 2MB free pages
are available.

With this patch applied, THP allocations for tmpfs will be a little more
aggressive than today for files mmapped with MADV_HUGEPAGE, and a little
less aggressive for files that are not mmapped or mapped without that
flag.

This patch (of 4):

The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
help the system from getting bogged down in the page reclaim and
compaction code when many THPs are getting allocated simultaneously.

However, the gfp_mask for shmem THP allocations were not limited by those
configuration settings, and some workloads ended up with all CPUs stuck on
the LRU lock in the page reclaim code, trying to allocate dozens of THPs
simultaneously.

This patch applies the same configurated limitation of THPs to shmem
hugepage allocations, to prevent that from happening.

Controlling the gfp_mask of THP allocations through the knobs in sysfs
allows users to determine the balance between how aggressively the system
tries to allocate THPs at fault time, and how much the application may end
up stalling attempting those allocations.

This way a THP defrag setting of "never" or "defer+madvise" will result in
quick allocation failures without direct reclaim when no 2MB free pages
are available.

With this patch applied, THP allocations for tmpfs will be a little more
aggressive than today for files mmapped with MADV_HUGEPAGE, and a little
less aggressive for files that are not mmapped or mapped without that
flag.

Link: https://lkml.kernel.org/r/20201124194925.623931-1-riel@surriel.com
Link: https://lkml.kernel.org/r/20201124194925.623931-2-riel@surriel.com
Signed-off-by: Rik van Riel <riel@surriel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Xu Yu <xuyu@linux.alibaba.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 2 ++
 mm/huge_memory.c    | 6 +++---
 mm/shmem.c          | 8 +++++---
 3 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 220cd553a9e7..8572a1474e16 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -634,6 +634,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);
 
+extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
+
 #ifdef CONFIG_PM_SLEEP
 extern bool pm_suspended_storage(void);
 #else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d77605c30f2e..395c75111d33 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -668,9 +668,9 @@ release:
  *	    available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
 {
-	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
 
 	/* Always do synchronous compaction */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
@@ -762,7 +762,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		}
 		return ret;
 	}
-	gfp = alloc_hugepage_direct_gfpmask(vma);
+	gfp = vma_thp_gfp_mask(vma);
 	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
diff --git a/mm/shmem.c b/mm/shmem.c
index ee8f21832f98..596009a44431 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1519,8 +1519,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
 		return NULL;
 
 	shmem_pseudo_vma_init(&pvma, info, hindex);
-	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
+			       true);
 	shmem_pseudo_vma_destroy(&pvma);
 	if (page)
 		prep_transhuge_page(page);
@@ -1776,6 +1776,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct page *page;
 	enum sgp_type sgp_huge = sgp;
 	pgoff_t hindex = index;
+	gfp_t huge_gfp;
 	int error;
 	int once = 0;
 	int alloced = 0;
@@ -1862,7 +1863,8 @@ repeat:
 	}
 
 alloc_huge:
-	page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+	huge_gfp = vma_thp_gfp_mask(vma);
+	page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
 	if (IS_ERR(page)) {
 alloc_nohuge:
 		page = shmem_alloc_and_acct_page(gfp, inode,
-- 
cgit v1.2.3


From cd89fb06509903f942a0ffe97ffa63034671ed0c Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 25 Feb 2021 17:16:25 -0800
Subject: mm,thp,shmem: make khugepaged obey tmpfs mount flags

Currently if thp enabled=[madvise], mounting a tmpfs filesystem with
huge=always and mmapping files from that tmpfs does not result in
khugepaged collapsing those mappings, despite the mount flag indicating
that it should.

Fix that by breaking up the blocks of tests in hugepage_vma_check a little
bit, and testing things in the correct order.

Link: https://lkml.kernel.org/r/20201124194925.623931-4-riel@surriel.com
Fixes: c2231020ea7b ("mm: thp: register mm for khugepaged when merging vma for shmem")
Signed-off-by: Rik van Riel <riel@surriel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xu Yu <xuyu@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/khugepaged.h |  2 ++
 mm/khugepaged.c            | 22 ++++++++++++++++------
 2 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index c941b7377321..2fcc01891b47 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -3,6 +3,7 @@
 #define _LINUX_KHUGEPAGED_H
 
 #include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
+#include <linux/shmem_fs.h>
 
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -57,6 +58,7 @@ static inline int khugepaged_enter(struct vm_area_struct *vma,
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
 		if ((khugepaged_always() ||
+		     (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) ||
 		     (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) &&
 		    !(vm_flags & VM_NOHUGEPAGE) &&
 		    !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 75e246f680f4..a7d6cb912b05 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -442,18 +442,28 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
 static bool hugepage_vma_check(struct vm_area_struct *vma,
 			       unsigned long vm_flags)
 {
-	if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
-	    (vm_flags & VM_NOHUGEPAGE) ||
+	/* Explicitly disabled through madvise. */
+	if ((vm_flags & VM_NOHUGEPAGE) ||
 	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
 		return false;
 
-	if (shmem_file(vma->vm_file) ||
-	    (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
-	     vma->vm_file &&
-	     (vm_flags & VM_DENYWRITE))) {
+	/* Enabled via shmem mount options or sysfs settings. */
+	if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
 		return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
 				HPAGE_PMD_NR);
 	}
+
+	/* THP settings require madvise. */
+	if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+		return false;
+
+	/* Read-only file mappings need to be aligned for THP to work. */
+	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+	    (vm_flags & VM_DENYWRITE)) {
+		return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+				HPAGE_PMD_NR);
+	}
+
 	if (!vma->anon_vma || vma->vm_ops)
 		return false;
 	if (vma_is_temporary_stack(vma))
-- 
cgit v1.2.3


From 3c381db1fac80373f2cc0d8c1d0bcfbf8bd4fb57 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 25 Feb 2021 17:16:40 -0800
Subject: mm/page_alloc: count CMA pages per zone and print them in
 /proc/zoneinfo

Let's count the number of CMA pages per zone and print them in
/proc/zoneinfo.

Having access to the total number of CMA pages per zone is helpful for
debugging purposes to know where exactly the CMA pages ended up, and to
figure out how many pages of a zone might behave differently, even after
some of these pages might already have been allocated.

As one example, CMA pages part of a kernel zone cannot be used for
ordinary kernel allocations but instead behave more like ZONE_MOVABLE.

For now, we are only able to get the global nr+free cma pages from
/proc/meminfo and the free cma pages per zone from /proc/zoneinfo.

Example after this patch when booting a 6 GiB QEMU VM with
"hugetlb_cma=2G":
  # cat /proc/zoneinfo | grep cma
          cma      0
        nr_free_cma  0
          cma      0
        nr_free_cma  0
          cma      524288
        nr_free_cma  493016
          cma      0
          cma      0
  # cat /proc/meminfo | grep Cma
  CmaTotal:        2097152 kB
  CmaFree:         1972064 kB

Note: We print even without CONFIG_CMA, just like "nr_free_cma"; this way,
      one can be sure when spotting "cma 0", that there are definetly no
      CMA pages located in a zone.

[david@redhat.com: v2]
  Link: https://lkml.kernel.org/r/20210128164533.18566-1-david@redhat.com
[david@redhat.com: v3]
  Link: https://lkml.kernel.org/r/20210129113451.22085-1-david@redhat.com

Link: https://lkml.kernel.org/r/20210127101813.6370-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 15 +++++++++++++++
 mm/page_alloc.c        |  1 +
 mm/vmstat.c            |  6 ++++--
 3 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9198b7ade85f..5f9c4dad73ed 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -503,6 +503,9 @@ struct zone {
 	 * bootmem allocator):
 	 *	managed_pages = present_pages - reserved_pages;
 	 *
+	 * cma pages is present pages that are assigned for CMA use
+	 * (MIGRATE_CMA).
+	 *
 	 * So present_pages may be used by memory hotplug or memory power
 	 * management logic to figure out unmanaged pages by checking
 	 * (present_pages - managed_pages). And managed_pages should be used
@@ -527,6 +530,9 @@ struct zone {
 	atomic_long_t		managed_pages;
 	unsigned long		spanned_pages;
 	unsigned long		present_pages;
+#ifdef CONFIG_CMA
+	unsigned long		cma_pages;
+#endif
 
 	const char		*name;
 
@@ -624,6 +630,15 @@ static inline unsigned long zone_managed_pages(struct zone *zone)
 	return (unsigned long)atomic_long_read(&zone->managed_pages);
 }
 
+static inline unsigned long zone_cma_pages(struct zone *zone)
+{
+#ifdef CONFIG_CMA
+	return zone->cma_pages;
+#else
+	return 0;
+#endif
+}
+
 static inline unsigned long zone_end_pfn(const struct zone *zone)
 {
 	return zone->zone_start_pfn + zone->spanned_pages;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ddccc59f2f72..3e4b29ee2b1e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2168,6 +2168,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
 	}
 
 	adjust_managed_page_count(page, pageblock_nr_pages);
+	page_zone(page)->cma_pages += pageblock_nr_pages;
 }
 #endif
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a0e949542204..6cdf789ced5e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1637,14 +1637,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n        high     %lu"
 		   "\n        spanned  %lu"
 		   "\n        present  %lu"
-		   "\n        managed  %lu",
+		   "\n        managed  %lu"
+		   "\n        cma      %lu",
 		   zone_page_state(zone, NR_FREE_PAGES),
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
 		   zone->spanned_pages,
 		   zone->present_pages,
-		   zone_managed_pages(zone));
+		   zone_managed_pages(zone),
+		   zone_cma_pages(zone));
 
 	seq_printf(m,
 		   "\n        protection: (%ld",
-- 
cgit v1.2.3


From 629484ae73754243917e06d8d5e5f37c26e99399 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 25 Feb 2021 17:16:51 -0800
Subject: mm: vmstat: add some comments on internal storage of byte items

Byte-accounted items are used for slab object accounting at the cgroup
level, because the objects in a slab page can belong to different cgroups.
At the global level these items always change in multiples of whole slab
pages.  The vmstat code exploits this and stores these items as pages
internally, which allows for more compact per-cpu data.

This optimization isn't self-evident from the asserts and the division in
the stat update functions.  Provide the reader with some context.

Link: https://lkml.kernel.org/r/20210202184411.118614-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h |  6 ++++++
 mm/vmstat.c            | 12 ++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 773135fc6e19..506d625163a1 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -313,6 +313,12 @@ static inline void __mod_node_page_state(struct pglist_data *pgdat,
 			enum node_stat_item item, int delta)
 {
 	if (vmstat_item_in_bytes(item)) {
+		/*
+		 * Only cgroups use subpage accounting right now; at
+		 * the global level, these items still change in
+		 * multiples of whole pages. Store them as pages
+		 * internally to keep the per-cpu counters compact.
+		 */
 		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
 		delta >>= PAGE_SHIFT;
 	}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0b0fc3b77789..e60b36f5f0a9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -342,6 +342,12 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
 	long t;
 
 	if (vmstat_item_in_bytes(item)) {
+		/*
+		 * Only cgroups use subpage accounting right now; at
+		 * the global level, these items still change in
+		 * multiples of whole pages. Store them as pages
+		 * internally to keep the per-cpu counters compact.
+		 */
 		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
 		delta >>= PAGE_SHIFT;
 	}
@@ -551,6 +557,12 @@ static inline void mod_node_state(struct pglist_data *pgdat,
 	long o, n, t, z;
 
 	if (vmstat_item_in_bytes(item)) {
+		/*
+		 * Only cgroups use subpage accounting right now; at
+		 * the global level, these items still change in
+		 * multiples of whole pages. Store them as pages
+		 * internally to keep the per-cpu counters compact.
+		 */
 		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
 		delta >>= PAGE_SHIFT;
 	}
-- 
cgit v1.2.3


From 9f605f260594f99b950062fd62244251e85dbd2b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 25 Feb 2021 17:16:57 -0800
Subject: mm: move pfn_to_online_page() out of line

Patch series "mm: Fix pfn_to_online_page() with respect to ZONE_DEVICE", v4.

A pfn-walker that uses pfn_to_online_page() may inadvertently translate a
pfn as online and in the page allocator, when it is offline managed by a
ZONE_DEVICE mapping (details in Patch 3: ("mm: Teach pfn_to_online_page()
about ZONE_DEVICE section collisions")).

The 2 proposals under consideration are teach pfn_to_online_page() to be
precise in the presence of mixed-zone sections, or teach the memory-add
code to drop the System RAM associated with ZONE_DEVICE collisions.  In
order to not regress memory capacity by a few 10s to 100s of MiB the
approach taken in this set is to add precision to pfn_to_online_page().

In the course of validating pfn_to_online_page() a couple other fixes
fell out:

1/ soft_offline_page() fails to drop the reference taken in the
   madvise(..., MADV_SOFT_OFFLINE) case.

2/ memory_failure() uses get_dev_pagemap() to lookup ZONE_DEVICE pages,
   however that mapping may contain data pages and metadata raw pfns.
   Introduce pgmap_pfn_valid() to delineate the 2 types and fail the
   handling of raw metadata pfns.

This patch (of 4);

pfn_to_online_page() is already too large to be a macro or an inline
function.  In anticipation of further logic changes / growth, move it out
of line.

No functional change, just code movement.

Link: https://lkml.kernel.org/r/161058499000.1840162.702316708443239771.stgit@dwillia2-desk3.amr.corp.intel.com
Link: https://lkml.kernel.org/r/161058499608.1840162.10165648147615238793.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Michal Hocko <mhocko@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 17 +----------------
 mm/memory_hotplug.c            | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 15acce5ab106..3d99de0db2dd 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -16,22 +16,7 @@ struct resource;
 struct vmem_altmap;
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-/*
- * Return page for the valid pfn only if the page is online. All pfn
- * walkers which rely on the fully initialized page->flags and others
- * should use this rather than pfn_valid && pfn_to_page
- */
-#define pfn_to_online_page(pfn)					   \
-({								   \
-	struct page *___page = NULL;				   \
-	unsigned long ___pfn = pfn;				   \
-	unsigned long ___nr = pfn_to_section_nr(___pfn);	   \
-								   \
-	if (___nr < NR_MEM_SECTIONS && online_section_nr(___nr) && \
-	    pfn_valid_within(___pfn))				   \
-		___page = pfn_to_page(___pfn);			   \
-	___page;						   \
-})
+struct page *pfn_to_online_page(unsigned long pfn);
 
 /*
  * Types for free bootmem stored in page->lru.next. These have to be in
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index abe43c1ae920..fc6cdd99941b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -300,6 +300,22 @@ static int check_hotplug_memory_addressable(unsigned long pfn,
 	return 0;
 }
 
+/*
+ * Return page for the valid pfn only if the page is online. All pfn
+ * walkers which rely on the fully initialized page->flags and others
+ * should use this rather than pfn_valid && pfn_to_page
+ */
+struct page *pfn_to_online_page(unsigned long pfn)
+{
+	unsigned long nr = pfn_to_section_nr(pfn);
+
+	if (nr < NR_MEM_SECTIONS && online_section_nr(nr) &&
+	    pfn_valid_within(pfn))
+		return pfn_to_page(pfn);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(pfn_to_online_page);
+
 /*
  * Reasonably generic function for adding memory.  It is
  * expected that archs that support memory hotplug will
-- 
cgit v1.2.3


From 1f90a3477df3ff1a91e064af554cdc887c8f9e5e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 25 Feb 2021 17:17:05 -0800
Subject: mm: teach pfn_to_online_page() about ZONE_DEVICE section collisions

While pfn_to_online_page() is able to determine pfn_valid() at subsection
granularity it is not able to reliably determine if a given pfn is also
online if the section is mixes ZONE_{NORMAL,MOVABLE} with ZONE_DEVICE.
This means that pfn_to_online_page() may return invalid @page objects.
For example with a memory map like:

100000000-1fbffffff : System RAM
  142000000-143002e16 : Kernel code
  143200000-143713fff : Kernel rodata
  143800000-143b15b7f : Kernel data
  144227000-144ffffff : Kernel bss
1fc000000-2fbffffff : Persistent Memory (legacy)
  1fc000000-2fbffffff : namespace0.0

This command:

echo 0x1fc000000 > /sys/devices/system/memory/soft_offline_page

...succeeds when it should fail.  When it succeeds it touches an
uninitialized page and may crash or cause other damage (see
dissolve_free_huge_page()).

While the memory map above is contrived via the memmap=ss!nn kernel
command line option, the collision happens in practice on shipping
platforms.  The memory controller resources that decode spans of physical
address space are a limited resource.  One technique platform-firmware
uses to conserve those resources is to share a decoder across 2 devices to
keep the address range contiguous.  Unfortunately the unit of operation of
a decoder is 64MiB while the Linux section size is 128MiB.  This results
in situations where, without subsection hotplug memory mappings with
different lifetimes collide into one object that can only express one
lifetime.

Update move_pfn_range_to_zone() to flag (SECTION_TAINT_ZONE_DEVICE) a
section that mixes ZONE_DEVICE pfns with other online pfns.  With
SECTION_TAINT_ZONE_DEVICE to delineate, pfn_to_online_page() can fall back
to a slow-path check for ZONE_DEVICE pfns in an online section.  In the
fast path online_section() for a full ZONE_DEVICE section returns false.

Because the collision case is rare, and for simplicity, the
SECTION_TAINT_ZONE_DEVICE flag is never cleared once set.

[dan.j.williams@intel.com: fix CONFIG_ZONE_DEVICE=n build]
  Link: https://lkml.kernel.org/r/CAPcyv4iX+7LAgAeSqx7Zw-Zd=ZV9gBv8Bo7oTbwCOOqJoZ3+Yg@mail.gmail.com

Link: https://lkml.kernel.org/r/161058500675.1840162.7887862152161279354.stgit@dwillia2-desk3.amr.corp.intel.com
Fixes: ba72b4c8cf60 ("mm/sparsemem: support sub-section hotplug")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reported-by: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 34 +++++++++++++++++++++++++++-------
 mm/memory_hotplug.c    | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5f9c4dad73ed..47946cec7584 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -918,6 +918,18 @@ static inline int local_memory_node(int node_id) { return node_id; };
  */
 #define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)
 
+#ifdef CONFIG_ZONE_DEVICE
+static inline bool zone_is_zone_device(struct zone *zone)
+{
+	return zone_idx(zone) == ZONE_DEVICE;
+}
+#else
+static inline bool zone_is_zone_device(struct zone *zone)
+{
+	return false;
+}
+#endif
+
 /*
  * Returns true if a zone has pages managed by the buddy allocator.
  * All the reclaim decisions have to use this function rather than
@@ -1306,13 +1318,14 @@ extern size_t mem_section_usage_size(void);
  *      which results in PFN_SECTION_SHIFT equal 6.
  * To sum it up, at least 6 bits are available.
  */
-#define	SECTION_MARKED_PRESENT	(1UL<<0)
-#define SECTION_HAS_MEM_MAP	(1UL<<1)
-#define SECTION_IS_ONLINE	(1UL<<2)
-#define SECTION_IS_EARLY	(1UL<<3)
-#define SECTION_MAP_LAST_BIT	(1UL<<4)
-#define SECTION_MAP_MASK	(~(SECTION_MAP_LAST_BIT-1))
-#define SECTION_NID_SHIFT	3
+#define SECTION_MARKED_PRESENT		(1UL<<0)
+#define SECTION_HAS_MEM_MAP		(1UL<<1)
+#define SECTION_IS_ONLINE		(1UL<<2)
+#define SECTION_IS_EARLY		(1UL<<3)
+#define SECTION_TAINT_ZONE_DEVICE	(1UL<<4)
+#define SECTION_MAP_LAST_BIT		(1UL<<5)
+#define SECTION_MAP_MASK		(~(SECTION_MAP_LAST_BIT-1))
+#define SECTION_NID_SHIFT		3
 
 static inline struct page *__section_mem_map_addr(struct mem_section *section)
 {
@@ -1351,6 +1364,13 @@ static inline int online_section(struct mem_section *section)
 	return (section && (section->section_mem_map & SECTION_IS_ONLINE));
 }
 
+static inline int online_device_section(struct mem_section *section)
+{
+	unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
+
+	return section && ((section->section_mem_map & flags) == flags);
+}
+
 static inline int online_section_nr(unsigned long nr)
 {
 	return online_section(__nr_to_section(nr));
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 02378f11e2d6..3af4d3851d1a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -308,6 +308,7 @@ static int check_hotplug_memory_addressable(unsigned long pfn,
 struct page *pfn_to_online_page(unsigned long pfn)
 {
 	unsigned long nr = pfn_to_section_nr(pfn);
+	struct dev_pagemap *pgmap;
 	struct mem_section *ms;
 
 	if (nr >= NR_MEM_SECTIONS)
@@ -327,6 +328,22 @@ struct page *pfn_to_online_page(unsigned long pfn)
 	if (!pfn_section_valid(ms, pfn))
 		return NULL;
 
+	if (!online_device_section(ms))
+		return pfn_to_page(pfn);
+
+	/*
+	 * Slowpath: when ZONE_DEVICE collides with
+	 * ZONE_{NORMAL,MOVABLE} within the same section some pfns in
+	 * the section may be 'offline' but 'valid'. Only
+	 * get_dev_pagemap() can determine sub-section online status.
+	 */
+	pgmap = get_dev_pagemap(pfn, NULL);
+	put_dev_pagemap(pgmap);
+
+	/* The presence of a pgmap indicates ZONE_DEVICE offline pfn */
+	if (pgmap)
+		return NULL;
+
 	return pfn_to_page(pfn);
 }
 EXPORT_SYMBOL_GPL(pfn_to_online_page);
@@ -709,6 +726,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
 	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
 
 }
+
+static void section_taint_zone_device(unsigned long pfn)
+{
+	struct mem_section *ms = __pfn_to_section(pfn);
+
+	ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
+}
+
 /*
  * Associate the pfn range with the given zone, initializing the memmaps
  * and resizing the pgdat/zone data to span the added pages. After this
@@ -738,6 +763,19 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 	resize_pgdat_range(pgdat, start_pfn, nr_pages);
 	pgdat_resize_unlock(pgdat, &flags);
 
+	/*
+	 * Subsection population requires care in pfn_to_online_page().
+	 * Set the taint to enable the slow path detection of
+	 * ZONE_DEVICE pages in an otherwise  ZONE_{NORMAL,MOVABLE}
+	 * section.
+	 */
+	if (zone_is_zone_device(zone)) {
+		if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
+			section_taint_zone_device(start_pfn);
+		if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))
+			section_taint_zone_device(start_pfn + nr_pages);
+	}
+
 	/*
 	 * TODO now we have a visible range of pages which are not associated
 	 * with their zone properly. Not nice but set_pfnblock_flags_mask
-- 
cgit v1.2.3


From 34dc45be4563f344d59ba0428416d0d265aa4f4d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 25 Feb 2021 17:17:08 -0800
Subject: mm: fix memory_failure() handling of dax-namespace metadata

Given 'struct dev_pagemap' spans both data pages and metadata pages be
careful to consult the altmap if present to delineate metadata.  In fact
the pfn_first() helper already identifies the first valid data pfn, so
export that helper for other code paths via pgmap_pfn_valid().

Other usage of get_dev_pagemap() are not a concern because those are
operating on known data pfns having been looked up by get_user_pages().
I.e.  metadata pfns are never user mapped.

Link: https://lkml.kernel.org/r/161058501758.1840162.4239831989762604527.stgit@dwillia2-desk3.amr.corp.intel.com
Fixes: 6100e34b2526 ("mm, memory_failure: Teach memory_failure() about dev_pagemap pages")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memremap.h |  6 ++++++
 mm/memory-failure.c      |  6 ++++++
 mm/memremap.c            | 15 +++++++++++++++
 3 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 79c49e7f5c30..f5b464daeeca 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -137,6 +137,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
 void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
 struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 		struct dev_pagemap *pgmap);
+bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
 
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
@@ -165,6 +166,11 @@ static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 	return NULL;
 }
 
+static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
+{
+	return false;
+}
+
 static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
 {
 	return 0;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 55c671904aac..24210c9bd843 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1312,6 +1312,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 		 */
 		put_page(page);
 
+	/* device metadata space is not recoverable */
+	if (!pgmap_pfn_valid(pgmap, pfn)) {
+		rc = -ENXIO;
+		goto out;
+	}
+
 	/*
 	 * Prevent the inode from being freed while we are interrogating
 	 * the address_space, typically this would be handled by
diff --git a/mm/memremap.c b/mm/memremap.c
index 16b2fb482da1..2455bac89506 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -80,6 +80,21 @@ static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id)
 	return pfn + vmem_altmap_offset(pgmap_altmap(pgmap));
 }
 
+bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
+{
+	int i;
+
+	for (i = 0; i < pgmap->nr_range; i++) {
+		struct range *range = &pgmap->ranges[i];
+
+		if (pfn >= PHYS_PFN(range->start) &&
+		    pfn <= PHYS_PFN(range->end))
+			return pfn >= pfn_first(pgmap, i);
+	}
+
+	return false;
+}
+
 static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
 {
 	const struct range *range = &pgmap->ranges[range_id];
-- 
cgit v1.2.3


From 1adf8b468ff6bc64ba01ce3848da4bcf409215b4 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Thu, 25 Feb 2021 17:17:13 -0800
Subject: mm/memory_hotplug: rename all existing 'memhp' into 'mhp'

This renames all 'memhp' instances to 'mhp' except for memhp_default_state
for being a kernel command line option.  This is just a clean up and
should not cause a functional change.  Let's make it consistent rater than
mixing the two prefixes.  In preparation for more users of the 'mhp'
terminology.

Link: https://lkml.kernel.org/r/1611554093-27316-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c          | 10 +++++-----
 include/linux/memory_hotplug.h |  4 ++--
 mm/memory_hotplug.c            | 12 ++++++------
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index eef4ffb6122c..901e379676be 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -35,7 +35,7 @@ static const char *const online_type_to_str[] = {
 	[MMOP_ONLINE_MOVABLE] = "online_movable",
 };
 
-int memhp_online_type_from_str(const char *str)
+int mhp_online_type_from_str(const char *str)
 {
 	int i;
 
@@ -253,7 +253,7 @@ static int memory_subsys_offline(struct device *dev)
 static ssize_t state_store(struct device *dev, struct device_attribute *attr,
 			   const char *buf, size_t count)
 {
-	const int online_type = memhp_online_type_from_str(buf);
+	const int online_type = mhp_online_type_from_str(buf);
 	struct memory_block *mem = to_memory_block(dev);
 	int ret;
 
@@ -387,19 +387,19 @@ static ssize_t auto_online_blocks_show(struct device *dev,
 				       struct device_attribute *attr, char *buf)
 {
 	return sysfs_emit(buf, "%s\n",
-			  online_type_to_str[memhp_default_online_type]);
+			  online_type_to_str[mhp_default_online_type]);
 }
 
 static ssize_t auto_online_blocks_store(struct device *dev,
 					struct device_attribute *attr,
 					const char *buf, size_t count)
 {
-	const int online_type = memhp_online_type_from_str(buf);
+	const int online_type = mhp_online_type_from_str(buf);
 
 	if (online_type < 0)
 		return -EINVAL;
 
-	memhp_default_online_type = online_type;
+	mhp_default_online_type = online_type;
 	return count;
 }
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 3d99de0db2dd..ca5e8d137726 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -116,10 +116,10 @@ extern int arch_add_memory(int nid, u64 start, u64 size,
 			   struct mhp_params *params);
 extern u64 max_mem_size;
 
-extern int memhp_online_type_from_str(const char *str);
+extern int mhp_online_type_from_str(const char *str);
 
 /* Default online_type (MMOP_*) when new memory blocks are added. */
-extern int memhp_default_online_type;
+extern int mhp_default_online_type;
 /* If movable_node boot option specified */
 extern bool movable_node_enabled;
 static inline bool movable_node_is_enabled(void)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3af4d3851d1a..ac1c686a5989 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,17 +67,17 @@ void put_online_mems(void)
 bool movable_node_enabled = false;
 
 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-int memhp_default_online_type = MMOP_OFFLINE;
+int mhp_default_online_type = MMOP_OFFLINE;
 #else
-int memhp_default_online_type = MMOP_ONLINE;
+int mhp_default_online_type = MMOP_ONLINE;
 #endif
 
 static int __init setup_memhp_default_state(char *str)
 {
-	const int online_type = memhp_online_type_from_str(str);
+	const int online_type = mhp_online_type_from_str(str);
 
 	if (online_type >= 0)
-		memhp_default_online_type = online_type;
+		mhp_default_online_type = online_type;
 
 	return 1;
 }
@@ -1076,7 +1076,7 @@ static int check_hotplug_memory_range(u64 start, u64 size)
 
 static int online_memory_block(struct memory_block *mem, void *arg)
 {
-	mem->online_type = memhp_default_online_type;
+	mem->online_type = mhp_default_online_type;
 	return device_online(&mem->dev);
 }
 
@@ -1157,7 +1157,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		merge_system_ram_resource(res);
 
 	/* online pages if requested */
-	if (memhp_default_online_type != MMOP_OFFLINE)
+	if (mhp_default_online_type != MMOP_OFFLINE)
 		walk_memory_blocks(start, size, NULL, online_memory_block);
 
 	return ret;
-- 
cgit v1.2.3


From 26011267e1a7ddaab50b5f81b402ca3e7fc2887c Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 25 Feb 2021 17:17:17 -0800
Subject: mm/memory_hotplug: MEMHP_MERGE_RESOURCE -> MHP_MERGE_RESOURCE

Let's make "MEMHP_MERGE_RESOURCE" consistent with "MHP_NONE", "mhp_t" and
"mhp_flags".  As discussed recently [1], "mhp" is our internal acronym for
memory hotplug now.

[1] https://lore.kernel.org/linux-mm/c37de2d0-28a1-4f7d-f944-cfd7d81c334d@redhat.com/

Link: https://lkml.kernel.org/r/20210126115829.10909-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Wei Liu <wei.liu@kernel.org>
Reviewed-by: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/hv/hv_balloon.c        | 2 +-
 drivers/virtio/virtio_mem.c    | 2 +-
 drivers/xen/balloon.c          | 2 +-
 include/linux/memory_hotplug.h | 2 +-
 mm/memory_hotplug.c            | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 8c471823a5af..2f776d78e3c1 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -726,7 +726,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
 
 		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
 		ret = add_memory(nid, PFN_PHYS((start_pfn)),
-				(HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE);
+				(HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE);
 
 		if (ret) {
 			pr_err("hot_add memory failed error is %d\n", ret);
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 9fc9ec4a25f5..d44e43869f17 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -623,7 +623,7 @@ static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
 	/* Memory might get onlined immediately. */
 	atomic64_add(size, &vm->offline_size);
 	rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
-				       MEMHP_MERGE_RESOURCE);
+				       MHP_MERGE_RESOURCE);
 	if (rc) {
 		atomic64_sub(size, &vm->offline_size);
 		dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index b57b2067ecbf..671c71245a7b 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -331,7 +331,7 @@ static enum bp_state reserve_additional_memory(void)
 	mutex_unlock(&balloon_mutex);
 	/* add_memory_resource() requires the device_hotplug lock */
 	lock_device_hotplug();
-	rc = add_memory_resource(nid, resource, MEMHP_MERGE_RESOURCE);
+	rc = add_memory_resource(nid, resource, MHP_MERGE_RESOURCE);
 	unlock_device_hotplug();
 	mutex_lock(&balloon_mutex);
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index ca5e8d137726..08eeef679ab7 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -53,7 +53,7 @@ typedef int __bitwise mhp_t;
  * with this flag set, the resource pointer must no longer be used as it
  * might be stale, or the resource might have changed.
  */
-#define MEMHP_MERGE_RESOURCE	((__force mhp_t)BIT(0))
+#define MHP_MERGE_RESOURCE	((__force mhp_t)BIT(0))
 
 /*
  * Extended parameters for memory hotplug:
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ac1c686a5989..6a02c3f42717 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1153,7 +1153,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	 * In case we're allowed to merge the resource, flag it and trigger
 	 * merging now that adding succeeded.
 	 */
-	if (mhp_flags & MEMHP_MERGE_RESOURCE)
+	if (mhp_flags & MHP_MERGE_RESOURCE)
 		merge_system_ram_resource(res);
 
 	/* online pages if requested */
-- 
cgit v1.2.3


From e9a2e48e8704c9d20a625c6f2357147d03ea7b97 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 25 Feb 2021 17:17:24 -0800
Subject: drivers/base/memory: don't store phys_device in memory blocks

No need to store the value for each and every memory block, as we can
easily query the value at runtime.  Reshuffle the members to optimize the
memory layout.  Also, let's clarify what the interface once was used for
and why it's legacy nowadays.

"phys_device" was used on s390x in older versions of lsmem[2]/chmem[3],
back when they were still part of s390x-tools.  They were later replaced
by the variants in linux-utils.  For example, RHEL6 and RHEL7 contain
lsmem/chmem from s390-utils.  RHEL8 switched to versions from util-linux
on s390x [4].

"phys_device" was added with sysfs support for memory hotplug in commit
3947be1969a9 ("[PATCH] memory hotplug: sysfs and add/remove functions") in
2005.  It always returned 0.

s390x started returning something != 0 on some setups (if sclp.rzm is set
by HW) in 2010 via commit 57b552ba0b2f ("memory hotplug/s390: set
phys_device").

For s390x, it allowed for identifying which memory block devices belong to
the same storage increment (RZM).  Only if all memory block devices
comprising a single storage increment were offline, the memory could
actually be removed in the hypervisor.

Since commit e5d709bb5fb7 ("s390/memory hotplug: provide
memory_block_size_bytes() function") in 2013 a memory block device spans
at least one storage increment - which is why the interface isn't really
helpful/used anymore (except by old lsmem/chmem tools).

There were once RFC patches to make use of "phys_device" in ACPI context;
however, the underlying problem could be solved using different interfaces
[1].

[1] https://patchwork.kernel.org/patch/2163871/
[2] https://github.com/ibm-s390-tools/s390-tools/blob/v2.1.0/zconf/lsmem
[3] https://github.com/ibm-s390-tools/s390-tools/blob/v2.1.0/zconf/chmem
[4] https://bugzilla.redhat.com/show_bug.cgi?id=1504134

Link: https://lkml.kernel.org/r/20210201181347.13262-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Tom Rix <trix@redhat.com>
Cc: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/ABI/testing/sysfs-devices-memory  |  5 +++--
 Documentation/admin-guide/mm/memory-hotplug.rst |  4 ++--
 drivers/base/memory.c                           | 25 +++++++++----------------
 include/linux/memory.h                          |  3 +--
 4 files changed, 15 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory
index 246a45b96d22..58dbc592bc57 100644
--- a/Documentation/ABI/testing/sysfs-devices-memory
+++ b/Documentation/ABI/testing/sysfs-devices-memory
@@ -26,8 +26,9 @@ Date:		September 2008
 Contact:	Badari Pulavarty <pbadari@us.ibm.com>
 Description:
 		The file /sys/devices/system/memory/memoryX/phys_device
-		is read-only and is designed to show the name of physical
-		memory device.  Implementation is currently incomplete.
+		is read-only;  it is a legacy interface only ever used on s390x
+		to expose the covered storage increment.
+Users:		Legacy s390-tools lsmem/chmem
 
 What:		/sys/devices/system/memory/memoryX/phys_index
 Date:		September 2008
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index 5c4432c96c4b..245739f55ac7 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -160,8 +160,8 @@ Under each memory block, you can see 5 files:
 
                     "online_movable", "online", "offline" command
                     which will be performed on all sections in the block.
-``phys_device``     read-only: designed to show the name of physical memory
-                    device.  This is not well implemented now.
+``phys_device``	    read-only: legacy interface only ever used on s390x to
+		    expose the covered storage increment.
 ``removable``       read-only: contains an integer value indicating
                     whether the memory block is removable or not
                     removable.  A value of 1 indicates that the memory
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 901e379676be..f35298425575 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -290,20 +290,20 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr,
 }
 
 /*
- * phys_device is a bad name for this.  What I really want
- * is a way to differentiate between memory ranges that
- * are part of physical devices that constitute
- * a complete removable unit or fru.
- * i.e. do these ranges belong to the same physical device,
- * s.t. if I offline all of these sections I can then
- * remove the physical device?
+ * Legacy interface that we cannot remove: s390x exposes the storage increment
+ * covered by a memory block, allowing for identifying which memory blocks
+ * comprise a storage increment. Since a memory block spans complete
+ * storage increments nowadays, this interface is basically unused. Other
+ * archs never exposed != 0.
  */
 static ssize_t phys_device_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	struct memory_block *mem = to_memory_block(dev);
+	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 
-	return sysfs_emit(buf, "%d\n", mem->phys_device);
+	return sysfs_emit(buf, "%d\n",
+			  arch_get_memory_phys_device(start_pfn));
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
@@ -488,11 +488,7 @@ static DEVICE_ATTR_WO(soft_offline_page);
 static DEVICE_ATTR_WO(hard_offline_page);
 #endif
 
-/*
- * Note that phys_device is optional.  It is here to allow for
- * differentiation between which *physical* devices each
- * section belongs to...
- */
+/* See phys_device_show(). */
 int __weak arch_get_memory_phys_device(unsigned long start_pfn)
 {
 	return 0;
@@ -574,7 +570,6 @@ int register_memory(struct memory_block *memory)
 static int init_memory_block(unsigned long block_id, unsigned long state)
 {
 	struct memory_block *mem;
-	unsigned long start_pfn;
 	int ret = 0;
 
 	mem = find_memory_block_by_id(block_id);
@@ -588,8 +583,6 @@ static int init_memory_block(unsigned long block_id, unsigned long state)
 
 	mem->start_section_nr = block_id * sections_per_block;
 	mem->state = state;
-	start_pfn = section_nr_to_pfn(mem->start_section_nr);
-	mem->phys_device = arch_get_memory_phys_device(start_pfn);
 	mem->nid = NUMA_NO_NODE;
 
 	ret = register_memory(mem);
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 439a89e758d8..4da95e684e20 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -27,9 +27,8 @@ struct memory_block {
 	unsigned long start_section_nr;
 	unsigned long state;		/* serialized by the dev->lock */
 	int online_type;		/* for passing data to online routine */
-	int phys_device;		/* to which fru does this belong? */
-	struct device dev;
 	int nid;			/* NID for this memory block */
+	struct device dev;
 };
 
 int arch_get_memory_phys_device(unsigned long start_pfn);
-- 
cgit v1.2.3


From bca3feaa0764ab5a4cbe6817871601f1d00c059d Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Thu, 25 Feb 2021 17:17:33 -0800
Subject: mm/memory_hotplug: prevalidate the address range being added with
 platform

Patch series "mm/memory_hotplug: Pre-validate the address range with platform", v5.

This series adds a mechanism allowing platforms to weigh in and
prevalidate incoming address range before proceeding further with the
memory hotplug.  This helps prevent potential platform errors for the
given address range, down the hotplug call chain, which inevitably fails
the hotplug itself.

This mechanism was suggested by David Hildenbrand during another
discussion with respect to a memory hotplug fix on arm64 platform.

https://lore.kernel.org/linux-arm-kernel/1600332402-30123-1-git-send-email-anshuman.khandual@arm.com/

This mechanism focuses on the addressibility aspect and not [sub] section
alignment aspect.  Hence check_hotplug_memory_range() and check_pfn_span()
have been left unchanged.

This patch (of 4):

This introduces mhp_range_allowed() which can be called in various memory
hotplug paths to prevalidate the address range which is being added, with
the platform.  Then mhp_range_allowed() calls mhp_get_pluggable_range()
which provides applicable address range depending on whether linear
mapping is required or not.  For ranges that require linear mapping, it
calls a new arch callback arch_get_mappable_range() which the platform can
override.  So the new callback, in turn provides the platform an
opportunity to configure acceptable memory hotplug address ranges in case
there are constraints.

This mechanism will help prevent platform specific errors deep down during
hotplug calls.  This drops now redundant
check_hotplug_memory_addressable() check in __add_pages() but instead adds
a VM_BUG_ON() check which would ensure that the range has been validated
with mhp_range_allowed() earlier in the call chain.  Besides
mhp_get_pluggable_range() also can be used by potential memory hotplug
callers to avail the allowed physical range which would go through on a
given platform.

This does not really add any new range check in generic memory hotplug but
instead compensates for lost checks in arch_add_memory() where applicable
and check_hotplug_memory_addressable(), with unified mhp_range_allowed().

[akpm@linux-foundation.org: make pagemap_range() return -EINVAL when mhp_range_allowed() fails]

Link: https://lkml.kernel.org/r/1612149902-7867-1-git-send-email-anshuman.khandual@arm.com
Link: https://lkml.kernel.org/r/1612149902-7867-2-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com> # s390
Cc: Will Deacon <will@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 10 ++++++
 mm/memory_hotplug.c            | 78 ++++++++++++++++++++++++++++++++----------
 mm/memremap.c                  |  8 ++++-
 3 files changed, 76 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 08eeef679ab7..7288aa5ef73b 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -66,6 +66,9 @@ struct mhp_params {
 	pgprot_t pgprot;
 };
 
+bool mhp_range_allowed(u64 start, u64 size, bool need_mapping);
+struct range mhp_get_pluggable_range(bool need_mapping);
+
 /*
  * Zone resizing functions
  *
@@ -266,6 +269,13 @@ static inline bool movable_node_is_enabled(void)
 }
 #endif /* ! CONFIG_MEMORY_HOTPLUG */
 
+/*
+ * Keep this declaration outside CONFIG_MEMORY_HOTPLUG as some
+ * platforms might override and use arch_get_mappable_range()
+ * for internal non memory hotplug purposes.
+ */
+struct range arch_get_mappable_range(void);
+
 #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
 /*
  * pgdat resizing functions
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a969463bdda4..5ba51a8bdaeb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -107,6 +107,9 @@ static struct resource *register_memory_resource(u64 start, u64 size,
 	if (strcmp(resource_name, "System RAM"))
 		flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
 
+	if (!mhp_range_allowed(start, size, true))
+		return ERR_PTR(-E2BIG);
+
 	/*
 	 * Make sure value parsed from 'mem=' only restricts memory adding
 	 * while booting, so that memory hotplug won't be impacted. Please
@@ -284,22 +287,6 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
 	return 0;
 }
 
-static int check_hotplug_memory_addressable(unsigned long pfn,
-					    unsigned long nr_pages)
-{
-	const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
-
-	if (max_addr >> MAX_PHYSMEM_BITS) {
-		const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
-		WARN(1,
-		     "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
-		     (u64)PFN_PHYS(pfn), max_addr, max_allowed);
-		return -E2BIG;
-	}
-
-	return 0;
-}
-
 /*
  * Return page for the valid pfn only if the page is online. All pfn
  * walkers which rely on the fully initialized page->flags and others
@@ -365,9 +352,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
 	if (WARN_ON_ONCE(!params->pgprot.pgprot))
 		return -EINVAL;
 
-	err = check_hotplug_memory_addressable(pfn, nr_pages);
-	if (err)
-		return err;
+	VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false));
 
 	if (altmap) {
 		/*
@@ -1248,6 +1233,61 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(add_memory_driver_managed);
 
+/*
+ * Platforms should define arch_get_mappable_range() that provides
+ * maximum possible addressable physical memory range for which the
+ * linear mapping could be created. The platform returned address
+ * range must adhere to these following semantics.
+ *
+ * - range.start <= range.end
+ * - Range includes both end points [range.start..range.end]
+ *
+ * There is also a fallback definition provided here, allowing the
+ * entire possible physical address range in case any platform does
+ * not define arch_get_mappable_range().
+ */
+struct range __weak arch_get_mappable_range(void)
+{
+	struct range mhp_range = {
+		.start = 0UL,
+		.end = -1ULL,
+	};
+	return mhp_range;
+}
+
+struct range mhp_get_pluggable_range(bool need_mapping)
+{
+	const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+	struct range mhp_range;
+
+	if (need_mapping) {
+		mhp_range = arch_get_mappable_range();
+		if (mhp_range.start > max_phys) {
+			mhp_range.start = 0;
+			mhp_range.end = 0;
+		}
+		mhp_range.end = min_t(u64, mhp_range.end, max_phys);
+	} else {
+		mhp_range.start = 0;
+		mhp_range.end = max_phys;
+	}
+	return mhp_range;
+}
+EXPORT_SYMBOL_GPL(mhp_get_pluggable_range);
+
+bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
+{
+	struct range mhp_range = mhp_get_pluggable_range(need_mapping);
+	u64 end = start + size;
+
+	if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end)
+		return true;
+
+	pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n",
+		start, end, mhp_range.start, mhp_range.end);
+	return false;
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * Confirm all pages in a range [start, end) belong to the same zone (skipping
diff --git a/mm/memremap.c b/mm/memremap.c
index 2455bac89506..7aa7d6e80ee5 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -200,6 +200,7 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
 static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 		int range_id, int nid)
 {
+	const bool is_private = pgmap->type == MEMORY_DEVICE_PRIVATE;
 	struct range *range = &pgmap->ranges[range_id];
 	struct dev_pagemap *conflict_pgmap;
 	int error, is_ram;
@@ -245,6 +246,11 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 	if (error)
 		goto err_pfn_remap;
 
+	if (!mhp_range_allowed(range->start, range_len(range), !is_private)) {
+		error = -EINVAL;
+		goto err_pfn_remap;
+	}
+
 	mem_hotplug_begin();
 
 	/*
@@ -258,7 +264,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 	 * the CPU, we do want the linear mapping and thus use
 	 * arch_add_memory().
 	 */
-	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+	if (is_private) {
 		error = add_pages(nid, PHYS_PFN(range->start),
 				PHYS_PFN(range_len(range)), params);
 	} else {
-- 
cgit v1.2.3


From 5d5d19eda6b0ee790af89c45e3f678345be6f50f Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Thu, 25 Feb 2021 17:18:09 -0800
Subject: mm/rmap: fix potential pte_unmap on an not mapped pte

For PMD-mapped page (usually THP), pvmw->pte is NULL.  For PTE-mapped THP,
pvmw->pte is mapped.  But for HugeTLB pages, pvmw->pte is not mapped and
set to the relevant page table entry.  So in page_vma_mapped_walk_done(),
we may do pte_unmap() for HugeTLB pte which is not mapped.  Fix this by
checking pvmw->page against PageHuge before trying to do pte_unmap().

Link: https://lkml.kernel.org/r/20210127093349.39081-1-linmiaohe@huawei.com
Fixes: ace71a19cec5 ("mm: introduce page_vma_mapped_walk()")
Signed-off-by: Hongxiang Lou <louhongxiang@huawei.com>
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Nathan Chancellor <natechancellor@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michel Lespinasse <walken@google.com>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: Brian Geffon <bgeffon@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 70085ca1a3fc..def5c62c93b3 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -213,7 +213,8 @@ struct page_vma_mapped_walk {
 
 static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
 {
-	if (pvmw->pte)
+	/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
+	if (pvmw->pte && !PageHuge(pvmw->page))
 		pte_unmap(pvmw->pte);
 	if (pvmw->ptl)
 		spin_unlock(pvmw->ptl);
-- 
cgit v1.2.3


From fc6697a89f56d9773b2fbff718d4cf2a6d63379d Mon Sep 17 00:00:00 2001
From: Tian Tao <tiantao6@hisilicon.com>
Date: Thu, 25 Feb 2021 17:18:17 -0800
Subject: mm/zswap: add the flag can_sleep_mapped

Patch series "Fix the compatibility of zsmalloc and zswap".

Patch #1 adds a flag to zpool, then zswap used to determine if zpool
drivers such as zbud/z3fold/zsmalloc will enter an atomic context after
mapping.

The difference between zbud/z3fold and zsmalloc is that zsmalloc requires
an atomic context that since its map function holds a preempt-disabled,
but zbud/z3fold don't require an atomic context.  So patch #2 sets flag
sleep_mapped to true indicating that zbud/z3fold can sleep after mapping.
zsmalloc didn't support sleep after mapping, so don't set that flag to
true.

This patch (of 2):

Add a flag to zpool, named is "can_sleep_mapped", and have it set true for
zbud/z3fold, not set this flag for zsmalloc, so its default value is
false.  Then zswap could go the current path if the flag is true; and if
it's false, copy data from src to a temporary buffer, then unmap the
handle, take the mutex, process the buffer instead of src to avoid
sleeping function called from atomic context.

[natechancellor@gmail.com: add return value in zswap_frontswap_load]
  Link: https://lkml.kernel.org/r/20210121214804.926843-1-natechancellor@gmail.com
[tiantao6@hisilicon.com: fix potential memory leak]
  Link: https://lkml.kernel.org/r/1611538365-51811-1-git-send-email-tiantao6@hisilicon.com
[colin.king@canonical.com: fix potential uninitialized pointer read on tmp]
  Link: https://lkml.kernel.org/r/20210128141728.639030-1-colin.king@canonical.com
[tiantao6@hisilicon.com: fix variable 'entry' is uninitialized when used]
  Link: https://lkml.kernel.org/r/1611223030-58346-1-git-send-email-tiantao6@hisilicon.comLink: https://lkml.kernel.org/r/1611035683-12732-1-git-send-email-tiantao6@hisilicon.com

Link: https://lkml.kernel.org/r/1611035683-12732-2-git-send-email-tiantao6@hisilicon.com
Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Vitaly Wool <vitaly.wool@konsulko.com>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reported-by: Mike Galbraith <efault@gmx.de>
Cc: Barry Song <song.bao.hua@hisilicon.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/zpool.h |  3 +++
 mm/zpool.c            | 13 +++++++++++++
 mm/zswap.c            | 51 ++++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 62 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 51bf43076165..e8997010612a 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -73,6 +73,7 @@ u64 zpool_get_total_size(struct zpool *pool);
  * @malloc:	allocate mem from a pool.
  * @free:	free mem from a pool.
  * @shrink:	shrink the pool.
+ * @sleep_mapped: whether zpool driver can sleep during map.
  * @map:	map a handle.
  * @unmap:	unmap a handle.
  * @total_size:	get total size of a pool.
@@ -100,6 +101,7 @@ struct zpool_driver {
 	int (*shrink)(void *pool, unsigned int pages,
 				unsigned int *reclaimed);
 
+	bool sleep_mapped;
 	void *(*map)(void *pool, unsigned long handle,
 				enum zpool_mapmode mm);
 	void (*unmap)(void *pool, unsigned long handle);
@@ -112,5 +114,6 @@ void zpool_register_driver(struct zpool_driver *driver);
 int zpool_unregister_driver(struct zpool_driver *driver);
 
 bool zpool_evictable(struct zpool *pool);
+bool zpool_can_sleep_mapped(struct zpool *pool);
 
 #endif
diff --git a/mm/zpool.c b/mm/zpool.c
index 3744a2d1a624..5ed71207ced7 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -23,6 +23,7 @@ struct zpool {
 	void *pool;
 	const struct zpool_ops *ops;
 	bool evictable;
+	bool can_sleep_mapped;
 
 	struct list_head list;
 };
@@ -183,6 +184,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
 	zpool->pool = driver->create(name, gfp, ops, zpool);
 	zpool->ops = ops;
 	zpool->evictable = driver->shrink && ops && ops->evict;
+	zpool->can_sleep_mapped = driver->sleep_mapped;
 
 	if (!zpool->pool) {
 		pr_err("couldn't create %s pool\n", type);
@@ -393,6 +395,17 @@ bool zpool_evictable(struct zpool *zpool)
 	return zpool->evictable;
 }
 
+/**
+ * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped.
+ * @zpool:	The zpool to test
+ *
+ * Returns: true if zpool can sleep; false otherwise.
+ */
+bool zpool_can_sleep_mapped(struct zpool *zpool)
+{
+	return zpool->can_sleep_mapped;
+}
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
 MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zswap.c b/mm/zswap.c
index 1e41c2857068..578d9f256920 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -935,13 +935,19 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
 
-	u8 *src;
+	u8 *src, *tmp = NULL;
 	unsigned int dlen;
 	int ret;
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_NONE,
 	};
 
+	if (!zpool_can_sleep_mapped(pool)) {
+		tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+		if (!tmp)
+			return -ENOMEM;
+	}
+
 	/* extract swpentry from data */
 	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
 	swpentry = zhdr->swpentry; /* here */
@@ -955,6 +961,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 		/* entry was invalidated */
 		spin_unlock(&tree->lock);
 		zpool_unmap_handle(pool, handle);
+		kfree(tmp);
 		return 0;
 	}
 	spin_unlock(&tree->lock);
@@ -979,6 +986,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 		dlen = PAGE_SIZE;
 		src = (u8 *)zhdr + sizeof(struct zswap_header);
 
+		if (!zpool_can_sleep_mapped(pool)) {
+
+			memcpy(tmp, src, entry->length);
+			src = tmp;
+
+			zpool_unmap_handle(pool, handle);
+		}
+
 		mutex_lock(acomp_ctx->mutex);
 		sg_init_one(&input, src, entry->length);
 		sg_init_table(&output, 1);
@@ -1033,7 +1048,11 @@ fail:
 	spin_unlock(&tree->lock);
 
 end:
-	zpool_unmap_handle(pool, handle);
+	if (zpool_can_sleep_mapped(pool))
+		zpool_unmap_handle(pool, handle);
+	else
+		kfree(tmp);
+
 	return ret;
 }
 
@@ -1235,7 +1254,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	struct zswap_entry *entry;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
-	u8 *src, *dst;
+	u8 *src, *dst, *tmp;
 	unsigned int dlen;
 	int ret;
 
@@ -1253,15 +1272,33 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 		dst = kmap_atomic(page);
 		zswap_fill_page(dst, entry->value);
 		kunmap_atomic(dst);
+		ret = 0;
 		goto freeentry;
 	}
 
+	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+
+		tmp = kmalloc(entry->length, GFP_ATOMIC);
+		if (!tmp) {
+			ret = -ENOMEM;
+			goto freeentry;
+		}
+	}
+
 	/* decompress */
 	dlen = PAGE_SIZE;
 	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
 	if (zpool_evictable(entry->pool->zpool))
 		src += sizeof(struct zswap_header);
 
+	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+
+		memcpy(tmp, src, entry->length);
+		src = tmp;
+
+		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+	}
+
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
 	mutex_lock(acomp_ctx->mutex);
 	sg_init_one(&input, src, entry->length);
@@ -1271,7 +1308,11 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
 	mutex_unlock(acomp_ctx->mutex);
 
-	zpool_unmap_handle(entry->pool->zpool, entry->handle);
+	if (zpool_can_sleep_mapped(entry->pool->zpool))
+		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+	else
+		kfree(tmp);
+
 	BUG_ON(ret);
 
 freeentry:
@@ -1279,7 +1320,7 @@ freeentry:
 	zswap_entry_put(tree, entry);
 	spin_unlock(&tree->lock);
 
-	return 0;
+	return ret;
 }
 
 /* frees an entry in zswap */
-- 
cgit v1.2.3


From 2395928158059b8f9858365fce7713ce7fef62e4 Mon Sep 17 00:00:00 2001
From: Rokudo Yan <wu-yan@tcl.com>
Date: Thu, 25 Feb 2021 17:18:31 -0800
Subject: zsmalloc: account the number of compacted pages correctly

There exists multiple path may do zram compaction concurrently.
1. auto-compaction triggered during memory reclaim
2. userspace utils write zram<id>/compaction node

So, multiple threads may call zs_shrinker_scan/zs_compact concurrently.
But pages_compacted is a per zsmalloc pool variable and modification
of the variable is not serialized(through under class->lock).
There are two issues here:
1. the pages_compacted may not equal to total number of pages
freed(due to concurrently add).
2. zs_shrinker_scan may not return the correct number of pages
freed(issued by current shrinker).

The fix is simple:
1. account the number of pages freed in zs_compact locally.
2. use actomic variable pages_compacted to accumulate total number.

Link: https://lkml.kernel.org/r/20210202122235.26885-1-wu-yan@tcl.com
Fixes: 860c707dca155a56 ("zsmalloc: account the number of compacted pages")
Signed-off-by: Rokudo Yan <wu-yan@tcl.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c |  2 +-
 include/linux/zsmalloc.h      |  2 +-
 mm/zsmalloc.c                 | 17 +++++++++++------
 3 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d7018543842e..a711a2e2a794 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1081,7 +1081,7 @@ static ssize_t mm_stat_show(struct device *dev,
 			zram->limit_pages << PAGE_SHIFT,
 			max_used << PAGE_SHIFT,
 			(u64)atomic64_read(&zram->stats.same_pages),
-			pool_stats.pages_compacted,
+			atomic_long_read(&pool_stats.pages_compacted),
 			(u64)atomic64_read(&zram->stats.huge_pages),
 			(u64)atomic64_read(&zram->stats.huge_pages_since));
 	up_read(&zram->init_lock);
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 4807ca4d52e0..2a430e713ce5 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -35,7 +35,7 @@ enum zs_mapmode {
 
 struct zs_pool_stats {
 	/* How many pages were migrated (freed) */
-	unsigned long pages_compacted;
+	atomic_long_t pages_compacted;
 };
 
 struct zs_pool;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index cf0ed0e4e911..1518732f95c3 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2212,11 +2212,13 @@ static unsigned long zs_can_compact(struct size_class *class)
 	return obj_wasted * class->pages_per_zspage;
 }
 
-static void __zs_compact(struct zs_pool *pool, struct size_class *class)
+static unsigned long __zs_compact(struct zs_pool *pool,
+				  struct size_class *class)
 {
 	struct zs_compact_control cc;
 	struct zspage *src_zspage;
 	struct zspage *dst_zspage = NULL;
+	unsigned long pages_freed = 0;
 
 	spin_lock(&class->lock);
 	while ((src_zspage = isolate_zspage(class, true))) {
@@ -2246,7 +2248,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 		putback_zspage(class, dst_zspage);
 		if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
 			free_zspage(pool, class, src_zspage);
-			pool->stats.pages_compacted += class->pages_per_zspage;
+			pages_freed += class->pages_per_zspage;
 		}
 		spin_unlock(&class->lock);
 		cond_resched();
@@ -2257,12 +2259,15 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 		putback_zspage(class, src_zspage);
 
 	spin_unlock(&class->lock);
+
+	return pages_freed;
 }
 
 unsigned long zs_compact(struct zs_pool *pool)
 {
 	int i;
 	struct size_class *class;
+	unsigned long pages_freed = 0;
 
 	for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
 		class = pool->size_class[i];
@@ -2270,10 +2275,11 @@ unsigned long zs_compact(struct zs_pool *pool)
 			continue;
 		if (class->index != i)
 			continue;
-		__zs_compact(pool, class);
+		pages_freed += __zs_compact(pool, class);
 	}
+	atomic_long_add(pages_freed, &pool->stats.pages_compacted);
 
-	return pool->stats.pages_compacted;
+	return pages_freed;
 }
 EXPORT_SYMBOL_GPL(zs_compact);
 
@@ -2290,13 +2296,12 @@ static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
 	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
 			shrinker);
 
-	pages_freed = pool->stats.pages_compacted;
 	/*
 	 * Compact classes and calculate compaction delta.
 	 * Can run concurrently with a manually triggered
 	 * (by user) compaction.
 	 */
-	pages_freed = zs_compact(pool) - pages_freed;
+	pages_freed = zs_compact(pool);
 
 	return pages_freed ? pages_freed : SHRINK_STOP;
 }
-- 
cgit v1.2.3


From 4be408cec257d1156d35647db57726f5ef977630 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Thu, 25 Feb 2021 17:18:38 -0800
Subject: mm: page-flags.h: Typo fix (It -> If)

The "If" was wrongly spelled as "It".

Link: https://lkml.kernel.org/r/1608959036-91409-1-git-send-email-guoren@kernel.org
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Steven Price <steven.price@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index db914477057b..04a34c08e0a6 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -810,7 +810,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
 
 /*
  * Flags checked when a page is freed.  Pages being freed should not have
- * these flags set.  It they are, there is a problem.
+ * these flags set.  If they are, there is a problem.
  */
 #define PAGE_FLAGS_CHECK_AT_FREE				\
 	(1UL << PG_lru		| 1UL << PG_locked	|	\
@@ -821,7 +821,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
- * Pages being prepped should not have these flags set.  It they are set,
+ * Pages being prepped should not have these flags set.  If they are set,
  * there has been a kernel bug or struct page corruption.
  *
  * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
-- 
cgit v1.2.3


From 0ce20dd840897b12ae70869c69f1ba34d6d16965 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 25 Feb 2021 17:18:53 -0800
Subject: mm: add Kernel Electric-Fence infrastructure

Patch series "KFENCE: A low-overhead sampling-based memory safety error detector", v7.

This adds the Kernel Electric-Fence (KFENCE) infrastructure. KFENCE is a
low-overhead sampling-based memory safety error detector of heap
use-after-free, invalid-free, and out-of-bounds access errors.  This
series enables KFENCE for the x86 and arm64 architectures, and adds
KFENCE hooks to the SLAB and SLUB allocators.

KFENCE is designed to be enabled in production kernels, and has near
zero performance overhead. Compared to KASAN, KFENCE trades performance
for precision. The main motivation behind KFENCE's design, is that with
enough total uptime KFENCE will detect bugs in code paths not typically
exercised by non-production test workloads. One way to quickly achieve a
large enough total uptime is when the tool is deployed across a large
fleet of machines.

KFENCE objects each reside on a dedicated page, at either the left or
right page boundaries. The pages to the left and right of the object
page are "guard pages", whose attributes are changed to a protected
state, and cause page faults on any attempted access to them. Such page
faults are then intercepted by KFENCE, which handles the fault
gracefully by reporting a memory access error.

Guarded allocations are set up based on a sample interval (can be set
via kfence.sample_interval). After expiration of the sample interval,
the next allocation through the main allocator (SLAB or SLUB) returns a
guarded allocation from the KFENCE object pool. At this point, the timer
is reset, and the next allocation is set up after the expiration of the
interval.

To enable/disable a KFENCE allocation through the main allocator's
fast-path without overhead, KFENCE relies on static branches via the
static keys infrastructure. The static branch is toggled to redirect the
allocation to KFENCE.

The KFENCE memory pool is of fixed size, and if the pool is exhausted no
further KFENCE allocations occur. The default config is conservative
with only 255 objects, resulting in a pool size of 2 MiB (with 4 KiB
pages).

We have verified by running synthetic benchmarks (sysbench I/O,
hackbench) and production server-workload benchmarks that a kernel with
KFENCE (using sample intervals 100-500ms) is performance-neutral
compared to a non-KFENCE baseline kernel.

KFENCE is inspired by GWP-ASan [1], a userspace tool with similar
properties. The name "KFENCE" is a homage to the Electric Fence Malloc
Debugger [2].

For more details, see Documentation/dev-tools/kfence.rst added in the
series -- also viewable here:

	https://raw.githubusercontent.com/google/kasan/kfence/Documentation/dev-tools/kfence.rst

[1] http://llvm.org/docs/GwpAsan.html
[2] https://linux.die.net/man/3/efence

This patch (of 9):

This adds the Kernel Electric-Fence (KFENCE) infrastructure. KFENCE is a
low-overhead sampling-based memory safety error detector of heap
use-after-free, invalid-free, and out-of-bounds access errors.

KFENCE is designed to be enabled in production kernels, and has near
zero performance overhead. Compared to KASAN, KFENCE trades performance
for precision. The main motivation behind KFENCE's design, is that with
enough total uptime KFENCE will detect bugs in code paths not typically
exercised by non-production test workloads. One way to quickly achieve a
large enough total uptime is when the tool is deployed across a large
fleet of machines.

KFENCE objects each reside on a dedicated page, at either the left or
right page boundaries. The pages to the left and right of the object
page are "guard pages", whose attributes are changed to a protected
state, and cause page faults on any attempted access to them. Such page
faults are then intercepted by KFENCE, which handles the fault
gracefully by reporting a memory access error. To detect out-of-bounds
writes to memory within the object's page itself, KFENCE also uses
pattern-based redzones. The following figure illustrates the page
layout:

  ---+-----------+-----------+-----------+-----------+-----------+---
     | xxxxxxxxx | O :       | xxxxxxxxx |       : O | xxxxxxxxx |
     | xxxxxxxxx | B :       | xxxxxxxxx |       : B | xxxxxxxxx |
     | x GUARD x | J : RED-  | x GUARD x | RED-  : J | x GUARD x |
     | xxxxxxxxx | E :  ZONE | xxxxxxxxx |  ZONE : E | xxxxxxxxx |
     | xxxxxxxxx | C :       | xxxxxxxxx |       : C | xxxxxxxxx |
     | xxxxxxxxx | T :       | xxxxxxxxx |       : T | xxxxxxxxx |
  ---+-----------+-----------+-----------+-----------+-----------+---

Guarded allocations are set up based on a sample interval (can be set
via kfence.sample_interval). After expiration of the sample interval, a
guarded allocation from the KFENCE object pool is returned to the main
allocator (SLAB or SLUB). At this point, the timer is reset, and the
next allocation is set up after the expiration of the interval.

To enable/disable a KFENCE allocation through the main allocator's
fast-path without overhead, KFENCE relies on static branches via the
static keys infrastructure. The static branch is toggled to redirect the
allocation to KFENCE. To date, we have verified by running synthetic
benchmarks (sysbench I/O, hackbench) that a kernel compiled with KFENCE
is performance-neutral compared to the non-KFENCE baseline.

For more details, see Documentation/dev-tools/kfence.rst (added later in
the series).

[elver@google.com: fix parameter description for kfence_object_start()]
  Link: https://lkml.kernel.org/r/20201106092149.GA2851373@elver.google.com
[elver@google.com: avoid stalling work queue task without allocations]
  Link: https://lkml.kernel.org/r/CADYN=9J0DQhizAGB0-jz4HOBBh+05kMBXb4c0cXMS7Qi5NAJiw@mail.gmail.com
  Link: https://lkml.kernel.org/r/20201110135320.3309507-1-elver@google.com
[elver@google.com: fix potential deadlock due to wake_up()]
  Link: https://lkml.kernel.org/r/000000000000c0645805b7f982e4@google.com
  Link: https://lkml.kernel.org/r/20210104130749.1768991-1-elver@google.com
[elver@google.com: add option to use KFENCE without static keys]
  Link: https://lkml.kernel.org/r/20210111091544.3287013-1-elver@google.com
[elver@google.com: add missing copyright and description headers]
  Link: https://lkml.kernel.org/r/20210118092159.145934-1-elver@google.com

Link: https://lkml.kernel.org/r/20201103175841.3495947-2-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: SeongJae Park <sjpark@amazon.de>
Co-developed-by: Marco Elver <elver@google.com>
Reviewed-by: Jann Horn <jannh@google.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Joern Engel <joern@purestorage.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kfence.h | 216 +++++++++++++
 init/main.c            |   3 +
 lib/Kconfig.debug      |   1 +
 lib/Kconfig.kfence     |  67 ++++
 mm/Makefile            |   1 +
 mm/kfence/Makefile     |   3 +
 mm/kfence/core.c       | 840 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/kfence/kfence.h     | 113 +++++++
 mm/kfence/report.c     | 240 ++++++++++++++
 9 files changed, 1484 insertions(+)
 create mode 100644 include/linux/kfence.h
 create mode 100644 lib/Kconfig.kfence
 create mode 100644 mm/kfence/Makefile
 create mode 100644 mm/kfence/core.c
 create mode 100644 mm/kfence/kfence.h
 create mode 100644 mm/kfence/report.c

(limited to 'include')

diff --git a/include/linux/kfence.h b/include/linux/kfence.h
new file mode 100644
index 000000000000..81f3911cb298
--- /dev/null
+++ b/include/linux/kfence.h
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Kernel Electric-Fence (KFENCE). Public interface for allocator and fault
+ * handler integration. For more info see Documentation/dev-tools/kfence.rst.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef _LINUX_KFENCE_H
+#define _LINUX_KFENCE_H
+
+#include <linux/mm.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_KFENCE
+
+/*
+ * We allocate an even number of pages, as it simplifies calculations to map
+ * address to metadata indices; effectively, the very first page serves as an
+ * extended guard page, but otherwise has no special purpose.
+ */
+#define KFENCE_POOL_SIZE ((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE)
+extern char *__kfence_pool;
+
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+#include <linux/static_key.h>
+DECLARE_STATIC_KEY_FALSE(kfence_allocation_key);
+#else
+#include <linux/atomic.h>
+extern atomic_t kfence_allocation_gate;
+#endif
+
+/**
+ * is_kfence_address() - check if an address belongs to KFENCE pool
+ * @addr: address to check
+ *
+ * Return: true or false depending on whether the address is within the KFENCE
+ * object range.
+ *
+ * KFENCE objects live in a separate page range and are not to be intermixed
+ * with regular heap objects (e.g. KFENCE objects must never be added to the
+ * allocator freelists). Failing to do so may and will result in heap
+ * corruptions, therefore is_kfence_address() must be used to check whether
+ * an object requires specific handling.
+ *
+ * Note: This function may be used in fast-paths, and is performance critical.
+ * Future changes should take this into account; for instance, we want to avoid
+ * introducing another load and therefore need to keep KFENCE_POOL_SIZE a
+ * constant (until immediate patching support is added to the kernel).
+ */
+static __always_inline bool is_kfence_address(const void *addr)
+{
+	/*
+	 * The non-NULL check is required in case the __kfence_pool pointer was
+	 * never initialized; keep it in the slow-path after the range-check.
+	 */
+	return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && addr);
+}
+
+/**
+ * kfence_alloc_pool() - allocate the KFENCE pool via memblock
+ */
+void __init kfence_alloc_pool(void);
+
+/**
+ * kfence_init() - perform KFENCE initialization at boot time
+ *
+ * Requires that kfence_alloc_pool() was called before. This sets up the
+ * allocation gate timer, and requires that workqueues are available.
+ */
+void __init kfence_init(void);
+
+/**
+ * kfence_shutdown_cache() - handle shutdown_cache() for KFENCE objects
+ * @s: cache being shut down
+ *
+ * Before shutting down a cache, one must ensure there are no remaining objects
+ * allocated from it. Because KFENCE objects are not referenced from the cache
+ * directly, we need to check them here.
+ *
+ * Note that shutdown_cache() is internal to SL*B, and kmem_cache_destroy() does
+ * not return if allocated objects still exist: it prints an error message and
+ * simply aborts destruction of a cache, leaking memory.
+ *
+ * If the only such objects are KFENCE objects, we will not leak the entire
+ * cache, but instead try to provide more useful debug info by making allocated
+ * objects "zombie allocations". Objects may then still be used or freed (which
+ * is handled gracefully), but usage will result in showing KFENCE error reports
+ * which include stack traces to the user of the object, the original allocation
+ * site, and caller to shutdown_cache().
+ */
+void kfence_shutdown_cache(struct kmem_cache *s);
+
+/*
+ * Allocate a KFENCE object. Allocators must not call this function directly,
+ * use kfence_alloc() instead.
+ */
+void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags);
+
+/**
+ * kfence_alloc() - allocate a KFENCE object with a low probability
+ * @s:     struct kmem_cache with object requirements
+ * @size:  exact size of the object to allocate (can be less than @s->size
+ *         e.g. for kmalloc caches)
+ * @flags: GFP flags
+ *
+ * Return:
+ * * NULL     - must proceed with allocating as usual,
+ * * non-NULL - pointer to a KFENCE object.
+ *
+ * kfence_alloc() should be inserted into the heap allocation fast path,
+ * allowing it to transparently return KFENCE-allocated objects with a low
+ * probability using a static branch (the probability is controlled by the
+ * kfence.sample_interval boot parameter).
+ */
+static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
+{
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+	if (static_branch_unlikely(&kfence_allocation_key))
+#else
+	if (unlikely(!atomic_read(&kfence_allocation_gate)))
+#endif
+		return __kfence_alloc(s, size, flags);
+	return NULL;
+}
+
+/**
+ * kfence_ksize() - get actual amount of memory allocated for a KFENCE object
+ * @addr: pointer to a heap object
+ *
+ * Return:
+ * * 0     - not a KFENCE object, must call __ksize() instead,
+ * * non-0 - this many bytes can be accessed without causing a memory error.
+ *
+ * kfence_ksize() returns the number of bytes requested for a KFENCE object at
+ * allocation time. This number may be less than the object size of the
+ * corresponding struct kmem_cache.
+ */
+size_t kfence_ksize(const void *addr);
+
+/**
+ * kfence_object_start() - find the beginning of a KFENCE object
+ * @addr: address within a KFENCE-allocated object
+ *
+ * Return: address of the beginning of the object.
+ *
+ * SL[AU]B-allocated objects are laid out within a page one by one, so it is
+ * easy to calculate the beginning of an object given a pointer inside it and
+ * the object size. The same is not true for KFENCE, which places a single
+ * object at either end of the page. This helper function is used to find the
+ * beginning of a KFENCE-allocated object.
+ */
+void *kfence_object_start(const void *addr);
+
+/**
+ * __kfence_free() - release a KFENCE heap object to KFENCE pool
+ * @addr: object to be freed
+ *
+ * Requires: is_kfence_address(addr)
+ *
+ * Release a KFENCE object and mark it as freed.
+ */
+void __kfence_free(void *addr);
+
+/**
+ * kfence_free() - try to release an arbitrary heap object to KFENCE pool
+ * @addr: object to be freed
+ *
+ * Return:
+ * * false - object doesn't belong to KFENCE pool and was ignored,
+ * * true  - object was released to KFENCE pool.
+ *
+ * Release a KFENCE object and mark it as freed. May be called on any object,
+ * even non-KFENCE objects, to simplify integration of the hooks into the
+ * allocator's free codepath. The allocator must check the return value to
+ * determine if it was a KFENCE object or not.
+ */
+static __always_inline __must_check bool kfence_free(void *addr)
+{
+	if (!is_kfence_address(addr))
+		return false;
+	__kfence_free(addr);
+	return true;
+}
+
+/**
+ * kfence_handle_page_fault() - perform page fault handling for KFENCE pages
+ * @addr: faulting address
+ *
+ * Return:
+ * * false - address outside KFENCE pool,
+ * * true  - page fault handled by KFENCE, no additional handling required.
+ *
+ * A page fault inside KFENCE pool indicates a memory error, such as an
+ * out-of-bounds access, a use-after-free or an invalid memory access. In these
+ * cases KFENCE prints an error message and marks the offending page as
+ * present, so that the kernel can proceed.
+ */
+bool __must_check kfence_handle_page_fault(unsigned long addr);
+
+#else /* CONFIG_KFENCE */
+
+static inline bool is_kfence_address(const void *addr) { return false; }
+static inline void kfence_alloc_pool(void) { }
+static inline void kfence_init(void) { }
+static inline void kfence_shutdown_cache(struct kmem_cache *s) { }
+static inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { return NULL; }
+static inline size_t kfence_ksize(const void *addr) { return 0; }
+static inline void *kfence_object_start(const void *addr) { return NULL; }
+static inline void __kfence_free(void *addr) { }
+static inline bool __must_check kfence_free(void *addr) { return false; }
+static inline bool __must_check kfence_handle_page_fault(unsigned long addr) { return false; }
+
+#endif
+
+#endif /* _LINUX_KFENCE_H */
diff --git a/init/main.c b/init/main.c
index e9933cbf60d4..261051070e3c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -40,6 +40,7 @@
 #include <linux/security.h>
 #include <linux/smp.h>
 #include <linux/profile.h>
+#include <linux/kfence.h>
 #include <linux/rcupdate.h>
 #include <linux/moduleparam.h>
 #include <linux/kallsyms.h>
@@ -824,6 +825,7 @@ static void __init mm_init(void)
 	 */
 	page_ext_init_flatmem();
 	init_mem_debugging_and_hardening();
+	kfence_alloc_pool();
 	report_meminit();
 	mem_init();
 	/* page_owner must be initialized after buddy is ready */
@@ -955,6 +957,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
 	hrtimers_init();
 	softirq_init();
 	timekeeping_init();
+	kfence_init();
 
 	/*
 	 * For best initial stack canary entropy, prepare it after:
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f9febffffc21..2779c29d9981 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -938,6 +938,7 @@ config DEBUG_STACKOVERFLOW
 	  If in doubt, say "N".
 
 source "lib/Kconfig.kasan"
+source "lib/Kconfig.kfence"
 
 endmenu # "Memory Debugging"
 
diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence
new file mode 100644
index 000000000000..b88ac9d6b2e6
--- /dev/null
+++ b/lib/Kconfig.kfence
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config HAVE_ARCH_KFENCE
+	bool
+
+menuconfig KFENCE
+	bool "KFENCE: low-overhead sampling-based memory safety error detector"
+	depends on HAVE_ARCH_KFENCE && !KASAN && (SLAB || SLUB)
+	select STACKTRACE
+	help
+	  KFENCE is a low-overhead sampling-based detector of heap out-of-bounds
+	  access, use-after-free, and invalid-free errors. KFENCE is designed
+	  to have negligible cost to permit enabling it in production
+	  environments.
+
+	  Note that, KFENCE is not a substitute for explicit testing with tools
+	  such as KASAN. KFENCE can detect a subset of bugs that KASAN can
+	  detect, albeit at very different performance profiles. If you can
+	  afford to use KASAN, continue using KASAN, for example in test
+	  environments. If your kernel targets production use, and cannot
+	  enable KASAN due to its cost, consider using KFENCE.
+
+if KFENCE
+
+config KFENCE_STATIC_KEYS
+	bool "Use static keys to set up allocations"
+	default y
+	depends on JUMP_LABEL # To ensure performance, require jump labels
+	help
+	  Use static keys (static branches) to set up KFENCE allocations. Using
+	  static keys is normally recommended, because it avoids a dynamic
+	  branch in the allocator's fast path. However, with very low sample
+	  intervals, or on systems that do not support jump labels, a dynamic
+	  branch may still be an acceptable performance trade-off.
+
+config KFENCE_SAMPLE_INTERVAL
+	int "Default sample interval in milliseconds"
+	default 100
+	help
+	  The KFENCE sample interval determines the frequency with which heap
+	  allocations will be guarded by KFENCE. May be overridden via boot
+	  parameter "kfence.sample_interval".
+
+	  Set this to 0 to disable KFENCE by default, in which case only
+	  setting "kfence.sample_interval" to a non-zero value enables KFENCE.
+
+config KFENCE_NUM_OBJECTS
+	int "Number of guarded objects available"
+	range 1 65535
+	default 255
+	help
+	  The number of guarded objects available. For each KFENCE object, 2
+	  pages are required; with one containing the object and two adjacent
+	  ones used as guard pages.
+
+config KFENCE_STRESS_TEST_FAULTS
+	int "Stress testing of fault handling and error reporting" if EXPERT
+	default 0
+	help
+	  The inverse probability with which to randomly protect KFENCE object
+	  pages, resulting in spurious use-after-frees. The main purpose of
+	  this option is to stress test KFENCE with concurrent error reports
+	  and allocations/frees. A value of 0 disables stress testing logic.
+
+	  Only for KFENCE testing; set to 0 if you are not a KFENCE developer.
+
+endif # KFENCE
diff --git a/mm/Makefile b/mm/Makefile
index 135bbb65511a..72227b24a616 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_PAGE_POISONING) += page_poison.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_KASAN)	+= kasan/
+obj-$(CONFIG_KFENCE) += kfence/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_MEMTEST)		+= memtest.o
diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile
new file mode 100644
index 000000000000..d991e9a349f0
--- /dev/null
+++ b/mm/kfence/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_KFENCE) := core.o report.o
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
new file mode 100644
index 000000000000..d6a32c13336b
--- /dev/null
+++ b/mm/kfence/core.c
@@ -0,0 +1,840 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KFENCE guarded object allocator and fault handling.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#define pr_fmt(fmt) "kfence: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/debugfs.h>
+#include <linux/kcsan-checks.h>
+#include <linux/kfence.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/memblock.h>
+#include <linux/moduleparam.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+
+#include <asm/kfence.h>
+
+#include "kfence.h"
+
+/* Disables KFENCE on the first warning assuming an irrecoverable error. */
+#define KFENCE_WARN_ON(cond)                                                   \
+	({                                                                     \
+		const bool __cond = WARN_ON(cond);                             \
+		if (unlikely(__cond))                                          \
+			WRITE_ONCE(kfence_enabled, false);                     \
+		__cond;                                                        \
+	})
+
+/* === Data ================================================================= */
+
+static bool kfence_enabled __read_mostly;
+
+static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "kfence."
+
+static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
+{
+	unsigned long num;
+	int ret = kstrtoul(val, 0, &num);
+
+	if (ret < 0)
+		return ret;
+
+	if (!num) /* Using 0 to indicate KFENCE is disabled. */
+		WRITE_ONCE(kfence_enabled, false);
+	else if (!READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING)
+		return -EINVAL; /* Cannot (re-)enable KFENCE on-the-fly. */
+
+	*((unsigned long *)kp->arg) = num;
+	return 0;
+}
+
+static int param_get_sample_interval(char *buffer, const struct kernel_param *kp)
+{
+	if (!READ_ONCE(kfence_enabled))
+		return sprintf(buffer, "0\n");
+
+	return param_get_ulong(buffer, kp);
+}
+
+static const struct kernel_param_ops sample_interval_param_ops = {
+	.set = param_set_sample_interval,
+	.get = param_get_sample_interval,
+};
+module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
+
+/* The pool of pages used for guard pages and objects. */
+char *__kfence_pool __ro_after_init;
+EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
+
+/*
+ * Per-object metadata, with one-to-one mapping of object metadata to
+ * backing pages (in __kfence_pool).
+ */
+static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
+struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+
+/* Freelist with available objects. */
+static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
+static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */
+
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+/* The static key to set up a KFENCE allocation. */
+DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
+#endif
+
+/* Gates the allocation, ensuring only one succeeds in a given period. */
+atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
+
+/* Statistics counters for debugfs. */
+enum kfence_counter_id {
+	KFENCE_COUNTER_ALLOCATED,
+	KFENCE_COUNTER_ALLOCS,
+	KFENCE_COUNTER_FREES,
+	KFENCE_COUNTER_ZOMBIES,
+	KFENCE_COUNTER_BUGS,
+	KFENCE_COUNTER_COUNT,
+};
+static atomic_long_t counters[KFENCE_COUNTER_COUNT];
+static const char *const counter_names[] = {
+	[KFENCE_COUNTER_ALLOCATED]	= "currently allocated",
+	[KFENCE_COUNTER_ALLOCS]		= "total allocations",
+	[KFENCE_COUNTER_FREES]		= "total frees",
+	[KFENCE_COUNTER_ZOMBIES]	= "zombie allocations",
+	[KFENCE_COUNTER_BUGS]		= "total bugs",
+};
+static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
+
+/* === Internals ============================================================ */
+
+static bool kfence_protect(unsigned long addr)
+{
+	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
+}
+
+static bool kfence_unprotect(unsigned long addr)
+{
+	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false));
+}
+
+static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
+{
+	long index;
+
+	/* The checks do not affect performance; only called from slow-paths. */
+
+	if (!is_kfence_address((void *)addr))
+		return NULL;
+
+	/*
+	 * May be an invalid index if called with an address at the edge of
+	 * __kfence_pool, in which case we would report an "invalid access"
+	 * error.
+	 */
+	index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1;
+	if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS)
+		return NULL;
+
+	return &kfence_metadata[index];
+}
+
+static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta)
+{
+	unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2;
+	unsigned long pageaddr = (unsigned long)&__kfence_pool[offset];
+
+	/* The checks do not affect performance; only called from slow-paths. */
+
+	/* Only call with a pointer into kfence_metadata. */
+	if (KFENCE_WARN_ON(meta < kfence_metadata ||
+			   meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS))
+		return 0;
+
+	/*
+	 * This metadata object only ever maps to 1 page; verify that the stored
+	 * address is in the expected range.
+	 */
+	if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr))
+		return 0;
+
+	return pageaddr;
+}
+
+/*
+ * Update the object's metadata state, including updating the alloc/free stacks
+ * depending on the state transition.
+ */
+static noinline void metadata_update_state(struct kfence_metadata *meta,
+					   enum kfence_object_state next)
+{
+	struct kfence_track *track =
+		next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
+
+	lockdep_assert_held(&meta->lock);
+
+	/*
+	 * Skip over 1 (this) functions; noinline ensures we do not accidentally
+	 * skip over the caller by never inlining.
+	 */
+	track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+	track->pid = task_pid_nr(current);
+
+	/*
+	 * Pairs with READ_ONCE() in
+	 *	kfence_shutdown_cache(),
+	 *	kfence_handle_page_fault().
+	 */
+	WRITE_ONCE(meta->state, next);
+}
+
+/* Write canary byte to @addr. */
+static inline bool set_canary_byte(u8 *addr)
+{
+	*addr = KFENCE_CANARY_PATTERN(addr);
+	return true;
+}
+
+/* Check canary byte at @addr. */
+static inline bool check_canary_byte(u8 *addr)
+{
+	if (likely(*addr == KFENCE_CANARY_PATTERN(addr)))
+		return true;
+
+	atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+	kfence_report_error((unsigned long)addr, addr_to_metadata((unsigned long)addr),
+			    KFENCE_ERROR_CORRUPTION);
+	return false;
+}
+
+/* __always_inline this to ensure we won't do an indirect call to fn. */
+static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *))
+{
+	const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
+	unsigned long addr;
+
+	lockdep_assert_held(&meta->lock);
+
+	/*
+	 * We'll iterate over each canary byte per-side until fn() returns
+	 * false. However, we'll still iterate over the canary bytes to the
+	 * right of the object even if there was an error in the canary bytes to
+	 * the left of the object. Specifically, if check_canary_byte()
+	 * generates an error, showing both sides might give more clues as to
+	 * what the error is about when displaying which bytes were corrupted.
+	 */
+
+	/* Apply to left of object. */
+	for (addr = pageaddr; addr < meta->addr; addr++) {
+		if (!fn((u8 *)addr))
+			break;
+	}
+
+	/* Apply to right of object. */
+	for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) {
+		if (!fn((u8 *)addr))
+			break;
+	}
+}
+
+static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
+{
+	struct kfence_metadata *meta = NULL;
+	unsigned long flags;
+	struct page *page;
+	void *addr;
+
+	/* Try to obtain a free object. */
+	raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+	if (!list_empty(&kfence_freelist)) {
+		meta = list_entry(kfence_freelist.next, struct kfence_metadata, list);
+		list_del_init(&meta->list);
+	}
+	raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+	if (!meta)
+		return NULL;
+
+	if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
+		/*
+		 * This is extremely unlikely -- we are reporting on a
+		 * use-after-free, which locked meta->lock, and the reporting
+		 * code via printk calls kmalloc() which ends up in
+		 * kfence_alloc() and tries to grab the same object that we're
+		 * reporting on. While it has never been observed, lockdep does
+		 * report that there is a possibility of deadlock. Fix it by
+		 * using trylock and bailing out gracefully.
+		 */
+		raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+		/* Put the object back on the freelist. */
+		list_add_tail(&meta->list, &kfence_freelist);
+		raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+
+		return NULL;
+	}
+
+	meta->addr = metadata_to_pageaddr(meta);
+	/* Unprotect if we're reusing this page. */
+	if (meta->state == KFENCE_OBJECT_FREED)
+		kfence_unprotect(meta->addr);
+
+	/*
+	 * Note: for allocations made before RNG initialization, will always
+	 * return zero. We still benefit from enabling KFENCE as early as
+	 * possible, even when the RNG is not yet available, as this will allow
+	 * KFENCE to detect bugs due to earlier allocations. The only downside
+	 * is that the out-of-bounds accesses detected are deterministic for
+	 * such allocations.
+	 */
+	if (prandom_u32_max(2)) {
+		/* Allocate on the "right" side, re-calculate address. */
+		meta->addr += PAGE_SIZE - size;
+		meta->addr = ALIGN_DOWN(meta->addr, cache->align);
+	}
+
+	addr = (void *)meta->addr;
+
+	/* Update remaining metadata. */
+	metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
+	/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
+	WRITE_ONCE(meta->cache, cache);
+	meta->size = size;
+	for_each_canary(meta, set_canary_byte);
+
+	/* Set required struct page fields. */
+	page = virt_to_page(meta->addr);
+	page->slab_cache = cache;
+
+	raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+	/* Memory initialization. */
+
+	/*
+	 * We check slab_want_init_on_alloc() ourselves, rather than letting
+	 * SL*B do the initialization, as otherwise we might overwrite KFENCE's
+	 * redzone.
+	 */
+	if (unlikely(slab_want_init_on_alloc(gfp, cache)))
+		memzero_explicit(addr, size);
+	if (cache->ctor)
+		cache->ctor(addr);
+
+	if (CONFIG_KFENCE_STRESS_TEST_FAULTS && !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS))
+		kfence_protect(meta->addr); /* Random "faults" by protecting the object. */
+
+	atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
+	atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
+
+	return addr;
+}
+
+static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
+{
+	struct kcsan_scoped_access assert_page_exclusive;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&meta->lock, flags);
+
+	if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
+		/* Invalid or double-free, bail out. */
+		atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+		kfence_report_error((unsigned long)addr, meta, KFENCE_ERROR_INVALID_FREE);
+		raw_spin_unlock_irqrestore(&meta->lock, flags);
+		return;
+	}
+
+	/* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */
+	kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE,
+				  KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT,
+				  &assert_page_exclusive);
+
+	if (CONFIG_KFENCE_STRESS_TEST_FAULTS)
+		kfence_unprotect((unsigned long)addr); /* To check canary bytes. */
+
+	/* Restore page protection if there was an OOB access. */
+	if (meta->unprotected_page) {
+		kfence_protect(meta->unprotected_page);
+		meta->unprotected_page = 0;
+	}
+
+	/* Check canary bytes for memory corruption. */
+	for_each_canary(meta, check_canary_byte);
+
+	/*
+	 * Clear memory if init-on-free is set. While we protect the page, the
+	 * data is still there, and after a use-after-free is detected, we
+	 * unprotect the page, so the data is still accessible.
+	 */
+	if (!zombie && unlikely(slab_want_init_on_free(meta->cache)))
+		memzero_explicit(addr, meta->size);
+
+	/* Mark the object as freed. */
+	metadata_update_state(meta, KFENCE_OBJECT_FREED);
+
+	raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+	/* Protect to detect use-after-frees. */
+	kfence_protect((unsigned long)addr);
+
+	kcsan_end_scoped_access(&assert_page_exclusive);
+	if (!zombie) {
+		/* Add it to the tail of the freelist for reuse. */
+		raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+		KFENCE_WARN_ON(!list_empty(&meta->list));
+		list_add_tail(&meta->list, &kfence_freelist);
+		raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+
+		atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]);
+		atomic_long_inc(&counters[KFENCE_COUNTER_FREES]);
+	} else {
+		/* See kfence_shutdown_cache(). */
+		atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]);
+	}
+}
+
+static void rcu_guarded_free(struct rcu_head *h)
+{
+	struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head);
+
+	kfence_guarded_free((void *)meta->addr, meta, false);
+}
+
+static bool __init kfence_init_pool(void)
+{
+	unsigned long addr = (unsigned long)__kfence_pool;
+	struct page *pages;
+	int i;
+
+	if (!__kfence_pool)
+		return false;
+
+	if (!arch_kfence_init_pool())
+		goto err;
+
+	pages = virt_to_page(addr);
+
+	/*
+	 * Set up object pages: they must have PG_slab set, to avoid freeing
+	 * these as real pages.
+	 *
+	 * We also want to avoid inserting kfence_free() in the kfree()
+	 * fast-path in SLUB, and therefore need to ensure kfree() correctly
+	 * enters __slab_free() slow-path.
+	 */
+	for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
+		if (!i || (i % 2))
+			continue;
+
+		/* Verify we do not have a compound head page. */
+		if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
+			goto err;
+
+		__SetPageSlab(&pages[i]);
+	}
+
+	/*
+	 * Protect the first 2 pages. The first page is mostly unnecessary, and
+	 * merely serves as an extended guard page. However, adding one
+	 * additional page in the beginning gives us an even number of pages,
+	 * which simplifies the mapping of address to metadata index.
+	 */
+	for (i = 0; i < 2; i++) {
+		if (unlikely(!kfence_protect(addr)))
+			goto err;
+
+		addr += PAGE_SIZE;
+	}
+
+	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+		struct kfence_metadata *meta = &kfence_metadata[i];
+
+		/* Initialize metadata. */
+		INIT_LIST_HEAD(&meta->list);
+		raw_spin_lock_init(&meta->lock);
+		meta->state = KFENCE_OBJECT_UNUSED;
+		meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */
+		list_add_tail(&meta->list, &kfence_freelist);
+
+		/* Protect the right redzone. */
+		if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
+			goto err;
+
+		addr += 2 * PAGE_SIZE;
+	}
+
+	return true;
+
+err:
+	/*
+	 * Only release unprotected pages, and do not try to go back and change
+	 * page attributes due to risk of failing to do so as well. If changing
+	 * page attributes for some pages fails, it is very likely that it also
+	 * fails for the first page, and therefore expect addr==__kfence_pool in
+	 * most failure cases.
+	 */
+	memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
+	__kfence_pool = NULL;
+	return false;
+}
+
+/* === DebugFS Interface ==================================================== */
+
+static int stats_show(struct seq_file *seq, void *v)
+{
+	int i;
+
+	seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled));
+	for (i = 0; i < KFENCE_COUNTER_COUNT; i++)
+		seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i]));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(stats);
+
+/*
+ * debugfs seq_file operations for /sys/kernel/debug/kfence/objects.
+ * start_object() and next_object() return the object index + 1, because NULL is used
+ * to stop iteration.
+ */
+static void *start_object(struct seq_file *seq, loff_t *pos)
+{
+	if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
+		return (void *)((long)*pos + 1);
+	return NULL;
+}
+
+static void stop_object(struct seq_file *seq, void *v)
+{
+}
+
+static void *next_object(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;
+	if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
+		return (void *)((long)*pos + 1);
+	return NULL;
+}
+
+static int show_object(struct seq_file *seq, void *v)
+{
+	struct kfence_metadata *meta = &kfence_metadata[(long)v - 1];
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&meta->lock, flags);
+	kfence_print_object(seq, meta);
+	raw_spin_unlock_irqrestore(&meta->lock, flags);
+	seq_puts(seq, "---------------------------------\n");
+
+	return 0;
+}
+
+static const struct seq_operations object_seqops = {
+	.start = start_object,
+	.next = next_object,
+	.stop = stop_object,
+	.show = show_object,
+};
+
+static int open_objects(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &object_seqops);
+}
+
+static const struct file_operations objects_fops = {
+	.open = open_objects,
+	.read = seq_read,
+	.llseek = seq_lseek,
+};
+
+static int __init kfence_debugfs_init(void)
+{
+	struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL);
+
+	debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops);
+	debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops);
+	return 0;
+}
+
+late_initcall(kfence_debugfs_init);
+
+/* === Allocation Gate Timer ================================================ */
+
+/*
+ * Set up delayed work, which will enable and disable the static key. We need to
+ * use a work queue (rather than a simple timer), since enabling and disabling a
+ * static key cannot be done from an interrupt.
+ *
+ * Note: Toggling a static branch currently causes IPIs, and here we'll end up
+ * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with
+ * more aggressive sampling intervals), we could get away with a variant that
+ * avoids IPIs, at the cost of not immediately capturing allocations if the
+ * instructions remain cached.
+ */
+static struct delayed_work kfence_timer;
+static void toggle_allocation_gate(struct work_struct *work)
+{
+	if (!READ_ONCE(kfence_enabled))
+		return;
+
+	/* Enable static key, and await allocation to happen. */
+	atomic_set(&kfence_allocation_gate, 0);
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+	static_branch_enable(&kfence_allocation_key);
+	/*
+	 * Await an allocation. Timeout after 1 second, in case the kernel stops
+	 * doing allocations, to avoid stalling this worker task for too long.
+	 */
+	{
+		unsigned long end_wait = jiffies + HZ;
+
+		do {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (atomic_read(&kfence_allocation_gate) != 0)
+				break;
+			schedule_timeout(1);
+		} while (time_before(jiffies, end_wait));
+		__set_current_state(TASK_RUNNING);
+	}
+	/* Disable static key and reset timer. */
+	static_branch_disable(&kfence_allocation_key);
+#endif
+	schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval));
+}
+static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
+
+/* === Public interface ===================================================== */
+
+void __init kfence_alloc_pool(void)
+{
+	if (!kfence_sample_interval)
+		return;
+
+	__kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
+
+	if (!__kfence_pool)
+		pr_err("failed to allocate pool\n");
+}
+
+void __init kfence_init(void)
+{
+	/* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
+	if (!kfence_sample_interval)
+		return;
+
+	if (!kfence_init_pool()) {
+		pr_err("%s failed\n", __func__);
+		return;
+	}
+
+	WRITE_ONCE(kfence_enabled, true);
+	schedule_delayed_work(&kfence_timer, 0);
+	pr_info("initialized - using %lu bytes for %d objects", KFENCE_POOL_SIZE,
+		CONFIG_KFENCE_NUM_OBJECTS);
+	if (IS_ENABLED(CONFIG_DEBUG_KERNEL))
+		pr_cont(" at 0x%px-0x%px\n", (void *)__kfence_pool,
+			(void *)(__kfence_pool + KFENCE_POOL_SIZE));
+	else
+		pr_cont("\n");
+}
+
+void kfence_shutdown_cache(struct kmem_cache *s)
+{
+	unsigned long flags;
+	struct kfence_metadata *meta;
+	int i;
+
+	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+		bool in_use;
+
+		meta = &kfence_metadata[i];
+
+		/*
+		 * If we observe some inconsistent cache and state pair where we
+		 * should have returned false here, cache destruction is racing
+		 * with either kmem_cache_alloc() or kmem_cache_free(). Taking
+		 * the lock will not help, as different critical section
+		 * serialization will have the same outcome.
+		 */
+		if (READ_ONCE(meta->cache) != s ||
+		    READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED)
+			continue;
+
+		raw_spin_lock_irqsave(&meta->lock, flags);
+		in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED;
+		raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+		if (in_use) {
+			/*
+			 * This cache still has allocations, and we should not
+			 * release them back into the freelist so they can still
+			 * safely be used and retain the kernel's default
+			 * behaviour of keeping the allocations alive (leak the
+			 * cache); however, they effectively become "zombie
+			 * allocations" as the KFENCE objects are the only ones
+			 * still in use and the owning cache is being destroyed.
+			 *
+			 * We mark them freed, so that any subsequent use shows
+			 * more useful error messages that will include stack
+			 * traces of the user of the object, the original
+			 * allocation, and caller to shutdown_cache().
+			 */
+			kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true);
+		}
+	}
+
+	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+		meta = &kfence_metadata[i];
+
+		/* See above. */
+		if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED)
+			continue;
+
+		raw_spin_lock_irqsave(&meta->lock, flags);
+		if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED)
+			meta->cache = NULL;
+		raw_spin_unlock_irqrestore(&meta->lock, flags);
+	}
+}
+
+void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
+{
+	/*
+	 * allocation_gate only needs to become non-zero, so it doesn't make
+	 * sense to continue writing to it and pay the associated contention
+	 * cost, in case we have a large number of concurrent allocations.
+	 */
+	if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1)
+		return NULL;
+
+	if (!READ_ONCE(kfence_enabled))
+		return NULL;
+
+	if (size > PAGE_SIZE)
+		return NULL;
+
+	return kfence_guarded_alloc(s, size, flags);
+}
+
+size_t kfence_ksize(const void *addr)
+{
+	const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+	/*
+	 * Read locklessly -- if there is a race with __kfence_alloc(), this is
+	 * either a use-after-free or invalid access.
+	 */
+	return meta ? meta->size : 0;
+}
+
+void *kfence_object_start(const void *addr)
+{
+	const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+	/*
+	 * Read locklessly -- if there is a race with __kfence_alloc(), this is
+	 * either a use-after-free or invalid access.
+	 */
+	return meta ? (void *)meta->addr : NULL;
+}
+
+void __kfence_free(void *addr)
+{
+	struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+	/*
+	 * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
+	 * the object, as the object page may be recycled for other-typed
+	 * objects once it has been freed. meta->cache may be NULL if the cache
+	 * was destroyed.
+	 */
+	if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU)))
+		call_rcu(&meta->rcu_head, rcu_guarded_free);
+	else
+		kfence_guarded_free(addr, meta, false);
+}
+
+bool kfence_handle_page_fault(unsigned long addr)
+{
+	const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
+	struct kfence_metadata *to_report = NULL;
+	enum kfence_error_type error_type;
+	unsigned long flags;
+
+	if (!is_kfence_address((void *)addr))
+		return false;
+
+	if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */
+		return kfence_unprotect(addr); /* ... unprotect and proceed. */
+
+	atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+
+	if (page_index % 2) {
+		/* This is a redzone, report a buffer overflow. */
+		struct kfence_metadata *meta;
+		int distance = 0;
+
+		meta = addr_to_metadata(addr - PAGE_SIZE);
+		if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
+			to_report = meta;
+			/* Data race ok; distance calculation approximate. */
+			distance = addr - data_race(meta->addr + meta->size);
+		}
+
+		meta = addr_to_metadata(addr + PAGE_SIZE);
+		if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
+			/* Data race ok; distance calculation approximate. */
+			if (!to_report || distance > data_race(meta->addr) - addr)
+				to_report = meta;
+		}
+
+		if (!to_report)
+			goto out;
+
+		raw_spin_lock_irqsave(&to_report->lock, flags);
+		to_report->unprotected_page = addr;
+		error_type = KFENCE_ERROR_OOB;
+
+		/*
+		 * If the object was freed before we took the look we can still
+		 * report this as an OOB -- the report will simply show the
+		 * stacktrace of the free as well.
+		 */
+	} else {
+		to_report = addr_to_metadata(addr);
+		if (!to_report)
+			goto out;
+
+		raw_spin_lock_irqsave(&to_report->lock, flags);
+		error_type = KFENCE_ERROR_UAF;
+		/*
+		 * We may race with __kfence_alloc(), and it is possible that a
+		 * freed object may be reallocated. We simply report this as a
+		 * use-after-free, with the stack trace showing the place where
+		 * the object was re-allocated.
+		 */
+	}
+
+out:
+	if (to_report) {
+		kfence_report_error(addr, to_report, error_type);
+		raw_spin_unlock_irqrestore(&to_report->lock, flags);
+	} else {
+		/* This may be a UAF or OOB access, but we can't be sure. */
+		kfence_report_error(addr, NULL, KFENCE_ERROR_INVALID);
+	}
+
+	return kfence_unprotect(addr); /* Unprotect and let access proceed. */
+}
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
new file mode 100644
index 000000000000..1014060f9707
--- /dev/null
+++ b/mm/kfence/kfence.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Kernel Electric-Fence (KFENCE). For more info please see
+ * Documentation/dev-tools/kfence.rst.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef MM_KFENCE_KFENCE_H
+#define MM_KFENCE_KFENCE_H
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include "../slab.h" /* for struct kmem_cache */
+
+/* For non-debug builds, avoid leaking kernel pointers into dmesg. */
+#ifdef CONFIG_DEBUG_KERNEL
+#define PTR_FMT "%px"
+#else
+#define PTR_FMT "%p"
+#endif
+
+/*
+ * Get the canary byte pattern for @addr. Use a pattern that varies based on the
+ * lower 3 bits of the address, to detect memory corruptions with higher
+ * probability, where similar constants are used.
+ */
+#define KFENCE_CANARY_PATTERN(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))
+
+/* Maximum stack depth for reports. */
+#define KFENCE_STACK_DEPTH 64
+
+/* KFENCE object states. */
+enum kfence_object_state {
+	KFENCE_OBJECT_UNUSED,		/* Object is unused. */
+	KFENCE_OBJECT_ALLOCATED,	/* Object is currently allocated. */
+	KFENCE_OBJECT_FREED,		/* Object was allocated, and then freed. */
+};
+
+/* Alloc/free tracking information. */
+struct kfence_track {
+	pid_t pid;
+	int num_stack_entries;
+	unsigned long stack_entries[KFENCE_STACK_DEPTH];
+};
+
+/* KFENCE metadata per guarded allocation. */
+struct kfence_metadata {
+	struct list_head list;		/* Freelist node; access under kfence_freelist_lock. */
+	struct rcu_head rcu_head;	/* For delayed freeing. */
+
+	/*
+	 * Lock protecting below data; to ensure consistency of the below data,
+	 * since the following may execute concurrently: __kfence_alloc(),
+	 * __kfence_free(), kfence_handle_page_fault(). However, note that we
+	 * cannot grab the same metadata off the freelist twice, and multiple
+	 * __kfence_alloc() cannot run concurrently on the same metadata.
+	 */
+	raw_spinlock_t lock;
+
+	/* The current state of the object; see above. */
+	enum kfence_object_state state;
+
+	/*
+	 * Allocated object address; cannot be calculated from size, because of
+	 * alignment requirements.
+	 *
+	 * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant.
+	 */
+	unsigned long addr;
+
+	/*
+	 * The size of the original allocation.
+	 */
+	size_t size;
+
+	/*
+	 * The kmem_cache cache of the last allocation; NULL if never allocated
+	 * or the cache has already been destroyed.
+	 */
+	struct kmem_cache *cache;
+
+	/*
+	 * In case of an invalid access, the page that was unprotected; we
+	 * optimistically only store one address.
+	 */
+	unsigned long unprotected_page;
+
+	/* Allocation and free stack information. */
+	struct kfence_track alloc_track;
+	struct kfence_track free_track;
+};
+
+extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+
+/* KFENCE error types for report generation. */
+enum kfence_error_type {
+	KFENCE_ERROR_OOB,		/* Detected a out-of-bounds access. */
+	KFENCE_ERROR_UAF,		/* Detected a use-after-free access. */
+	KFENCE_ERROR_CORRUPTION,	/* Detected a memory corruption on free. */
+	KFENCE_ERROR_INVALID,		/* Invalid access of unknown type. */
+	KFENCE_ERROR_INVALID_FREE,	/* Invalid free. */
+};
+
+void kfence_report_error(unsigned long address, const struct kfence_metadata *meta,
+			 enum kfence_error_type type);
+
+void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);
+
+#endif /* MM_KFENCE_KFENCE_H */
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
new file mode 100644
index 000000000000..64f27c8d46a3
--- /dev/null
+++ b/mm/kfence/report.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KFENCE reporting.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#include <stdarg.h>
+
+#include <linux/kernel.h>
+#include <linux/lockdep.h>
+#include <linux/printk.h>
+#include <linux/seq_file.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+
+#include <asm/kfence.h>
+
+#include "kfence.h"
+
+/* Helper function to either print to a seq_file or to console. */
+__printf(2, 3)
+static void seq_con_printf(struct seq_file *seq, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	if (seq)
+		seq_vprintf(seq, fmt, args);
+	else
+		vprintk(fmt, args);
+	va_end(args);
+}
+
+/*
+ * Get the number of stack entries to skip to get out of MM internals. @type is
+ * optional, and if set to NULL, assumes an allocation or free stack.
+ */
+static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries,
+			    const enum kfence_error_type *type)
+{
+	char buf[64];
+	int skipnr, fallback = 0;
+	bool is_access_fault = false;
+
+	if (type) {
+		/* Depending on error type, find different stack entries. */
+		switch (*type) {
+		case KFENCE_ERROR_UAF:
+		case KFENCE_ERROR_OOB:
+		case KFENCE_ERROR_INVALID:
+			is_access_fault = true;
+			break;
+		case KFENCE_ERROR_CORRUPTION:
+		case KFENCE_ERROR_INVALID_FREE:
+			break;
+		}
+	}
+
+	for (skipnr = 0; skipnr < num_entries; skipnr++) {
+		int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]);
+
+		if (is_access_fault) {
+			if (!strncmp(buf, KFENCE_SKIP_ARCH_FAULT_HANDLER, len))
+				goto found;
+		} else {
+			if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") ||
+			    !strncmp(buf, "__slab_free", len)) {
+				/*
+				 * In case of tail calls from any of the below
+				 * to any of the above.
+				 */
+				fallback = skipnr + 1;
+			}
+
+			/* Also the *_bulk() variants by only checking prefixes. */
+			if (str_has_prefix(buf, "kfree") ||
+			    str_has_prefix(buf, "kmem_cache_free") ||
+			    str_has_prefix(buf, "__kmalloc") ||
+			    str_has_prefix(buf, "kmem_cache_alloc"))
+				goto found;
+		}
+	}
+	if (fallback < num_entries)
+		return fallback;
+found:
+	skipnr++;
+	return skipnr < num_entries ? skipnr : 0;
+}
+
+static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadata *meta,
+			       bool show_alloc)
+{
+	const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track;
+
+	if (track->num_stack_entries) {
+		/* Skip allocation/free internals stack. */
+		int i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL);
+
+		/* stack_trace_seq_print() does not exist; open code our own. */
+		for (; i < track->num_stack_entries; i++)
+			seq_con_printf(seq, " %pS\n", (void *)track->stack_entries[i]);
+	} else {
+		seq_con_printf(seq, " no %s stack\n", show_alloc ? "allocation" : "deallocation");
+	}
+}
+
+void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta)
+{
+	const int size = abs(meta->size);
+	const unsigned long start = meta->addr;
+	const struct kmem_cache *const cache = meta->cache;
+
+	lockdep_assert_held(&meta->lock);
+
+	if (meta->state == KFENCE_OBJECT_UNUSED) {
+		seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata);
+		return;
+	}
+
+	seq_con_printf(seq,
+		       "kfence-#%zd [0x" PTR_FMT "-0x" PTR_FMT
+		       ", size=%d, cache=%s] allocated by task %d:\n",
+		       meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size,
+		       (cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid);
+	kfence_print_stack(seq, meta, true);
+
+	if (meta->state == KFENCE_OBJECT_FREED) {
+		seq_con_printf(seq, "\nfreed by task %d:\n", meta->free_track.pid);
+		kfence_print_stack(seq, meta, false);
+	}
+}
+
+/*
+ * Show bytes at @addr that are different from the expected canary values, up to
+ * @max_bytes.
+ */
+static void print_diff_canary(unsigned long address, size_t bytes_to_show,
+			      const struct kfence_metadata *meta)
+{
+	const unsigned long show_until_addr = address + bytes_to_show;
+	const u8 *cur, *end;
+
+	/* Do not show contents of object nor read into following guard page. */
+	end = (const u8 *)(address < meta->addr ? min(show_until_addr, meta->addr)
+						: min(show_until_addr, PAGE_ALIGN(address)));
+
+	pr_cont("[");
+	for (cur = (const u8 *)address; cur < end; cur++) {
+		if (*cur == KFENCE_CANARY_PATTERN(cur))
+			pr_cont(" .");
+		else if (IS_ENABLED(CONFIG_DEBUG_KERNEL))
+			pr_cont(" 0x%02x", *cur);
+		else /* Do not leak kernel memory in non-debug builds. */
+			pr_cont(" !");
+	}
+	pr_cont(" ]");
+}
+
+void kfence_report_error(unsigned long address, const struct kfence_metadata *meta,
+			 enum kfence_error_type type)
+{
+	unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 };
+	int num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1);
+	int skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type);
+	const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1;
+
+	/* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */
+	if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta))
+		return;
+
+	if (meta)
+		lockdep_assert_held(&meta->lock);
+	/*
+	 * Because we may generate reports in printk-unfriendly parts of the
+	 * kernel, such as scheduler code, the use of printk() could deadlock.
+	 * Until such time that all printing code here is safe in all parts of
+	 * the kernel, accept the risk, and just get our message out (given the
+	 * system might already behave unpredictably due to the memory error).
+	 * As such, also disable lockdep to hide warnings, and avoid disabling
+	 * lockdep for the rest of the kernel.
+	 */
+	lockdep_off();
+
+	pr_err("==================================================================\n");
+	/* Print report header. */
+	switch (type) {
+	case KFENCE_ERROR_OOB: {
+		const bool left_of_object = address < meta->addr;
+
+		pr_err("BUG: KFENCE: out-of-bounds in %pS\n\n", (void *)stack_entries[skipnr]);
+		pr_err("Out-of-bounds access at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n",
+		       (void *)address,
+		       left_of_object ? meta->addr - address : address - meta->addr,
+		       left_of_object ? "left" : "right", object_index);
+		break;
+	}
+	case KFENCE_ERROR_UAF:
+		pr_err("BUG: KFENCE: use-after-free in %pS\n\n", (void *)stack_entries[skipnr]);
+		pr_err("Use-after-free access at 0x" PTR_FMT " (in kfence-#%zd):\n",
+		       (void *)address, object_index);
+		break;
+	case KFENCE_ERROR_CORRUPTION:
+		pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]);
+		pr_err("Corrupted memory at 0x" PTR_FMT " ", (void *)address);
+		print_diff_canary(address, 16, meta);
+		pr_cont(" (in kfence-#%zd):\n", object_index);
+		break;
+	case KFENCE_ERROR_INVALID:
+		pr_err("BUG: KFENCE: invalid access in %pS\n\n", (void *)stack_entries[skipnr]);
+		pr_err("Invalid access at 0x" PTR_FMT ":\n", (void *)address);
+		break;
+	case KFENCE_ERROR_INVALID_FREE:
+		pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]);
+		pr_err("Invalid free of 0x" PTR_FMT " (in kfence-#%zd):\n", (void *)address,
+		       object_index);
+		break;
+	}
+
+	/* Print stack trace and object info. */
+	stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0);
+
+	if (meta) {
+		pr_err("\n");
+		kfence_print_object(NULL, meta);
+	}
+
+	/* Print report footer. */
+	pr_err("\n");
+	dump_stack_print_info(KERN_ERR);
+	pr_err("==================================================================\n");
+
+	lockdep_on();
+
+	if (panic_on_warn)
+		panic("panic_on_warn set ...\n");
+
+	/* We encountered a memory unsafety error, taint the kernel! */
+	add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
+}
-- 
cgit v1.2.3


From d438fabce7860df3cb9337776be6f90b59ced8ed Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 25 Feb 2021 17:19:08 -0800
Subject: kfence: use pt_regs to generate stack trace on faults

Instead of removing the fault handling portion of the stack trace based on
the fault handler's name, just use struct pt_regs directly.

Change kfence_handle_page_fault() to take a struct pt_regs, and plumb it
through to kfence_report_error() for out-of-bounds, use-after-free, or
invalid access errors, where pt_regs is used to generate the stack trace.

If the kernel is a DEBUG_KERNEL, also show registers for more information.

Link: https://lkml.kernel.org/r/20201105092133.2075331-1-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Suggested-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/asm/kfence.h |  2 --
 arch/arm64/mm/fault.c           |  2 +-
 arch/x86/include/asm/kfence.h   |  6 ----
 arch/x86/mm/fault.c             |  2 +-
 include/linux/kfence.h          |  5 ++--
 mm/kfence/core.c                | 10 +++----
 mm/kfence/kfence.h              |  4 +--
 mm/kfence/report.c              | 63 +++++++++++++++++++++++------------------
 8 files changed, 48 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h
index 42a06f83850a..d061176d57ea 100644
--- a/arch/arm64/include/asm/kfence.h
+++ b/arch/arm64/include/asm/kfence.h
@@ -10,8 +10,6 @@
 
 #include <asm/cacheflush.h>
 
-#define KFENCE_SKIP_ARCH_FAULT_HANDLER "el1_sync"
-
 static inline bool arch_kfence_init_pool(void) { return true; }
 
 static inline bool kfence_protect_page(unsigned long addr, bool protect)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 42515900ab2e..56d9423ca59c 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -390,7 +390,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	} else if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
-		if (kfence_handle_page_fault(addr))
+		if (kfence_handle_page_fault(addr, regs))
 			return;
 
 		msg = "paging request";
diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
index a0659dbd93ea..97bbb4a9083a 100644
--- a/arch/x86/include/asm/kfence.h
+++ b/arch/x86/include/asm/kfence.h
@@ -16,12 +16,6 @@
 #include <asm/set_memory.h>
 #include <asm/tlbflush.h>
 
-/*
- * The page fault handler entry function, up to which the stack trace is
- * truncated in reports.
- */
-#define KFENCE_SKIP_ARCH_FAULT_HANDLER "asm_exc_page_fault"
-
 /* Force 4K pages for __kfence_pool. */
 static inline bool arch_kfence_init_pool(void)
 {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 99fe6d3e690d..38868b4ce8b0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -682,7 +682,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		efi_crash_gracefully_on_page_fault(address);
 
 	/* Only not-present faults should be handled by KFENCE. */
-	if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address))
+	if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, regs))
 		return;
 
 oops:
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 81f3911cb298..5a56bcf5606c 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -186,6 +186,7 @@ static __always_inline __must_check bool kfence_free(void *addr)
 /**
  * kfence_handle_page_fault() - perform page fault handling for KFENCE pages
  * @addr: faulting address
+ * @regs: current struct pt_regs (can be NULL, but shows full stack trace)
  *
  * Return:
  * * false - address outside KFENCE pool,
@@ -196,7 +197,7 @@ static __always_inline __must_check bool kfence_free(void *addr)
  * cases KFENCE prints an error message and marks the offending page as
  * present, so that the kernel can proceed.
  */
-bool __must_check kfence_handle_page_fault(unsigned long addr);
+bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs);
 
 #else /* CONFIG_KFENCE */
 
@@ -209,7 +210,7 @@ static inline size_t kfence_ksize(const void *addr) { return 0; }
 static inline void *kfence_object_start(const void *addr) { return NULL; }
 static inline void __kfence_free(void *addr) { }
 static inline bool __must_check kfence_free(void *addr) { return false; }
-static inline bool __must_check kfence_handle_page_fault(unsigned long addr) { return false; }
+static inline bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) { return false; }
 
 #endif
 
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index d6a32c13336b..61c76670a7a9 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -216,7 +216,7 @@ static inline bool check_canary_byte(u8 *addr)
 		return true;
 
 	atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
-	kfence_report_error((unsigned long)addr, addr_to_metadata((unsigned long)addr),
+	kfence_report_error((unsigned long)addr, NULL, addr_to_metadata((unsigned long)addr),
 			    KFENCE_ERROR_CORRUPTION);
 	return false;
 }
@@ -351,7 +351,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 	if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
 		/* Invalid or double-free, bail out. */
 		atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
-		kfence_report_error((unsigned long)addr, meta, KFENCE_ERROR_INVALID_FREE);
+		kfence_report_error((unsigned long)addr, NULL, meta, KFENCE_ERROR_INVALID_FREE);
 		raw_spin_unlock_irqrestore(&meta->lock, flags);
 		return;
 	}
@@ -766,7 +766,7 @@ void __kfence_free(void *addr)
 		kfence_guarded_free(addr, meta, false);
 }
 
-bool kfence_handle_page_fault(unsigned long addr)
+bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs)
 {
 	const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
 	struct kfence_metadata *to_report = NULL;
@@ -829,11 +829,11 @@ bool kfence_handle_page_fault(unsigned long addr)
 
 out:
 	if (to_report) {
-		kfence_report_error(addr, to_report, error_type);
+		kfence_report_error(addr, regs, to_report, error_type);
 		raw_spin_unlock_irqrestore(&to_report->lock, flags);
 	} else {
 		/* This may be a UAF or OOB access, but we can't be sure. */
-		kfence_report_error(addr, NULL, KFENCE_ERROR_INVALID);
+		kfence_report_error(addr, regs, NULL, KFENCE_ERROR_INVALID);
 	}
 
 	return kfence_unprotect(addr); /* Unprotect and let access proceed. */
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 1014060f9707..0d83e628a97d 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -105,8 +105,8 @@ enum kfence_error_type {
 	KFENCE_ERROR_INVALID_FREE,	/* Invalid free. */
 };
 
-void kfence_report_error(unsigned long address, const struct kfence_metadata *meta,
-			 enum kfence_error_type type);
+void kfence_report_error(unsigned long address, struct pt_regs *regs,
+			 const struct kfence_metadata *meta, enum kfence_error_type type);
 
 void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);
 
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index 64f27c8d46a3..4dbfa9a382e4 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/lockdep.h>
 #include <linux/printk.h>
+#include <linux/sched/debug.h>
 #include <linux/seq_file.h>
 #include <linux/stacktrace.h>
 #include <linux/string.h>
@@ -41,7 +42,6 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
 {
 	char buf[64];
 	int skipnr, fallback = 0;
-	bool is_access_fault = false;
 
 	if (type) {
 		/* Depending on error type, find different stack entries. */
@@ -49,8 +49,12 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
 		case KFENCE_ERROR_UAF:
 		case KFENCE_ERROR_OOB:
 		case KFENCE_ERROR_INVALID:
-			is_access_fault = true;
-			break;
+			/*
+			 * kfence_handle_page_fault() may be called with pt_regs
+			 * set to NULL; in that case we'll simply show the full
+			 * stack trace.
+			 */
+			return 0;
 		case KFENCE_ERROR_CORRUPTION:
 		case KFENCE_ERROR_INVALID_FREE:
 			break;
@@ -60,26 +64,21 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
 	for (skipnr = 0; skipnr < num_entries; skipnr++) {
 		int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]);
 
-		if (is_access_fault) {
-			if (!strncmp(buf, KFENCE_SKIP_ARCH_FAULT_HANDLER, len))
-				goto found;
-		} else {
-			if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") ||
-			    !strncmp(buf, "__slab_free", len)) {
-				/*
-				 * In case of tail calls from any of the below
-				 * to any of the above.
-				 */
-				fallback = skipnr + 1;
-			}
-
-			/* Also the *_bulk() variants by only checking prefixes. */
-			if (str_has_prefix(buf, "kfree") ||
-			    str_has_prefix(buf, "kmem_cache_free") ||
-			    str_has_prefix(buf, "__kmalloc") ||
-			    str_has_prefix(buf, "kmem_cache_alloc"))
-				goto found;
+		if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") ||
+		    !strncmp(buf, "__slab_free", len)) {
+			/*
+			 * In case of tail calls from any of the below
+			 * to any of the above.
+			 */
+			fallback = skipnr + 1;
 		}
+
+		/* Also the *_bulk() variants by only checking prefixes. */
+		if (str_has_prefix(buf, "kfree") ||
+		    str_has_prefix(buf, "kmem_cache_free") ||
+		    str_has_prefix(buf, "__kmalloc") ||
+		    str_has_prefix(buf, "kmem_cache_alloc"))
+			goto found;
 	}
 	if (fallback < num_entries)
 		return fallback;
@@ -157,13 +156,20 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show,
 	pr_cont(" ]");
 }
 
-void kfence_report_error(unsigned long address, const struct kfence_metadata *meta,
-			 enum kfence_error_type type)
+void kfence_report_error(unsigned long address, struct pt_regs *regs,
+			 const struct kfence_metadata *meta, enum kfence_error_type type)
 {
 	unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 };
-	int num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1);
-	int skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type);
 	const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1;
+	int num_stack_entries;
+	int skipnr = 0;
+
+	if (regs) {
+		num_stack_entries = stack_trace_save_regs(regs, stack_entries, KFENCE_STACK_DEPTH, 0);
+	} else {
+		num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1);
+		skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type);
+	}
 
 	/* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */
 	if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta))
@@ -227,7 +233,10 @@ void kfence_report_error(unsigned long address, const struct kfence_metadata *me
 
 	/* Print report footer. */
 	pr_err("\n");
-	dump_stack_print_info(KERN_ERR);
+	if (IS_ENABLED(CONFIG_DEBUG_KERNEL) && regs)
+		show_regs(regs);
+	else
+		dump_stack_print_info(KERN_ERR);
 	pr_err("==================================================================\n");
 
 	lockdep_on();
-- 
cgit v1.2.3


From d3fb45f370d927224af35d22d34ea465884afec8 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 25 Feb 2021 17:19:11 -0800
Subject: mm, kfence: insert KFENCE hooks for SLAB

Inserts KFENCE hooks into the SLAB allocator.

To pass the originally requested size to KFENCE, add an argument
'orig_size' to slab_alloc*(). The additional argument is required to
preserve the requested original size for kmalloc() allocations, which
uses size classes (e.g. an allocation of 272 bytes will return an object
of size 512). Therefore, kmem_cache::size does not represent the
kmalloc-caller's requested size, and we must introduce the argument
'orig_size' to propagate the originally requested size to KFENCE.

Without the originally requested size, we would not be able to detect
out-of-bounds accesses for objects placed at the end of a KFENCE object
page if that object is not equal to the kmalloc-size class it was
bucketed into.

When KFENCE is disabled, there is no additional overhead, since
slab_alloc*() functions are __always_inline.

Link: https://lkml.kernel.org/r/20201103175841.3495947-5-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Co-developed-by: Marco Elver <elver@google.com>

Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Joern Engel <joern@purestorage.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: SeongJae Park <sjpark@amazon.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab_def.h |  3 +++
 mm/kfence/core.c         |  2 ++
 mm/slab.c                | 38 +++++++++++++++++++++++++++++---------
 mm/slab_common.c         |  5 ++++-
 4 files changed, 38 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 9eb430c163c2..3aa5e1e73ab6 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_SLAB_DEF_H
 #define	_LINUX_SLAB_DEF_H
 
+#include <linux/kfence.h>
 #include <linux/reciprocal_div.h>
 
 /*
@@ -114,6 +115,8 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
 static inline int objs_per_slab_page(const struct kmem_cache *cache,
 				     const struct page *page)
 {
+	if (is_kfence_address(page_address(page)))
+		return 1;
 	return cache->num;
 }
 
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 61c76670a7a9..05c18aa11851 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -317,6 +317,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	/* Set required struct page fields. */
 	page = virt_to_page(meta->addr);
 	page->slab_cache = cache;
+	if (IS_ENABLED(CONFIG_SLAB))
+		page->s_mem = addr;
 
 	raw_spin_unlock_irqrestore(&meta->lock, flags);
 
diff --git a/mm/slab.c b/mm/slab.c
index 35c68d99d460..51fd424e0d6d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -100,6 +100,7 @@
 #include	<linux/seq_file.h>
 #include	<linux/notifier.h>
 #include	<linux/kallsyms.h>
+#include	<linux/kfence.h>
 #include	<linux/cpu.h>
 #include	<linux/sysctl.h>
 #include	<linux/module.h>
@@ -3208,7 +3209,7 @@ must_grow:
 }
 
 static __always_inline void *
-slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
 		   unsigned long caller)
 {
 	unsigned long save_flags;
@@ -3221,6 +3222,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	if (unlikely(!cachep))
 		return NULL;
 
+	ptr = kfence_alloc(cachep, orig_size, flags);
+	if (unlikely(ptr))
+		goto out_hooks;
+
 	cache_alloc_debugcheck_before(cachep, flags);
 	local_irq_save(save_flags);
 
@@ -3253,6 +3258,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
 		memset(ptr, 0, cachep->object_size);
 
+out_hooks:
 	slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
 	return ptr;
 }
@@ -3290,7 +3296,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 #endif /* CONFIG_NUMA */
 
 static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
+slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
 {
 	unsigned long save_flags;
 	void *objp;
@@ -3301,6 +3307,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 	if (unlikely(!cachep))
 		return NULL;
 
+	objp = kfence_alloc(cachep, orig_size, flags);
+	if (unlikely(objp))
+		goto out;
+
 	cache_alloc_debugcheck_before(cachep, flags);
 	local_irq_save(save_flags);
 	objp = __do_cache_alloc(cachep, flags);
@@ -3311,6 +3321,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
 		memset(objp, 0, cachep->object_size);
 
+out:
 	slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
 	return objp;
 }
@@ -3416,6 +3427,12 @@ free_done:
 static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
 					 unsigned long caller)
 {
+	if (is_kfence_address(objp)) {
+		kmemleak_free_recursive(objp, cachep->flags);
+		__kfence_free(objp);
+		return;
+	}
+
 	if (unlikely(slab_want_init_on_free(cachep)))
 		memset(objp, 0, cachep->object_size);
 
@@ -3482,7 +3499,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
  */
 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
-	void *ret = slab_alloc(cachep, flags, _RET_IP_);
+	void *ret = slab_alloc(cachep, flags, cachep->object_size, _RET_IP_);
 
 	trace_kmem_cache_alloc(_RET_IP_, ret,
 			       cachep->object_size, cachep->size, flags);
@@ -3515,7 +3532,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 
 	local_irq_disable();
 	for (i = 0; i < size; i++) {
-		void *objp = __do_cache_alloc(s, flags);
+		void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags);
 
 		if (unlikely(!objp))
 			goto error;
@@ -3548,7 +3565,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
 {
 	void *ret;
 
-	ret = slab_alloc(cachep, flags, _RET_IP_);
+	ret = slab_alloc(cachep, flags, size, _RET_IP_);
 
 	ret = kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc(_RET_IP_, ret,
@@ -3574,7 +3591,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
  */
 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
-	void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+	void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
 
 	trace_kmem_cache_alloc_node(_RET_IP_, ret,
 				    cachep->object_size, cachep->size,
@@ -3592,7 +3609,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
 {
 	void *ret;
 
-	ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+	ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_);
 
 	ret = kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc_node(_RET_IP_, ret,
@@ -3673,7 +3690,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 	cachep = kmalloc_slab(size, flags);
 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
 		return cachep;
-	ret = slab_alloc(cachep, flags, caller);
+	ret = slab_alloc(cachep, flags, size, caller);
 
 	ret = kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc(caller, ret,
@@ -4172,7 +4189,10 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
 	BUG_ON(objnr >= cachep->num);
 
 	/* Find offset within object. */
-	offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
+	if (is_kfence_address(ptr))
+		offset = ptr - kfence_object_start(ptr);
+	else
+		offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
 
 	/* Allow address range falling entirely within usercopy region. */
 	if (offset >= cachep->useroffset &&
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 7c8298c17145..284954ef1da5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -12,6 +12,7 @@
 #include <linux/memory.h>
 #include <linux/cache.h>
 #include <linux/compiler.h>
+#include <linux/kfence.h>
 #include <linux/module.h>
 #include <linux/cpu.h>
 #include <linux/uaccess.h>
@@ -430,6 +431,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
 	rcu_barrier();
 
 	list_for_each_entry_safe(s, s2, &to_destroy, list) {
+		kfence_shutdown_cache(s);
 #ifdef SLAB_SUPPORTS_SYSFS
 		sysfs_slab_release(s);
 #else
@@ -455,6 +457,7 @@ static int shutdown_cache(struct kmem_cache *s)
 		list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
 		schedule_work(&slab_caches_to_rcu_destroy_work);
 	} else {
+		kfence_shutdown_cache(s);
 #ifdef SLAB_SUPPORTS_SYSFS
 		sysfs_slab_unlink(s);
 		sysfs_slab_release(s);
@@ -1235,7 +1238,7 @@ size_t ksize(const void *objp)
 	if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
 		return 0;
 
-	size = __ksize(objp);
+	size = kfence_ksize(objp) ?: __ksize(objp);
 	/*
 	 * We assume that ksize callers could use whole allocated area,
 	 * so we need to unpoison this area.
-- 
cgit v1.2.3


From b89fb5ef0ce611b5db8eb9d3a5a7fcaab2cbe9e4 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 25 Feb 2021 17:19:16 -0800
Subject: mm, kfence: insert KFENCE hooks for SLUB

Inserts KFENCE hooks into the SLUB allocator.

To pass the originally requested size to KFENCE, add an argument
'orig_size' to slab_alloc*(). The additional argument is required to
preserve the requested original size for kmalloc() allocations, which
uses size classes (e.g. an allocation of 272 bytes will return an object
of size 512). Therefore, kmem_cache::size does not represent the
kmalloc-caller's requested size, and we must introduce the argument
'orig_size' to propagate the originally requested size to KFENCE.

Without the originally requested size, we would not be able to detect
out-of-bounds accesses for objects placed at the end of a KFENCE object
page if that object is not equal to the kmalloc-size class it was
bucketed into.

When KFENCE is disabled, there is no additional overhead, since
slab_alloc*() functions are __always_inline.

Link: https://lkml.kernel.org/r/20201103175841.3495947-6-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Jann Horn <jannh@google.com>
Co-developed-by: Marco Elver <elver@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joern Engel <joern@purestorage.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: SeongJae Park <sjpark@amazon.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slub_def.h |  3 +++
 mm/kfence/core.c         |  2 ++
 mm/slub.c                | 60 +++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 51 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 1be0ed5befa1..dcde82a4434c 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -7,6 +7,7 @@
  *
  * (C) 2007 SGI, Christoph Lameter
  */
+#include <linux/kfence.h>
 #include <linux/kobject.h>
 #include <linux/reciprocal_div.h>
 
@@ -185,6 +186,8 @@ static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
 					const struct page *page, void *obj)
 {
+	if (is_kfence_address(obj))
+		return 0;
 	return __obj_to_index(cache, page_address(page), obj);
 }
 
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 05c18aa11851..7692af715fdb 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -317,6 +317,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	/* Set required struct page fields. */
 	page = virt_to_page(meta->addr);
 	page->slab_cache = cache;
+	if (IS_ENABLED(CONFIG_SLUB))
+		page->objects = 1;
 	if (IS_ENABLED(CONFIG_SLAB))
 		page->s_mem = addr;
 
diff --git a/mm/slub.c b/mm/slub.c
index b2833ce85c92..383616af28c4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -27,6 +27,7 @@
 #include <linux/ctype.h>
 #include <linux/debugobjects.h>
 #include <linux/kallsyms.h>
+#include <linux/kfence.h>
 #include <linux/memory.h>
 #include <linux/math64.h>
 #include <linux/fault-inject.h>
@@ -1570,6 +1571,11 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
 	void *old_tail = *tail ? *tail : *head;
 	int rsize;
 
+	if (is_kfence_address(next)) {
+		slab_free_hook(s, next);
+		return true;
+	}
+
 	/* Head and tail of the reconstructed freelist */
 	*head = NULL;
 	*tail = NULL;
@@ -2809,7 +2815,7 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
  * Otherwise we can simply pick the next object from the lockless free list.
  */
 static __always_inline void *slab_alloc_node(struct kmem_cache *s,
-		gfp_t gfpflags, int node, unsigned long addr)
+		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
 {
 	void *object;
 	struct kmem_cache_cpu *c;
@@ -2820,6 +2826,11 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 	s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
 	if (!s)
 		return NULL;
+
+	object = kfence_alloc(s, orig_size, gfpflags);
+	if (unlikely(object))
+		goto out;
+
 redo:
 	/*
 	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
@@ -2892,20 +2903,21 @@ redo:
 	if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
 		memset(kasan_reset_tag(object), 0, s->object_size);
 
+out:
 	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
 
 	return object;
 }
 
 static __always_inline void *slab_alloc(struct kmem_cache *s,
-		gfp_t gfpflags, unsigned long addr)
+		gfp_t gfpflags, unsigned long addr, size_t orig_size)
 {
-	return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
+	return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size);
 }
 
 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
 {
-	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+	void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size);
 
 	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
 				s->size, gfpflags);
@@ -2917,7 +2929,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
 #ifdef CONFIG_TRACING
 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 {
-	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+	void *ret = slab_alloc(s, gfpflags, _RET_IP_, size);
 	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
 	ret = kasan_kmalloc(s, ret, size, gfpflags);
 	return ret;
@@ -2928,7 +2940,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
 #ifdef CONFIG_NUMA
 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
 {
-	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size);
 
 	trace_kmem_cache_alloc_node(_RET_IP_, ret,
 				    s->object_size, s->size, gfpflags, node);
@@ -2942,7 +2954,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
 				    gfp_t gfpflags,
 				    int node, size_t size)
 {
-	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size);
 
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, s->size, gfpflags, node);
@@ -2976,6 +2988,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 
 	stat(s, FREE_SLOWPATH);
 
+	if (kfence_free(head))
+		return;
+
 	if (kmem_cache_debug(s) &&
 	    !free_debug_processing(s, page, head, tail, cnt, addr))
 		return;
@@ -3220,6 +3235,13 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
 		df->s = cache_from_obj(s, object); /* Support for memcg */
 	}
 
+	if (is_kfence_address(object)) {
+		slab_free_hook(df->s, object);
+		__kfence_free(object);
+		p[size] = NULL; /* mark object processed */
+		return size;
+	}
+
 	/* Start new detached freelist */
 	df->page = page;
 	set_freepointer(df->s, object, NULL);
@@ -3295,8 +3317,14 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	c = this_cpu_ptr(s->cpu_slab);
 
 	for (i = 0; i < size; i++) {
-		void *object = c->freelist;
+		void *object = kfence_alloc(s, s->object_size, flags);
 
+		if (unlikely(object)) {
+			p[i] = object;
+			continue;
+		}
+
+		object = c->freelist;
 		if (unlikely(!object)) {
 			/*
 			 * We may have removed an object from c->freelist using
@@ -4021,7 +4049,7 @@ void *__kmalloc(size_t size, gfp_t flags)
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	ret = slab_alloc(s, flags, _RET_IP_);
+	ret = slab_alloc(s, flags, _RET_IP_, size);
 
 	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
 
@@ -4069,7 +4097,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	ret = slab_alloc_node(s, flags, node, _RET_IP_);
+	ret = slab_alloc_node(s, flags, node, _RET_IP_, size);
 
 	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
 
@@ -4095,6 +4123,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
 	struct kmem_cache *s;
 	unsigned int offset;
 	size_t object_size;
+	bool is_kfence = is_kfence_address(ptr);
 
 	ptr = kasan_reset_tag(ptr);
 
@@ -4107,10 +4136,13 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
 			       to_user, 0, n);
 
 	/* Find offset within object. */
-	offset = (ptr - page_address(page)) % s->size;
+	if (is_kfence)
+		offset = ptr - kfence_object_start(ptr);
+	else
+		offset = (ptr - page_address(page)) % s->size;
 
 	/* Adjust for redzone and reject if within the redzone. */
-	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
+	if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
 		if (offset < s->red_left_pad)
 			usercopy_abort("SLUB object in left red zone",
 				       s->name, to_user, offset, n);
@@ -4527,7 +4559,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	ret = slab_alloc(s, gfpflags, caller);
+	ret = slab_alloc(s, gfpflags, caller, size);
 
 	/* Honor the call site pointer we received. */
 	trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -4558,7 +4590,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	ret = slab_alloc_node(s, gfpflags, node, caller);
+	ret = slab_alloc_node(s, gfpflags, node, caller, size);
 
 	/* Honor the call site pointer we received. */
 	trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
-- 
cgit v1.2.3


From bc8fbc5f305aecf63423da91e5faf4c0ce40bf38 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 25 Feb 2021 17:19:31 -0800
Subject: kfence: add test suite

Add KFENCE test suite, testing various error detection scenarios. Makes
use of KUnit for test organization. Since KFENCE's interface to obtain
error reports is via the console, the test verifies that KFENCE outputs
expected reports to the console.

[elver@google.com: fix typo in test]
  Link: https://lkml.kernel.org/r/X9lHQExmHGvETxY4@elver.google.com
[elver@google.com: show access type in report]
  Link: https://lkml.kernel.org/r/20210111091544.3287013-2-elver@google.com

Link: https://lkml.kernel.org/r/20201103175841.3495947-9-elver@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Co-developed-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Jann Horn <jannh@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joern Engel <joern@purestorage.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: SeongJae Park <sjpark@amazon.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/dev-tools/kfence.rst |  12 +-
 arch/arm64/mm/fault.c              |   2 +-
 arch/x86/mm/fault.c                |   3 +-
 include/linux/kfence.h             |   9 +-
 lib/Kconfig.kfence                 |  13 +
 mm/kfence/Makefile                 |   3 +
 mm/kfence/core.c                   |  11 +-
 mm/kfence/kfence.h                 |   2 +-
 mm/kfence/kfence_test.c            | 858 +++++++++++++++++++++++++++++++++++++
 mm/kfence/report.c                 |  27 +-
 10 files changed, 915 insertions(+), 25 deletions(-)
 create mode 100644 mm/kfence/kfence_test.c

(limited to 'include')

diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
index 0e2fb6ef3016..58a0a5fa1ddc 100644
--- a/Documentation/dev-tools/kfence.rst
+++ b/Documentation/dev-tools/kfence.rst
@@ -65,9 +65,9 @@ Error reports
 A typical out-of-bounds access looks like this::
 
     ==================================================================
-    BUG: KFENCE: out-of-bounds in test_out_of_bounds_read+0xa3/0x22b
+    BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xa3/0x22b
 
-    Out-of-bounds access at 0xffffffffb672efff (1B left of kfence-#17):
+    Out-of-bounds read at 0xffffffffb672efff (1B left of kfence-#17):
      test_out_of_bounds_read+0xa3/0x22b
      kunit_try_run_case+0x51/0x85
      kunit_generic_run_threadfn_adapter+0x16/0x30
@@ -94,9 +94,9 @@ its origin. Note that, real kernel addresses are only shown for
 Use-after-free accesses are reported as::
 
     ==================================================================
-    BUG: KFENCE: use-after-free in test_use_after_free_read+0xb3/0x143
+    BUG: KFENCE: use-after-free read in test_use_after_free_read+0xb3/0x143
 
-    Use-after-free access at 0xffffffffb673dfe0 (in kfence-#24):
+    Use-after-free read at 0xffffffffb673dfe0 (in kfence-#24):
      test_use_after_free_read+0xb3/0x143
      kunit_try_run_case+0x51/0x85
      kunit_generic_run_threadfn_adapter+0x16/0x30
@@ -193,9 +193,9 @@ where it was not possible to determine an associated object, e.g. if adjacent
 object pages had not yet been allocated::
 
     ==================================================================
-    BUG: KFENCE: invalid access in test_invalid_access+0x26/0xe0
+    BUG: KFENCE: invalid read in test_invalid_access+0x26/0xe0
 
-    Invalid access at 0xffffffffb670b00a:
+    Invalid read at 0xffffffffb670b00a:
      test_invalid_access+0x26/0xe0
      kunit_try_run_case+0x51/0x85
      kunit_generic_run_threadfn_adapter+0x16/0x30
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 56d9423ca59c..f37d4e3830b7 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -390,7 +390,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	} else if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
-		if (kfence_handle_page_fault(addr, regs))
+		if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
 			return;
 
 		msg = "paging request";
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 38868b4ce8b0..a73347e2cdfc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -682,7 +682,8 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		efi_crash_gracefully_on_page_fault(address);
 
 	/* Only not-present faults should be handled by KFENCE. */
-	if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, regs))
+	if (!(error_code & X86_PF_PROT) &&
+	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
 		return;
 
 oops:
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 5a56bcf5606c..a70d1ea03532 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -186,6 +186,7 @@ static __always_inline __must_check bool kfence_free(void *addr)
 /**
  * kfence_handle_page_fault() - perform page fault handling for KFENCE pages
  * @addr: faulting address
+ * @is_write: is access a write
  * @regs: current struct pt_regs (can be NULL, but shows full stack trace)
  *
  * Return:
@@ -197,7 +198,7 @@ static __always_inline __must_check bool kfence_free(void *addr)
  * cases KFENCE prints an error message and marks the offending page as
  * present, so that the kernel can proceed.
  */
-bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs);
+bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs);
 
 #else /* CONFIG_KFENCE */
 
@@ -210,7 +211,11 @@ static inline size_t kfence_ksize(const void *addr) { return 0; }
 static inline void *kfence_object_start(const void *addr) { return NULL; }
 static inline void __kfence_free(void *addr) { }
 static inline bool __must_check kfence_free(void *addr) { return false; }
-static inline bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) { return false; }
+static inline bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write,
+							 struct pt_regs *regs)
+{
+	return false;
+}
 
 #endif
 
diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence
index 605125ac2ae0..78f50ccb3b45 100644
--- a/lib/Kconfig.kfence
+++ b/lib/Kconfig.kfence
@@ -66,4 +66,17 @@ config KFENCE_STRESS_TEST_FAULTS
 
 	  Only for KFENCE testing; set to 0 if you are not a KFENCE developer.
 
+config KFENCE_KUNIT_TEST
+	tristate "KFENCE integration test suite" if !KUNIT_ALL_TESTS
+	default KUNIT_ALL_TESTS
+	depends on TRACEPOINTS && KUNIT
+	help
+	  Test suite for KFENCE, testing various error detection scenarios with
+	  various allocation types, and checking that reports are correctly
+	  output to console.
+
+	  Say Y here if you want the test to be built into the kernel and run
+	  during boot; say M if you want the test to build as a module; say N
+	  if you are unsure.
+
 endif # KFENCE
diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile
index d991e9a349f0..6872cd5e5390 100644
--- a/mm/kfence/Makefile
+++ b/mm/kfence/Makefile
@@ -1,3 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-$(CONFIG_KFENCE) := core.o report.o
+
+CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls
+obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 7692af715fdb..cfe3d32ac5b7 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -216,7 +216,7 @@ static inline bool check_canary_byte(u8 *addr)
 		return true;
 
 	atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
-	kfence_report_error((unsigned long)addr, NULL, addr_to_metadata((unsigned long)addr),
+	kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr),
 			    KFENCE_ERROR_CORRUPTION);
 	return false;
 }
@@ -355,7 +355,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 	if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
 		/* Invalid or double-free, bail out. */
 		atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
-		kfence_report_error((unsigned long)addr, NULL, meta, KFENCE_ERROR_INVALID_FREE);
+		kfence_report_error((unsigned long)addr, false, NULL, meta,
+				    KFENCE_ERROR_INVALID_FREE);
 		raw_spin_unlock_irqrestore(&meta->lock, flags);
 		return;
 	}
@@ -770,7 +771,7 @@ void __kfence_free(void *addr)
 		kfence_guarded_free(addr, meta, false);
 }
 
-bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs)
+bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
 {
 	const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
 	struct kfence_metadata *to_report = NULL;
@@ -833,11 +834,11 @@ bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs)
 
 out:
 	if (to_report) {
-		kfence_report_error(addr, regs, to_report, error_type);
+		kfence_report_error(addr, is_write, regs, to_report, error_type);
 		raw_spin_unlock_irqrestore(&to_report->lock, flags);
 	} else {
 		/* This may be a UAF or OOB access, but we can't be sure. */
-		kfence_report_error(addr, regs, NULL, KFENCE_ERROR_INVALID);
+		kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID);
 	}
 
 	return kfence_unprotect(addr); /* Unprotect and let access proceed. */
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 0d83e628a97d..1accc840dbbe 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -105,7 +105,7 @@ enum kfence_error_type {
 	KFENCE_ERROR_INVALID_FREE,	/* Invalid free. */
 };
 
-void kfence_report_error(unsigned long address, struct pt_regs *regs,
+void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
 			 const struct kfence_metadata *meta, enum kfence_error_type type);
 
 void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
new file mode 100644
index 000000000000..db1bb596acaf
--- /dev/null
+++ b/mm/kfence/kfence_test.c
@@ -0,0 +1,858 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test cases for KFENCE memory safety error detector. Since the interface with
+ * which KFENCE's reports are obtained is via the console, this is the output we
+ * should verify. For each test case checks the presence (or absence) of
+ * generated reports. Relies on 'console' tracepoint to capture reports as they
+ * appear in the kernel log.
+ *
+ * Copyright (C) 2020, Google LLC.
+ * Author: Alexander Potapenko <glider@google.com>
+ *         Marco Elver <elver@google.com>
+ */
+
+#include <kunit/test.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/kfence.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/tracepoint.h>
+#include <trace/events/printk.h>
+
+#include "kfence.h"
+
+/* Report as observed from console. */
+static struct {
+	spinlock_t lock;
+	int nlines;
+	char lines[2][256];
+} observed = {
+	.lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/* Probe for console output: obtains observed lines of interest. */
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+	unsigned long flags;
+	int nlines;
+
+	spin_lock_irqsave(&observed.lock, flags);
+	nlines = observed.nlines;
+
+	if (strnstr(buf, "BUG: KFENCE: ", len) && strnstr(buf, "test_", len)) {
+		/*
+		 * KFENCE report and related to the test.
+		 *
+		 * The provided @buf is not NUL-terminated; copy no more than
+		 * @len bytes and let strscpy() add the missing NUL-terminator.
+		 */
+		strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0])));
+		nlines = 1;
+	} else if (nlines == 1 && (strnstr(buf, "at 0x", len) || strnstr(buf, "of 0x", len))) {
+		strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0])));
+	}
+
+	WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */
+	spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* Check if a report related to the test exists. */
+static bool report_available(void)
+{
+	return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines);
+}
+
+/* Information we expect in a report. */
+struct expect_report {
+	enum kfence_error_type type; /* The type or error. */
+	void *fn; /* Function pointer to expected function where access occurred. */
+	char *addr; /* Address at which the bad access occurred. */
+	bool is_write; /* Is access a write. */
+};
+
+static const char *get_access_type(const struct expect_report *r)
+{
+	return r->is_write ? "write" : "read";
+}
+
+/* Check observed report matches information in @r. */
+static bool report_matches(const struct expect_report *r)
+{
+	bool ret = false;
+	unsigned long flags;
+	typeof(observed.lines) expect;
+	const char *end;
+	char *cur;
+
+	/* Doubled-checked locking. */
+	if (!report_available())
+		return false;
+
+	/* Generate expected report contents. */
+
+	/* Title */
+	cur = expect[0];
+	end = &expect[0][sizeof(expect[0]) - 1];
+	switch (r->type) {
+	case KFENCE_ERROR_OOB:
+		cur += scnprintf(cur, end - cur, "BUG: KFENCE: out-of-bounds %s",
+				 get_access_type(r));
+		break;
+	case KFENCE_ERROR_UAF:
+		cur += scnprintf(cur, end - cur, "BUG: KFENCE: use-after-free %s",
+				 get_access_type(r));
+		break;
+	case KFENCE_ERROR_CORRUPTION:
+		cur += scnprintf(cur, end - cur, "BUG: KFENCE: memory corruption");
+		break;
+	case KFENCE_ERROR_INVALID:
+		cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid %s",
+				 get_access_type(r));
+		break;
+	case KFENCE_ERROR_INVALID_FREE:
+		cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid free");
+		break;
+	}
+
+	scnprintf(cur, end - cur, " in %pS", r->fn);
+	/* The exact offset won't match, remove it; also strip module name. */
+	cur = strchr(expect[0], '+');
+	if (cur)
+		*cur = '\0';
+
+	/* Access information */
+	cur = expect[1];
+	end = &expect[1][sizeof(expect[1]) - 1];
+
+	switch (r->type) {
+	case KFENCE_ERROR_OOB:
+		cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r));
+		break;
+	case KFENCE_ERROR_UAF:
+		cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r));
+		break;
+	case KFENCE_ERROR_CORRUPTION:
+		cur += scnprintf(cur, end - cur, "Corrupted memory at");
+		break;
+	case KFENCE_ERROR_INVALID:
+		cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r));
+		break;
+	case KFENCE_ERROR_INVALID_FREE:
+		cur += scnprintf(cur, end - cur, "Invalid free of");
+		break;
+	}
+
+	cur += scnprintf(cur, end - cur, " 0x" PTR_FMT, (void *)r->addr);
+
+	spin_lock_irqsave(&observed.lock, flags);
+	if (!report_available())
+		goto out; /* A new report is being captured. */
+
+	/* Finally match expected output to what we actually observed. */
+	ret = strstr(observed.lines[0], expect[0]) && strstr(observed.lines[1], expect[1]);
+out:
+	spin_unlock_irqrestore(&observed.lock, flags);
+	return ret;
+}
+
+/* ===== Test cases ===== */
+
+#define TEST_PRIV_WANT_MEMCACHE ((void *)1)
+
+/* Cache used by tests; if NULL, allocate from kmalloc instead. */
+static struct kmem_cache *test_cache;
+
+static size_t setup_test_cache(struct kunit *test, size_t size, slab_flags_t flags,
+			       void (*ctor)(void *))
+{
+	if (test->priv != TEST_PRIV_WANT_MEMCACHE)
+		return size;
+
+	kunit_info(test, "%s: size=%zu, ctor=%ps\n", __func__, size, ctor);
+
+	/*
+	 * Use SLAB_NOLEAKTRACE to prevent merging with existing caches. Any
+	 * other flag in SLAB_NEVER_MERGE also works. Use SLAB_ACCOUNT to
+	 * allocate via memcg, if enabled.
+	 */
+	flags |= SLAB_NOLEAKTRACE | SLAB_ACCOUNT;
+	test_cache = kmem_cache_create("test", size, 1, flags, ctor);
+	KUNIT_ASSERT_TRUE_MSG(test, test_cache, "could not create cache");
+
+	return size;
+}
+
+static void test_cache_destroy(void)
+{
+	if (!test_cache)
+		return;
+
+	kmem_cache_destroy(test_cache);
+	test_cache = NULL;
+}
+
+static inline size_t kmalloc_cache_alignment(size_t size)
+{
+	return kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]->align;
+}
+
+/* Must always inline to match stack trace against caller. */
+static __always_inline void test_free(void *ptr)
+{
+	if (test_cache)
+		kmem_cache_free(test_cache, ptr);
+	else
+		kfree(ptr);
+}
+
+/*
+ * If this should be a KFENCE allocation, and on which side the allocation and
+ * the closest guard page should be.
+ */
+enum allocation_policy {
+	ALLOCATE_ANY, /* KFENCE, any side. */
+	ALLOCATE_LEFT, /* KFENCE, left side of page. */
+	ALLOCATE_RIGHT, /* KFENCE, right side of page. */
+	ALLOCATE_NONE, /* No KFENCE allocation. */
+};
+
+/*
+ * Try to get a guarded allocation from KFENCE. Uses either kmalloc() or the
+ * current test_cache if set up.
+ */
+static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocation_policy policy)
+{
+	void *alloc;
+	unsigned long timeout, resched_after;
+	const char *policy_name;
+
+	switch (policy) {
+	case ALLOCATE_ANY:
+		policy_name = "any";
+		break;
+	case ALLOCATE_LEFT:
+		policy_name = "left";
+		break;
+	case ALLOCATE_RIGHT:
+		policy_name = "right";
+		break;
+	case ALLOCATE_NONE:
+		policy_name = "none";
+		break;
+	}
+
+	kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp,
+		   policy_name, !!test_cache);
+
+	/*
+	 * 100x the sample interval should be more than enough to ensure we get
+	 * a KFENCE allocation eventually.
+	 */
+	timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+	/*
+	 * Especially for non-preemption kernels, ensure the allocation-gate
+	 * timer can catch up: after @resched_after, every failed allocation
+	 * attempt yields, to ensure the allocation-gate timer is scheduled.
+	 */
+	resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL);
+	do {
+		if (test_cache)
+			alloc = kmem_cache_alloc(test_cache, gfp);
+		else
+			alloc = kmalloc(size, gfp);
+
+		if (is_kfence_address(alloc)) {
+			struct page *page = virt_to_head_page(alloc);
+			struct kmem_cache *s = test_cache ?: kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)];
+
+			/*
+			 * Verify that various helpers return the right values
+			 * even for KFENCE objects; these are required so that
+			 * memcg accounting works correctly.
+			 */
+			KUNIT_EXPECT_EQ(test, obj_to_index(s, page, alloc), 0U);
+			KUNIT_EXPECT_EQ(test, objs_per_slab_page(s, page), 1);
+
+			if (policy == ALLOCATE_ANY)
+				return alloc;
+			if (policy == ALLOCATE_LEFT && IS_ALIGNED((unsigned long)alloc, PAGE_SIZE))
+				return alloc;
+			if (policy == ALLOCATE_RIGHT &&
+			    !IS_ALIGNED((unsigned long)alloc, PAGE_SIZE))
+				return alloc;
+		} else if (policy == ALLOCATE_NONE)
+			return alloc;
+
+		test_free(alloc);
+
+		if (time_after(jiffies, resched_after))
+			cond_resched();
+	} while (time_before(jiffies, timeout));
+
+	KUNIT_ASSERT_TRUE_MSG(test, false, "failed to allocate from KFENCE");
+	return NULL; /* Unreachable. */
+}
+
+static void test_out_of_bounds_read(struct kunit *test)
+{
+	size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_OOB,
+		.fn = test_out_of_bounds_read,
+		.is_write = false,
+	};
+	char *buf;
+
+	setup_test_cache(test, size, 0, NULL);
+
+	/*
+	 * If we don't have our own cache, adjust based on alignment, so that we
+	 * actually access guard pages on either side.
+	 */
+	if (!test_cache)
+		size = kmalloc_cache_alignment(size);
+
+	/* Test both sides. */
+
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+	expect.addr = buf - 1;
+	READ_ONCE(*expect.addr);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+	test_free(buf);
+
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+	expect.addr = buf + size;
+	READ_ONCE(*expect.addr);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+	test_free(buf);
+}
+
+static void test_out_of_bounds_write(struct kunit *test)
+{
+	size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_OOB,
+		.fn = test_out_of_bounds_write,
+		.is_write = true,
+	};
+	char *buf;
+
+	setup_test_cache(test, size, 0, NULL);
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+	expect.addr = buf - 1;
+	WRITE_ONCE(*expect.addr, 42);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+	test_free(buf);
+}
+
+static void test_use_after_free_read(struct kunit *test)
+{
+	const size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_UAF,
+		.fn = test_use_after_free_read,
+		.is_write = false,
+	};
+
+	setup_test_cache(test, size, 0, NULL);
+	expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	test_free(expect.addr);
+	READ_ONCE(*expect.addr);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static void test_double_free(struct kunit *test)
+{
+	const size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_INVALID_FREE,
+		.fn = test_double_free,
+	};
+
+	setup_test_cache(test, size, 0, NULL);
+	expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	test_free(expect.addr);
+	test_free(expect.addr); /* Double-free. */
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static void test_invalid_addr_free(struct kunit *test)
+{
+	const size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_INVALID_FREE,
+		.fn = test_invalid_addr_free,
+	};
+	char *buf;
+
+	setup_test_cache(test, size, 0, NULL);
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	expect.addr = buf + 1; /* Free on invalid address. */
+	test_free(expect.addr); /* Invalid address free. */
+	test_free(buf); /* No error. */
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static void test_corruption(struct kunit *test)
+{
+	size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_CORRUPTION,
+		.fn = test_corruption,
+	};
+	char *buf;
+
+	setup_test_cache(test, size, 0, NULL);
+
+	/* Test both sides. */
+
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+	expect.addr = buf + size;
+	WRITE_ONCE(*expect.addr, 42);
+	test_free(buf);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+	expect.addr = buf - 1;
+	WRITE_ONCE(*expect.addr, 42);
+	test_free(buf);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * KFENCE is unable to detect an OOB if the allocation's alignment requirements
+ * leave a gap between the object and the guard page. Specifically, an
+ * allocation of e.g. 73 bytes is aligned on 8 and 128 bytes for SLUB or SLAB
+ * respectively. Therefore it is impossible for the allocated object to
+ * contiguously line up with the right guard page.
+ *
+ * However, we test that an access to memory beyond the gap results in KFENCE
+ * detecting an OOB access.
+ */
+static void test_kmalloc_aligned_oob_read(struct kunit *test)
+{
+	const size_t size = 73;
+	const size_t align = kmalloc_cache_alignment(size);
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_OOB,
+		.fn = test_kmalloc_aligned_oob_read,
+		.is_write = false,
+	};
+	char *buf;
+
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+
+	/*
+	 * The object is offset to the right, so there won't be an OOB to the
+	 * left of it.
+	 */
+	READ_ONCE(*(buf - 1));
+	KUNIT_EXPECT_FALSE(test, report_available());
+
+	/*
+	 * @buf must be aligned on @align, therefore buf + size belongs to the
+	 * same page -> no OOB.
+	 */
+	READ_ONCE(*(buf + size));
+	KUNIT_EXPECT_FALSE(test, report_available());
+
+	/* Overflowing by @align bytes will result in an OOB. */
+	expect.addr = buf + size + align;
+	READ_ONCE(*expect.addr);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+
+	test_free(buf);
+}
+
+static void test_kmalloc_aligned_oob_write(struct kunit *test)
+{
+	const size_t size = 73;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_CORRUPTION,
+		.fn = test_kmalloc_aligned_oob_write,
+	};
+	char *buf;
+
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+	/*
+	 * The object is offset to the right, so we won't get a page
+	 * fault immediately after it.
+	 */
+	expect.addr = buf + size;
+	WRITE_ONCE(*expect.addr, READ_ONCE(*expect.addr) + 1);
+	KUNIT_EXPECT_FALSE(test, report_available());
+	test_free(buf);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test cache shrinking and destroying with KFENCE. */
+static void test_shrink_memcache(struct kunit *test)
+{
+	const size_t size = 32;
+	void *buf;
+
+	setup_test_cache(test, size, 0, NULL);
+	KUNIT_EXPECT_TRUE(test, test_cache);
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	kmem_cache_shrink(test_cache);
+	test_free(buf);
+
+	KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+static void ctor_set_x(void *obj)
+{
+	/* Every object has at least 8 bytes. */
+	memset(obj, 'x', 8);
+}
+
+/* Ensure that SL*B does not modify KFENCE objects on bulk free. */
+static void test_free_bulk(struct kunit *test)
+{
+	int iter;
+
+	for (iter = 0; iter < 5; iter++) {
+		const size_t size = setup_test_cache(test, 8 + prandom_u32_max(300), 0,
+						     (iter & 1) ? ctor_set_x : NULL);
+		void *objects[] = {
+			test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT),
+			test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+			test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT),
+			test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+			test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+		};
+
+		kmem_cache_free_bulk(test_cache, ARRAY_SIZE(objects), objects);
+		KUNIT_ASSERT_FALSE(test, report_available());
+		test_cache_destroy();
+	}
+}
+
+/* Test init-on-free works. */
+static void test_init_on_free(struct kunit *test)
+{
+	const size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_UAF,
+		.fn = test_init_on_free,
+		.is_write = false,
+	};
+	int i;
+
+	if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON))
+		return;
+	/* Assume it hasn't been disabled on command line. */
+
+	setup_test_cache(test, size, 0, NULL);
+	expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	for (i = 0; i < size; i++)
+		expect.addr[i] = i + 1;
+	test_free(expect.addr);
+
+	for (i = 0; i < size; i++) {
+		/*
+		 * This may fail if the page was recycled by KFENCE and then
+		 * written to again -- this however, is near impossible with a
+		 * default config.
+		 */
+		KUNIT_EXPECT_EQ(test, expect.addr[i], (char)0);
+
+		if (!i) /* Only check first access to not fail test if page is ever re-protected. */
+			KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+	}
+}
+
+/* Ensure that constructors work properly. */
+static void test_memcache_ctor(struct kunit *test)
+{
+	const size_t size = 32;
+	char *buf;
+	int i;
+
+	setup_test_cache(test, size, 0, ctor_set_x);
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+
+	for (i = 0; i < 8; i++)
+		KUNIT_EXPECT_EQ(test, buf[i], (char)'x');
+
+	test_free(buf);
+
+	KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+/* Test that memory is zeroed if requested. */
+static void test_gfpzero(struct kunit *test)
+{
+	const size_t size = PAGE_SIZE; /* PAGE_SIZE so we can use ALLOCATE_ANY. */
+	char *buf1, *buf2;
+	int i;
+
+	if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) {
+		kunit_warn(test, "skipping ... would take too long\n");
+		return;
+	}
+
+	setup_test_cache(test, size, 0, NULL);
+	buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	for (i = 0; i < size; i++)
+		buf1[i] = i + 1;
+	test_free(buf1);
+
+	/* Try to get same address again -- this can take a while. */
+	for (i = 0;; i++) {
+		buf2 = test_alloc(test, size, GFP_KERNEL | __GFP_ZERO, ALLOCATE_ANY);
+		if (buf1 == buf2)
+			break;
+		test_free(buf2);
+
+		if (i == CONFIG_KFENCE_NUM_OBJECTS) {
+			kunit_warn(test, "giving up ... cannot get same object back\n");
+			return;
+		}
+	}
+
+	for (i = 0; i < size; i++)
+		KUNIT_EXPECT_EQ(test, buf2[i], (char)0);
+
+	test_free(buf2);
+
+	KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+static void test_invalid_access(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.type = KFENCE_ERROR_INVALID,
+		.fn = test_invalid_access,
+		.addr = &__kfence_pool[10],
+		.is_write = false,
+	};
+
+	READ_ONCE(__kfence_pool[10]);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test SLAB_TYPESAFE_BY_RCU works. */
+static void test_memcache_typesafe_by_rcu(struct kunit *test)
+{
+	const size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_UAF,
+		.fn = test_memcache_typesafe_by_rcu,
+		.is_write = false,
+	};
+
+	setup_test_cache(test, size, SLAB_TYPESAFE_BY_RCU, NULL);
+	KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */
+
+	expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	*expect.addr = 42;
+
+	rcu_read_lock();
+	test_free(expect.addr);
+	KUNIT_EXPECT_EQ(test, *expect.addr, (char)42);
+	/*
+	 * Up to this point, memory should not have been freed yet, and
+	 * therefore there should be no KFENCE report from the above access.
+	 */
+	rcu_read_unlock();
+
+	/* Above access to @expect.addr should not have generated a report! */
+	KUNIT_EXPECT_FALSE(test, report_available());
+
+	/* Only after rcu_barrier() is the memory guaranteed to be freed. */
+	rcu_barrier();
+
+	/* Expect use-after-free. */
+	KUNIT_EXPECT_EQ(test, *expect.addr, (char)42);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test krealloc(). */
+static void test_krealloc(struct kunit *test)
+{
+	const size_t size = 32;
+	const struct expect_report expect = {
+		.type = KFENCE_ERROR_UAF,
+		.fn = test_krealloc,
+		.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY),
+		.is_write = false,
+	};
+	char *buf = expect.addr;
+	int i;
+
+	KUNIT_EXPECT_FALSE(test, test_cache);
+	KUNIT_EXPECT_EQ(test, ksize(buf), size); /* Precise size match after KFENCE alloc. */
+	for (i = 0; i < size; i++)
+		buf[i] = i + 1;
+
+	/* Check that we successfully change the size. */
+	buf = krealloc(buf, size * 3, GFP_KERNEL); /* Grow. */
+	/* Note: Might no longer be a KFENCE alloc. */
+	KUNIT_EXPECT_GE(test, ksize(buf), size * 3);
+	for (i = 0; i < size; i++)
+		KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1));
+	for (; i < size * 3; i++) /* Fill to extra bytes. */
+		buf[i] = i + 1;
+
+	buf = krealloc(buf, size * 2, GFP_KERNEL); /* Shrink. */
+	KUNIT_EXPECT_GE(test, ksize(buf), size * 2);
+	for (i = 0; i < size * 2; i++)
+		KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1));
+
+	buf = krealloc(buf, 0, GFP_KERNEL); /* Free. */
+	KUNIT_EXPECT_EQ(test, (unsigned long)buf, (unsigned long)ZERO_SIZE_PTR);
+	KUNIT_ASSERT_FALSE(test, report_available()); /* No reports yet! */
+
+	READ_ONCE(*expect.addr); /* Ensure krealloc() actually freed earlier KFENCE object. */
+	KUNIT_ASSERT_TRUE(test, report_matches(&expect));
+}
+
+/* Test that some objects from a bulk allocation belong to KFENCE pool. */
+static void test_memcache_alloc_bulk(struct kunit *test)
+{
+	const size_t size = 32;
+	bool pass = false;
+	unsigned long timeout;
+
+	setup_test_cache(test, size, 0, NULL);
+	KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */
+	/*
+	 * 100x the sample interval should be more than enough to ensure we get
+	 * a KFENCE allocation eventually.
+	 */
+	timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+	do {
+		void *objects[100];
+		int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects),
+						   objects);
+		if (!num)
+			continue;
+		for (i = 0; i < ARRAY_SIZE(objects); i++) {
+			if (is_kfence_address(objects[i])) {
+				pass = true;
+				break;
+			}
+		}
+		kmem_cache_free_bulk(test_cache, num, objects);
+		/*
+		 * kmem_cache_alloc_bulk() disables interrupts, and calling it
+		 * in a tight loop may not give KFENCE a chance to switch the
+		 * static branch. Call cond_resched() to let KFENCE chime in.
+		 */
+		cond_resched();
+	} while (!pass && time_before(jiffies, timeout));
+
+	KUNIT_EXPECT_TRUE(test, pass);
+	KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+/*
+ * KUnit does not provide a way to provide arguments to tests, and we encode
+ * additional info in the name. Set up 2 tests per test case, one using the
+ * default allocator, and another using a custom memcache (suffix '-memcache').
+ */
+#define KFENCE_KUNIT_CASE(test_name)						\
+	{ .run_case = test_name, .name = #test_name },				\
+	{ .run_case = test_name, .name = #test_name "-memcache" }
+
+static struct kunit_case kfence_test_cases[] = {
+	KFENCE_KUNIT_CASE(test_out_of_bounds_read),
+	KFENCE_KUNIT_CASE(test_out_of_bounds_write),
+	KFENCE_KUNIT_CASE(test_use_after_free_read),
+	KFENCE_KUNIT_CASE(test_double_free),
+	KFENCE_KUNIT_CASE(test_invalid_addr_free),
+	KFENCE_KUNIT_CASE(test_corruption),
+	KFENCE_KUNIT_CASE(test_free_bulk),
+	KFENCE_KUNIT_CASE(test_init_on_free),
+	KUNIT_CASE(test_kmalloc_aligned_oob_read),
+	KUNIT_CASE(test_kmalloc_aligned_oob_write),
+	KUNIT_CASE(test_shrink_memcache),
+	KUNIT_CASE(test_memcache_ctor),
+	KUNIT_CASE(test_invalid_access),
+	KUNIT_CASE(test_gfpzero),
+	KUNIT_CASE(test_memcache_typesafe_by_rcu),
+	KUNIT_CASE(test_krealloc),
+	KUNIT_CASE(test_memcache_alloc_bulk),
+	{},
+};
+
+/* ===== End test cases ===== */
+
+static int test_init(struct kunit *test)
+{
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&observed.lock, flags);
+	for (i = 0; i < ARRAY_SIZE(observed.lines); i++)
+		observed.lines[i][0] = '\0';
+	observed.nlines = 0;
+	spin_unlock_irqrestore(&observed.lock, flags);
+
+	/* Any test with 'memcache' in its name will want a memcache. */
+	if (strstr(test->name, "memcache"))
+		test->priv = TEST_PRIV_WANT_MEMCACHE;
+	else
+		test->priv = NULL;
+
+	return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+	test_cache_destroy();
+}
+
+static struct kunit_suite kfence_test_suite = {
+	.name = "kfence",
+	.test_cases = kfence_test_cases,
+	.init = test_init,
+	.exit = test_exit,
+};
+static struct kunit_suite *kfence_test_suites[] = { &kfence_test_suite, NULL };
+
+static void register_tracepoints(struct tracepoint *tp, void *ignore)
+{
+	check_trace_callback_type_console(probe_console);
+	if (!strcmp(tp->name, "console"))
+		WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+}
+
+static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+{
+	if (!strcmp(tp->name, "console"))
+		tracepoint_probe_unregister(tp, probe_console, NULL);
+}
+
+/*
+ * We only want to do tracepoints setup and teardown once, therefore we have to
+ * customize the init and exit functions and cannot rely on kunit_test_suite().
+ */
+static int __init kfence_test_init(void)
+{
+	/*
+	 * Because we want to be able to build the test as a module, we need to
+	 * iterate through all known tracepoints, since the static registration
+	 * won't work here.
+	 */
+	for_each_kernel_tracepoint(register_tracepoints, NULL);
+	return __kunit_test_suites_init(kfence_test_suites);
+}
+
+static void kfence_test_exit(void)
+{
+	__kunit_test_suites_exit(kfence_test_suites);
+	for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+	tracepoint_synchronize_unregister();
+}
+
+late_initcall(kfence_test_init);
+module_exit(kfence_test_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Alexander Potapenko <glider@google.com>, Marco Elver <elver@google.com>");
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index 4dbfa9a382e4..901bd7ee83d8 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -156,7 +156,12 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show,
 	pr_cont(" ]");
 }
 
-void kfence_report_error(unsigned long address, struct pt_regs *regs,
+static const char *get_access_type(bool is_write)
+{
+	return is_write ? "write" : "read";
+}
+
+void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
 			 const struct kfence_metadata *meta, enum kfence_error_type type)
 {
 	unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 };
@@ -194,17 +199,19 @@ void kfence_report_error(unsigned long address, struct pt_regs *regs,
 	case KFENCE_ERROR_OOB: {
 		const bool left_of_object = address < meta->addr;
 
-		pr_err("BUG: KFENCE: out-of-bounds in %pS\n\n", (void *)stack_entries[skipnr]);
-		pr_err("Out-of-bounds access at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n",
-		       (void *)address,
+		pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write),
+		       (void *)stack_entries[skipnr]);
+		pr_err("Out-of-bounds %s at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n",
+		       get_access_type(is_write), (void *)address,
 		       left_of_object ? meta->addr - address : address - meta->addr,
 		       left_of_object ? "left" : "right", object_index);
 		break;
 	}
 	case KFENCE_ERROR_UAF:
-		pr_err("BUG: KFENCE: use-after-free in %pS\n\n", (void *)stack_entries[skipnr]);
-		pr_err("Use-after-free access at 0x" PTR_FMT " (in kfence-#%zd):\n",
-		       (void *)address, object_index);
+		pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write),
+		       (void *)stack_entries[skipnr]);
+		pr_err("Use-after-free %s at 0x" PTR_FMT " (in kfence-#%zd):\n",
+		       get_access_type(is_write), (void *)address, object_index);
 		break;
 	case KFENCE_ERROR_CORRUPTION:
 		pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]);
@@ -213,8 +220,10 @@ void kfence_report_error(unsigned long address, struct pt_regs *regs,
 		pr_cont(" (in kfence-#%zd):\n", object_index);
 		break;
 	case KFENCE_ERROR_INVALID:
-		pr_err("BUG: KFENCE: invalid access in %pS\n\n", (void *)stack_entries[skipnr]);
-		pr_err("Invalid access at 0x" PTR_FMT ":\n", (void *)address);
+		pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write),
+		       (void *)stack_entries[skipnr]);
+		pr_err("Invalid %s at 0x" PTR_FMT ":\n", get_access_type(is_write),
+		       (void *)address);
 		break;
 	case KFENCE_ERROR_INVALID_FREE:
 		pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]);
-- 
cgit v1.2.3


From 9c0dee54eb91d48cca048bd7bd2c1f4a166e0252 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 25 Feb 2021 17:19:44 -0800
Subject: tracing: add error_report_end trace point

Patch series "Add error_report_end tracepoint to KFENCE and KASAN", v3.

This patchset adds a tracepoint, error_repor_end, that is to be used by
KFENCE, KASAN, and potentially other bug detection tools, when they print
an error report.  One of the possible use cases is userspace collection of
kernel error reports: interested parties can subscribe to the tracing
event via tracefs, and get notified when an error report occurs.

This patch (of 3):

Introduce error_report_end tracepoint.  It can be used in debugging tools
like KASAN, KFENCE, etc.  to provide extensions to the error reporting
mechanisms (e.g.  allow tests hook into error reporting, ease error report
collection from production kernels).  Another benefit would be making use
of ftrace for debugging or benchmarking the tools themselves.

Should we need it, the tracepoint name leaves us with the possibility to
introduce a complementary error_report_start tracepoint in the future.

Link: https://lkml.kernel.org/r/20210121131915.1331302-1-glider@google.com
Link: https://lkml.kernel.org/r/20210121131915.1331302-2-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Suggested-by: Marco Elver <elver@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/error_report.h | 74 +++++++++++++++++++++++++++++++++++++
 kernel/trace/Makefile               |  1 +
 kernel/trace/error_report-traces.c  | 11 ++++++
 3 files changed, 86 insertions(+)
 create mode 100644 include/trace/events/error_report.h
 create mode 100644 kernel/trace/error_report-traces.c

(limited to 'include')

diff --git a/include/trace/events/error_report.h b/include/trace/events/error_report.h
new file mode 100644
index 000000000000..96f64bf218b2
--- /dev/null
+++ b/include/trace/events/error_report.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Declarations for error reporting tracepoints.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM error_report
+
+#if !defined(_TRACE_ERROR_REPORT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ERROR_REPORT_H
+
+#include <linux/tracepoint.h>
+
+#ifndef __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY
+#define __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY
+
+enum error_detector {
+	ERROR_DETECTOR_KFENCE,
+	ERROR_DETECTOR_KASAN
+};
+
+#endif /* __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY */
+
+#define error_detector_list	\
+	EM(ERROR_DETECTOR_KFENCE, "kfence")	\
+	EMe(ERROR_DETECTOR_KASAN, "kasan")
+/* Always end the list with an EMe. */
+
+#undef EM
+#undef EMe
+
+#define EM(a, b)	TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
+
+error_detector_list
+
+#undef EM
+#undef EMe
+
+#define EM(a, b) { a, b },
+#define EMe(a, b) { a, b }
+
+#define show_error_detector_list(val) \
+	__print_symbolic(val, error_detector_list)
+
+DECLARE_EVENT_CLASS(error_report_template,
+		    TP_PROTO(enum error_detector error_detector, unsigned long id),
+		    TP_ARGS(error_detector, id),
+		    TP_STRUCT__entry(__field(enum error_detector, error_detector)
+					     __field(unsigned long, id)),
+		    TP_fast_assign(__entry->error_detector = error_detector;
+				   __entry->id = id;),
+		    TP_printk("[%s] %lx",
+			      show_error_detector_list(__entry->error_detector),
+			      __entry->id));
+
+/**
+ * error_report_end - called after printing the error report
+ * @error_detector:	short string describing the error detection tool
+ * @id:			pseudo-unique descriptor identifying the report
+ *			(e.g. the memory access address)
+ *
+ * This event occurs right after a debugging tool finishes printing the error
+ * report.
+ */
+DEFINE_EVENT(error_report_template, error_report_end,
+	     TP_PROTO(enum error_detector error_detector, unsigned long id),
+	     TP_ARGS(error_detector, id));
+
+#endif /* _TRACE_ERROR_REPORT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 7e44cea89fdc..b28d3e5013cd 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
 obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
 obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
+obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_PM),y)
 obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
diff --git a/kernel/trace/error_report-traces.c b/kernel/trace/error_report-traces.c
new file mode 100644
index 000000000000..f89792c25b11
--- /dev/null
+++ b/kernel/trace/error_report-traces.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Error reporting trace points.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/error_report.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(error_report_end);
-- 
cgit v1.2.3


From 928501344fc645f80390afc12708c81b3595745d Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 25 Feb 2021 17:19:55 -0800
Subject: kasan, mm: don't save alloc stacks twice

Patch series "kasan: optimizations and fixes for HW_TAGS", v4.

This patchset makes the HW_TAGS mode more efficient, mostly by reworking
poisoning approaches and simplifying/inlining some internal helpers.

With this change, the overhead of HW_TAGS annotations excluding setting
and checking memory tags is ~3%.  The performance impact caused by tags
will be unknown until we have hardware that supports MTE.

As a side-effect, this patchset speeds up generic KASAN by ~15%.

This patch (of 13):

Currently KASAN saves allocation stacks in both kasan_slab_alloc() and
kasan_kmalloc() annotations.  This patch changes KASAN to save allocation
stacks for slab objects from kmalloc caches in kasan_kmalloc() only, and
stacks for other slab objects in kasan_slab_alloc() only.

This change requires ____kasan_kmalloc() knowing whether the object
belongs to a kmalloc cache.  This is implemented by adding a flag field to
the kasan_info structure.  That flag is only set for kmalloc caches via a
new kasan_cache_create_kmalloc() annotation.

Link: https://lkml.kernel.org/r/cover.1612546384.git.andreyknvl@google.com
Link: https://lkml.kernel.org/r/7c673ebca8d00f40a7ad6f04ab9a2bddeeae2097.1612546384.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h |  9 +++++++++
 mm/kasan/common.c     | 18 ++++++++++++++----
 mm/slab_common.c      |  1 +
 3 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 7eaf2d9effb4..3fb31e8a353e 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -83,6 +83,7 @@ static inline void kasan_disable_current(void) {}
 struct kasan_cache {
 	int alloc_meta_offset;
 	int free_meta_offset;
+	bool is_kmalloc;
 };
 
 #ifdef CONFIG_KASAN_HW_TAGS
@@ -143,6 +144,13 @@ static __always_inline void kasan_cache_create(struct kmem_cache *cache,
 		__kasan_cache_create(cache, size, flags);
 }
 
+void __kasan_cache_create_kmalloc(struct kmem_cache *cache);
+static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache)
+{
+	if (kasan_enabled())
+		__kasan_cache_create_kmalloc(cache);
+}
+
 size_t __kasan_metadata_size(struct kmem_cache *cache);
 static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache)
 {
@@ -278,6 +286,7 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {}
 static inline void kasan_cache_create(struct kmem_cache *cache,
 				      unsigned int *size,
 				      slab_flags_t *flags) {}
+static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {}
 static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
 static inline void kasan_poison_slab(struct page *page) {}
 static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index af1768c4fee5..d8d83ca56fe2 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -210,6 +210,11 @@ void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
 		*size = optimal_size;
 }
 
+void __kasan_cache_create_kmalloc(struct kmem_cache *cache)
+{
+	cache->kasan_info.is_kmalloc = true;
+}
+
 size_t __kasan_metadata_size(struct kmem_cache *cache)
 {
 	if (!kasan_stack_collection_enabled())
@@ -394,17 +399,22 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
 	}
 }
 
-static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
+static void set_alloc_info(struct kmem_cache *cache, void *object,
+				gfp_t flags, bool is_kmalloc)
 {
 	struct kasan_alloc_meta *alloc_meta;
 
+	/* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). */
+	if (cache->kasan_info.is_kmalloc && !is_kmalloc)
+		return;
+
 	alloc_meta = kasan_get_alloc_meta(cache, object);
 	if (alloc_meta)
 		kasan_set_track(&alloc_meta->alloc_track, flags);
 }
 
 static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object,
-				size_t size, gfp_t flags, bool keep_tag)
+				size_t size, gfp_t flags, bool is_kmalloc)
 {
 	unsigned long redzone_start;
 	unsigned long redzone_end;
@@ -423,7 +433,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object,
 				KASAN_GRANULE_SIZE);
 	redzone_end = round_up((unsigned long)object + cache->object_size,
 				KASAN_GRANULE_SIZE);
-	tag = assign_tag(cache, object, false, keep_tag);
+	tag = assign_tag(cache, object, false, is_kmalloc);
 
 	/* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */
 	kasan_unpoison(set_tag(object, tag), size);
@@ -431,7 +441,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object,
 			   KASAN_KMALLOC_REDZONE);
 
 	if (kasan_stack_collection_enabled())
-		set_alloc_info(cache, (void *)object, flags);
+		set_alloc_info(cache, (void *)object, flags, is_kmalloc);
 
 	return set_tag(object, tag);
 }
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 284954ef1da5..897c3a446b04 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -643,6 +643,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
 		panic("Out of memory when creating slab %s\n", name);
 
 	create_boot_cache(s, name, size, flags, useroffset, usersize);
+	kasan_cache_create_kmalloc(s);
 	list_add(&s->list, &slab_caches);
 	s->refcount = 1;
 	return s;
-- 
cgit v1.2.3


From 200072ce33b298cf14d3ed2a570f5eb27609677d Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 25 Feb 2021 17:20:11 -0800
Subject: kasan: unify large kfree checks

Unify checks in kasan_kfree_large() and in kasan_slab_free_mempool() for
large allocations as it's done for small kfree() allocations.

With this change, kasan_slab_free_mempool() starts checking that the first
byte of the memory that's being freed is accessible.

Link: https://lkml.kernel.org/r/14ffc4cd867e0b1ed58f7527e3b748a1b4ad08aa.1612546384.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 16 ++++++++--------
 mm/kasan/common.c     | 36 ++++++++++++++++++++++++++----------
 2 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 3fb31e8a353e..b91732bd05d7 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -200,6 +200,13 @@ static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object)
 	return false;
 }
 
+void __kasan_kfree_large(void *ptr, unsigned long ip);
+static __always_inline void kasan_kfree_large(void *ptr)
+{
+	if (kasan_enabled())
+		__kasan_kfree_large(ptr, _RET_IP_);
+}
+
 void __kasan_slab_free_mempool(void *ptr, unsigned long ip);
 static __always_inline void kasan_slab_free_mempool(void *ptr)
 {
@@ -247,13 +254,6 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
 	return (void *)object;
 }
 
-void __kasan_kfree_large(void *ptr, unsigned long ip);
-static __always_inline void kasan_kfree_large(void *ptr)
-{
-	if (kasan_enabled())
-		__kasan_kfree_large(ptr, _RET_IP_);
-}
-
 /*
  * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for
  * the hardware tag-based mode that doesn't rely on compiler instrumentation.
@@ -302,6 +302,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object)
 {
 	return false;
 }
+static inline void kasan_kfree_large(void *ptr) {}
 static inline void kasan_slab_free_mempool(void *ptr) {}
 static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
 				   gfp_t flags)
@@ -322,7 +323,6 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
 {
 	return (void *)object;
 }
-static inline void kasan_kfree_large(void *ptr) {}
 static inline bool kasan_check_byte(const void *address)
 {
 	return true;
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 48d51daeda95..8a3d66393dc5 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -364,6 +364,31 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
 	return ____kasan_slab_free(cache, object, ip, true);
 }
 
+static bool ____kasan_kfree_large(void *ptr, unsigned long ip)
+{
+	if (ptr != page_address(virt_to_head_page(ptr))) {
+		kasan_report_invalid_free(ptr, ip);
+		return true;
+	}
+
+	if (!kasan_byte_accessible(ptr)) {
+		kasan_report_invalid_free(ptr, ip);
+		return true;
+	}
+
+	/*
+	 * The object will be poisoned by kasan_free_pages() or
+	 * kasan_slab_free_mempool().
+	 */
+
+	return false;
+}
+
+void __kasan_kfree_large(void *ptr, unsigned long ip)
+{
+	____kasan_kfree_large(ptr, ip);
+}
+
 void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
 {
 	struct page *page;
@@ -377,10 +402,8 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
 	 * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
 	 */
 	if (unlikely(!PageSlab(page))) {
-		if (ptr != page_address(page)) {
-			kasan_report_invalid_free(ptr, ip);
+		if (____kasan_kfree_large(ptr, ip))
 			return;
-		}
 		kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE);
 	} else {
 		____kasan_slab_free(page->slab_cache, ptr, ip, false);
@@ -539,13 +562,6 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
 		return ____kasan_kmalloc(page->slab_cache, object, size, flags);
 }
 
-void __kasan_kfree_large(void *ptr, unsigned long ip)
-{
-	if (ptr != page_address(virt_to_head_page(ptr)))
-		kasan_report_invalid_free(ptr, ip);
-	/* The object will be poisoned by kasan_free_pages(). */
-}
-
 bool __kasan_check_byte(const void *address, unsigned long ip)
 {
 	if (!kasan_byte_accessible(address)) {
-- 
cgit v1.2.3


From df54714f579a77662054132161612ce3da876b0d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 25 Feb 2021 17:20:56 -0800
Subject: include/linux: remove repeated words

Drop the doubled word "for" in a comment. {firewire-cdev.h}
Drop the doubled word "in" in a comment. {input.h}
Drop the doubled word "a" in a comment. {mdev.h}
Drop the doubled word "the" in a comment. {ptrace.h}

Link: https://lkml.kernel.org/r/20210126232444.22861-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Kirti Wankhede <kwankhede@nvidia.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mdev.h               | 2 +-
 include/linux/ptrace.h             | 2 +-
 include/uapi/linux/firewire-cdev.h | 2 +-
 include/uapi/linux/input.h         | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 9004375c462e..27eb383cb95d 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -42,7 +42,7 @@ struct device *mdev_get_iommu_device(struct device *dev);
  *			@mdev: mdev_device structure on of mediated device
  *			      that is being created
  *			Returns integer: success (0) or error (< 0)
- * @remove:		Called to free resources in parent device's driver for a
+ * @remove:		Called to free resources in parent device's driver for
  *			a mediated device. It is mandatory to provide 'remove'
  *			ops.
  *			@mdev: mdev_device device structure which is being
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 2a9df80ea887..b5ebf6c01292 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -171,7 +171,7 @@ static inline void ptrace_event(int event, unsigned long message)
  *
  * Check whether @event is enabled and, if so, report @event and @pid
  * to the ptrace parent.  @pid is reported as the pid_t seen from the
- * the ptrace parent's pid namespace.
+ * ptrace parent's pid namespace.
  *
  * Called without locks.
  */
diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h
index 7e5b5c10a49c..5effa9832802 100644
--- a/include/uapi/linux/firewire-cdev.h
+++ b/include/uapi/linux/firewire-cdev.h
@@ -844,7 +844,7 @@ struct fw_cdev_queue_iso {
  * struct fw_cdev_start_iso - Start an isochronous transmission or reception
  * @cycle:	Cycle in which to start I/O.  If @cycle is greater than or
  *		equal to 0, the I/O will start on that cycle.
- * @sync:	Determines the value to wait for for receive packets that have
+ * @sync:	Determines the value to wait for receive packets that have
  *		the %FW_CDEV_ISO_SYNC bit set
  * @tags:	Tag filter bit mask.  Only valid for isochronous reception.
  *		Determines the tag values for which packets will be accepted.
diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index 9a61c28ed3ae..ee3127461ee0 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -84,7 +84,7 @@ struct input_id {
  * in units per radian.
  * When INPUT_PROP_ACCELEROMETER is set the resolution changes.
  * The main axes (ABS_X, ABS_Y, ABS_Z) are then reported in
- * in units per g (units/g) and in units per degree per second
+ * units per g (units/g) and in units per degree per second
  * (units/deg/s) for rotational axes (ABS_RX, ABS_RY, ABS_RZ).
  */
 struct input_absinfo {
-- 
cgit v1.2.3


From c131bd0b5448bb577b7a9ed48c4e528807e8d5af Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Thu, 25 Feb 2021 17:21:00 -0800
Subject: treewide: Miguel has moved

Update contact info.

Link: https://lkml.kernel.org/r/20210206162524.GA11520@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .mailmap                                            |  1 +
 CREDITS                                             |  9 +++------
 Documentation/admin-guide/auxdisplay/cfag12864b.rst |  2 +-
 Documentation/admin-guide/auxdisplay/ks0108.rst     |  2 +-
 MAINTAINERS                                         | 12 ++++++------
 drivers/auxdisplay/cfag12864b.c                     |  4 ++--
 drivers/auxdisplay/cfag12864bfb.c                   |  4 ++--
 drivers/auxdisplay/ks0108.c                         |  4 ++--
 include/linux/cfag12864b.h                          |  2 +-
 include/linux/ks0108.h                              |  2 +-
 samples/auxdisplay/cfag12864b-example.c             |  2 +-
 11 files changed, 21 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/.mailmap b/.mailmap
index 87a8bbdbf749..85b93cdefc87 100644
--- a/.mailmap
+++ b/.mailmap
@@ -237,6 +237,7 @@ Maxime Ripard <mripard@kernel.org> <maxime.ripard@free-electrons.com>
 Mayuresh Janorkar <mayur@ti.com>
 Michael Buesch <m@bues.ch>
 Michel Dänzer <michel@tungstengraphics.com>
+Miguel Ojeda <ojeda@kernel.org> <miguel.ojeda.sandonis@gmail.com>
 Mike Rapoport <rppt@kernel.org> <mike@compulab.co.il>
 Mike Rapoport <rppt@kernel.org> <mike.rapoport@gmail.com>
 Mike Rapoport <rppt@kernel.org> <rppt@linux.ibm.com>
diff --git a/CREDITS b/CREDITS
index be097156bd71..cef83b958cbe 100644
--- a/CREDITS
+++ b/CREDITS
@@ -2841,14 +2841,11 @@ S: Subiaco, 6008
 S: Perth, Western Australia
 S: Australia
 
-N: Miguel Ojeda Sandonis
-E: miguel.ojeda.sandonis@gmail.com
-W: http://miguelojeda.es
-W: http://jair.lab.fi.uva.es/~migojed/
+N: Miguel Ojeda
+E: ojeda@kernel.org
+W: https://ojeda.dev
 D: Author of the ks0108, cfag12864b and cfag12864bfb auxiliary display drivers.
 D: Maintainer of the auxiliary display drivers tree (drivers/auxdisplay/*)
-S: C/ Mieses 20, 9-B
-S: Valladolid 47009
 S: Spain
 
 N: Peter Oruba
diff --git a/Documentation/admin-guide/auxdisplay/cfag12864b.rst b/Documentation/admin-guide/auxdisplay/cfag12864b.rst
index 18c2865bd322..da385d851acc 100644
--- a/Documentation/admin-guide/auxdisplay/cfag12864b.rst
+++ b/Documentation/admin-guide/auxdisplay/cfag12864b.rst
@@ -3,7 +3,7 @@ cfag12864b LCD Driver Documentation
 ===================================
 
 :License:		GPLv2
-:Author & Maintainer:	Miguel Ojeda Sandonis
+:Author & Maintainer:	Miguel Ojeda <ojeda@kernel.org>
 :Date:			2006-10-27
 
 
diff --git a/Documentation/admin-guide/auxdisplay/ks0108.rst b/Documentation/admin-guide/auxdisplay/ks0108.rst
index c0b7faf73136..a7d3fe509373 100644
--- a/Documentation/admin-guide/auxdisplay/ks0108.rst
+++ b/Documentation/admin-guide/auxdisplay/ks0108.rst
@@ -3,7 +3,7 @@ ks0108 LCD Controller Driver Documentation
 ==========================================
 
 :License:		GPLv2
-:Author & Maintainer:	Miguel Ojeda Sandonis
+:Author & Maintainer:	Miguel Ojeda <ojeda@kernel.org>
 :Date:			2006-10-27
 
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 40040db747fc..e42082eccf36 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2982,7 +2982,7 @@ F:	include/uapi/linux/audit.h
 F:	kernel/audit*
 
 AUXILIARY DISPLAY DRIVERS
-M:	Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
+M:	Miguel Ojeda <ojeda@kernel.org>
 S:	Maintained
 F:	drivers/auxdisplay/
 F:	include/linux/cfag12864b.h
@@ -4128,13 +4128,13 @@ F:	scripts/extract-cert.c
 F:	scripts/sign-file.c
 
 CFAG12864B LCD DRIVER
-M:	Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
+M:	Miguel Ojeda <ojeda@kernel.org>
 S:	Maintained
 F:	drivers/auxdisplay/cfag12864b.c
 F:	include/linux/cfag12864b.h
 
 CFAG12864BFB LCD FRAMEBUFFER DRIVER
-M:	Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
+M:	Miguel Ojeda <ojeda@kernel.org>
 S:	Maintained
 F:	drivers/auxdisplay/cfag12864bfb.c
 F:	include/linux/cfag12864b.h
@@ -4304,7 +4304,7 @@ S:	Supported
 F:	drivers/infiniband/hw/usnic/
 
 CLANG-FORMAT FILE
-M:	Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
+M:	Miguel Ojeda <ojeda@kernel.org>
 S:	Maintained
 F:	.clang-format
 
@@ -4444,7 +4444,7 @@ S:	Maintained
 F:	drivers/platform/x86/compal-laptop.c
 
 COMPILER ATTRIBUTES
-M:	Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
+M:	Miguel Ojeda <ojeda@kernel.org>
 S:	Maintained
 F:	include/linux/compiler_attributes.h
 
@@ -9939,7 +9939,7 @@ F:	include/linux/kprobes.h
 F:	kernel/kprobes.c
 
 KS0108 LCD CONTROLLER DRIVER
-M:	Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
+M:	Miguel Ojeda <ojeda@kernel.org>
 S:	Maintained
 F:	Documentation/admin-guide/auxdisplay/ks0108.rst
 F:	drivers/auxdisplay/ks0108.c
diff --git a/drivers/auxdisplay/cfag12864b.c b/drivers/auxdisplay/cfag12864b.c
index 7eebae7e322c..fd430e6866a1 100644
--- a/drivers/auxdisplay/cfag12864b.c
+++ b/drivers/auxdisplay/cfag12864b.c
@@ -5,7 +5,7 @@
  * Description: cfag12864b LCD driver
  *     Depends: ks0108
  *
- *      Author: Copyright (C) Miguel Ojeda Sandonis
+ *      Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
  *        Date: 2006-10-31
  */
 
@@ -376,5 +376,5 @@ module_init(cfag12864b_init);
 module_exit(cfag12864b_exit);
 
 MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>");
+MODULE_AUTHOR("Miguel Ojeda <ojeda@kernel.org>");
 MODULE_DESCRIPTION("cfag12864b LCD driver");
diff --git a/drivers/auxdisplay/cfag12864bfb.c b/drivers/auxdisplay/cfag12864bfb.c
index 2002291ab338..d66821adf453 100644
--- a/drivers/auxdisplay/cfag12864bfb.c
+++ b/drivers/auxdisplay/cfag12864bfb.c
@@ -5,7 +5,7 @@
  * Description: cfag12864b LCD framebuffer driver
  *     Depends: cfag12864b
  *
- *      Author: Copyright (C) Miguel Ojeda Sandonis
+ *      Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
  *        Date: 2006-10-31
  */
 
@@ -171,5 +171,5 @@ module_init(cfag12864bfb_init);
 module_exit(cfag12864bfb_exit);
 
 MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>");
+MODULE_AUTHOR("Miguel Ojeda <ojeda@kernel.org>");
 MODULE_DESCRIPTION("cfag12864b LCD framebuffer driver");
diff --git a/drivers/auxdisplay/ks0108.c b/drivers/auxdisplay/ks0108.c
index abfe3fa9e6f4..03c95ad4216c 100644
--- a/drivers/auxdisplay/ks0108.c
+++ b/drivers/auxdisplay/ks0108.c
@@ -5,7 +5,7 @@
  * Description: ks0108 LCD Controller driver
  *     Depends: parport
  *
- *      Author: Copyright (C) Miguel Ojeda Sandonis
+ *      Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
  *        Date: 2006-10-31
  */
 
@@ -182,6 +182,6 @@ module_init(ks0108_init);
 module_exit(ks0108_exit);
 
 MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>");
+MODULE_AUTHOR("Miguel Ojeda <ojeda@kernel.org>");
 MODULE_DESCRIPTION("ks0108 LCD Controller driver");
 
diff --git a/include/linux/cfag12864b.h b/include/linux/cfag12864b.h
index 4060004968c8..6617d9c68d86 100644
--- a/include/linux/cfag12864b.h
+++ b/include/linux/cfag12864b.h
@@ -4,7 +4,7 @@
  *     Version: 0.1.0
  * Description: cfag12864b LCD driver header
  *
- *      Author: Copyright (C) Miguel Ojeda Sandonis
+ *      Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
  *        Date: 2006-10-12
  */
 
diff --git a/include/linux/ks0108.h b/include/linux/ks0108.h
index 0738389b42b6..1a37a664f915 100644
--- a/include/linux/ks0108.h
+++ b/include/linux/ks0108.h
@@ -4,7 +4,7 @@
  *     Version: 0.1.0
  * Description: ks0108 LCD Controller driver header
  *
- *      Author: Copyright (C) Miguel Ojeda Sandonis
+ *      Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
  *        Date: 2006-10-31
  */
 
diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c
index bfeab44f81d0..2e3bb7375c99 100644
--- a/samples/auxdisplay/cfag12864b-example.c
+++ b/samples/auxdisplay/cfag12864b-example.c
@@ -4,7 +4,7 @@
  *     Version: 0.1.0
  * Description: cfag12864b LCD userspace example program
  *
- *      Author: Copyright (C) Miguel Ojeda Sandonis
+ *      Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
  *        Date: 2006-10-31
  */
 
-- 
cgit v1.2.3


From c1f26493ed7f363c63e0e9d91e50d4db26df6603 Mon Sep 17 00:00:00 2001
From: Hubert Jasudowicz <hubert.jasudowicz@gmail.com>
Date: Thu, 25 Feb 2021 17:21:03 -0800
Subject: groups: use flexible-array member in struct group_info

Replace zero-size array with flexible array member, as recommended by
the docs.

Link: https://lkml.kernel.org/r/155995eed35c3c1bdcc56e69d8997c8e4c46740a.1611620846.git.hubert.jasudowicz@gmail.com
Signed-off-by: Hubert Jasudowicz <hubert.jasudowicz@gmail.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Micah Morton <mortonm@chromium.org>
Cc: Gao Xiang <xiang@kernel.org>
Cc: Michael Kelley <mikelley@microsoft.com>
Cc: Thomas Cedeno <thomascedeno@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cred.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 18639c069263..4c6350503697 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -25,7 +25,7 @@ struct inode;
 struct group_info {
 	atomic_t	usage;
 	int		ngroups;
-	kgid_t		gid[0];
+	kgid_t		gid[];
 } __randomize_layout;
 
 /**
-- 
cgit v1.2.3


From a28a6e860c6cf231cf3c5171c75c342adcd00406 Mon Sep 17 00:00:00 2001
From: Francis Laniel <laniel_francis@privacyrequired.com>
Date: Thu, 25 Feb 2021 17:21:20 -0800
Subject: string.h: move fortified functions definitions in a dedicated header.

This patch adds fortify-string.h to contain fortified functions
definitions.  Thus, the code is more separated and compile time is
approximately 1% faster for people who do not set CONFIG_FORTIFY_SOURCE.

Link: https://lkml.kernel.org/r/20210111092141.22946-1-laniel_francis@privacyrequired.com
Link: https://lkml.kernel.org/r/20210111092141.22946-2-laniel_francis@privacyrequired.com
Signed-off-by: Francis Laniel <laniel_francis@privacyrequired.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fortify-string.h | 302 +++++++++++++++++++++++++++++++++++++++++
 include/linux/string.h         | 282 +-------------------------------------
 2 files changed, 303 insertions(+), 281 deletions(-)
 create mode 100644 include/linux/fortify-string.h

(limited to 'include')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
new file mode 100644
index 000000000000..c1be37437e77
--- /dev/null
+++ b/include/linux/fortify-string.h
@@ -0,0 +1,302 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FORTIFY_STRING_H_
+#define _LINUX_FORTIFY_STRING_H_
+
+
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
+extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
+extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
+extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove);
+extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset);
+extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat);
+extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy);
+extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen);
+extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat);
+extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy);
+#else
+#define __underlying_memchr	__builtin_memchr
+#define __underlying_memcmp	__builtin_memcmp
+#define __underlying_memcpy	__builtin_memcpy
+#define __underlying_memmove	__builtin_memmove
+#define __underlying_memset	__builtin_memset
+#define __underlying_strcat	__builtin_strcat
+#define __underlying_strcpy	__builtin_strcpy
+#define __underlying_strlen	__builtin_strlen
+#define __underlying_strncat	__builtin_strncat
+#define __underlying_strncpy	__builtin_strncpy
+#endif
+
+__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 1);
+
+	if (__builtin_constant_p(size) && p_size < size)
+		__write_overflow();
+	if (p_size < size)
+		fortify_panic(__func__);
+	return __underlying_strncpy(p, q, size);
+}
+
+__FORTIFY_INLINE char *strcat(char *p, const char *q)
+{
+	size_t p_size = __builtin_object_size(p, 1);
+
+	if (p_size == (size_t)-1)
+		return __underlying_strcat(p, q);
+	if (strlcat(p, q, p_size) >= p_size)
+		fortify_panic(__func__);
+	return p;
+}
+
+__FORTIFY_INLINE __kernel_size_t strlen(const char *p)
+{
+	__kernel_size_t ret;
+	size_t p_size = __builtin_object_size(p, 1);
+
+	/* Work around gcc excess stack consumption issue */
+	if (p_size == (size_t)-1 ||
+		(__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0'))
+		return __underlying_strlen(p);
+	ret = strnlen(p, p_size);
+	if (p_size <= ret)
+		fortify_panic(__func__);
+	return ret;
+}
+
+extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen);
+__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen)
+{
+	size_t p_size = __builtin_object_size(p, 1);
+	__kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
+
+	if (p_size <= ret && maxlen != ret)
+		fortify_panic(__func__);
+	return ret;
+}
+
+/* defined after fortified strlen to reuse it */
+extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy);
+__FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size)
+{
+	size_t ret;
+	size_t p_size = __builtin_object_size(p, 1);
+	size_t q_size = __builtin_object_size(q, 1);
+
+	if (p_size == (size_t)-1 && q_size == (size_t)-1)
+		return __real_strlcpy(p, q, size);
+	ret = strlen(q);
+	if (size) {
+		size_t len = (ret >= size) ? size - 1 : ret;
+
+		if (__builtin_constant_p(len) && len >= p_size)
+			__write_overflow();
+		if (len >= p_size)
+			fortify_panic(__func__);
+		__underlying_memcpy(p, q, len);
+		p[len] = '\0';
+	}
+	return ret;
+}
+
+/* defined after fortified strnlen to reuse it */
+extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy);
+__FORTIFY_INLINE ssize_t strscpy(char *p, const char *q, size_t size)
+{
+	size_t len;
+	/* Use string size rather than possible enclosing struct size. */
+	size_t p_size = __builtin_object_size(p, 1);
+	size_t q_size = __builtin_object_size(q, 1);
+
+	/* If we cannot get size of p and q default to call strscpy. */
+	if (p_size == (size_t) -1 && q_size == (size_t) -1)
+		return __real_strscpy(p, q, size);
+
+	/*
+	 * If size can be known at compile time and is greater than
+	 * p_size, generate a compile time write overflow error.
+	 */
+	if (__builtin_constant_p(size) && size > p_size)
+		__write_overflow();
+
+	/*
+	 * This call protects from read overflow, because len will default to q
+	 * length if it smaller than size.
+	 */
+	len = strnlen(q, size);
+	/*
+	 * If len equals size, we will copy only size bytes which leads to
+	 * -E2BIG being returned.
+	 * Otherwise we will copy len + 1 because of the final '\O'.
+	 */
+	len = len == size ? size : len + 1;
+
+	/*
+	 * Generate a runtime write overflow error if len is greater than
+	 * p_size.
+	 */
+	if (len > p_size)
+		fortify_panic(__func__);
+
+	/*
+	 * We can now safely call vanilla strscpy because we are protected from:
+	 * 1. Read overflow thanks to call to strnlen().
+	 * 2. Write overflow thanks to above ifs.
+	 */
+	return __real_strscpy(p, q, len);
+}
+
+/* defined after fortified strlen and strnlen to reuse them */
+__FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count)
+{
+	size_t p_len, copy_len;
+	size_t p_size = __builtin_object_size(p, 1);
+	size_t q_size = __builtin_object_size(q, 1);
+
+	if (p_size == (size_t)-1 && q_size == (size_t)-1)
+		return __underlying_strncat(p, q, count);
+	p_len = strlen(p);
+	copy_len = strnlen(q, count);
+	if (p_size < p_len + copy_len + 1)
+		fortify_panic(__func__);
+	__underlying_memcpy(p + p_len, q, copy_len);
+	p[p_len + copy_len] = '\0';
+	return p;
+}
+
+__FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+
+	if (__builtin_constant_p(size) && p_size < size)
+		__write_overflow();
+	if (p_size < size)
+		fortify_panic(__func__);
+	return __underlying_memset(p, c, size);
+}
+
+__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+	size_t q_size = __builtin_object_size(q, 0);
+
+	if (__builtin_constant_p(size)) {
+		if (p_size < size)
+			__write_overflow();
+		if (q_size < size)
+			__read_overflow2();
+	}
+	if (p_size < size || q_size < size)
+		fortify_panic(__func__);
+	return __underlying_memcpy(p, q, size);
+}
+
+__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+	size_t q_size = __builtin_object_size(q, 0);
+
+	if (__builtin_constant_p(size)) {
+		if (p_size < size)
+			__write_overflow();
+		if (q_size < size)
+			__read_overflow2();
+	}
+	if (p_size < size || q_size < size)
+		fortify_panic(__func__);
+	return __underlying_memmove(p, q, size);
+}
+
+extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan);
+__FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+
+	if (__builtin_constant_p(size) && p_size < size)
+		__read_overflow();
+	if (p_size < size)
+		fortify_panic(__func__);
+	return __real_memscan(p, c, size);
+}
+
+__FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+	size_t q_size = __builtin_object_size(q, 0);
+
+	if (__builtin_constant_p(size)) {
+		if (p_size < size)
+			__read_overflow();
+		if (q_size < size)
+			__read_overflow2();
+	}
+	if (p_size < size || q_size < size)
+		fortify_panic(__func__);
+	return __underlying_memcmp(p, q, size);
+}
+
+__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+
+	if (__builtin_constant_p(size) && p_size < size)
+		__read_overflow();
+	if (p_size < size)
+		fortify_panic(__func__);
+	return __underlying_memchr(p, c, size);
+}
+
+void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv);
+__FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+
+	if (__builtin_constant_p(size) && p_size < size)
+		__read_overflow();
+	if (p_size < size)
+		fortify_panic(__func__);
+	return __real_memchr_inv(p, c, size);
+}
+
+extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup);
+__FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+
+	if (__builtin_constant_p(size) && p_size < size)
+		__read_overflow();
+	if (p_size < size)
+		fortify_panic(__func__);
+	return __real_kmemdup(p, size, gfp);
+}
+
+/* defined after fortified strlen and memcpy to reuse them */
+__FORTIFY_INLINE char *strcpy(char *p, const char *q)
+{
+	size_t p_size = __builtin_object_size(p, 1);
+	size_t q_size = __builtin_object_size(q, 1);
+	size_t size;
+
+	if (p_size == (size_t)-1 && q_size == (size_t)-1)
+		return __underlying_strcpy(p, q);
+	size = strlen(q) + 1;
+	/* test here to use the more stringent object size */
+	if (p_size < size)
+		fortify_panic(__func__);
+	memcpy(p, q, size);
+	return p;
+}
+
+/* Don't use these outside the FORITFY_SOURCE implementation */
+#undef __underlying_memchr
+#undef __underlying_memcmp
+#undef __underlying_memcpy
+#undef __underlying_memmove
+#undef __underlying_memset
+#undef __underlying_strcat
+#undef __underlying_strcpy
+#undef __underlying_strlen
+#undef __underlying_strncat
+#undef __underlying_strncpy
+
+#endif /* _LINUX_FORTIFY_STRING_H_ */
diff --git a/include/linux/string.h b/include/linux/string.h
index 4fcfb56abcf5..9521d8cab18e 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -266,287 +266,7 @@ void __read_overflow3(void) __compiletime_error("detected read beyond size of ob
 void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter");
 
 #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
-
-#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
-extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
-extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
-extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
-extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove);
-extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset);
-extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat);
-extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy);
-extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen);
-extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat);
-extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy);
-#else
-#define __underlying_memchr	__builtin_memchr
-#define __underlying_memcmp	__builtin_memcmp
-#define __underlying_memcpy	__builtin_memcpy
-#define __underlying_memmove	__builtin_memmove
-#define __underlying_memset	__builtin_memset
-#define __underlying_strcat	__builtin_strcat
-#define __underlying_strcpy	__builtin_strcpy
-#define __underlying_strlen	__builtin_strlen
-#define __underlying_strncat	__builtin_strncat
-#define __underlying_strncpy	__builtin_strncpy
-#endif
-
-__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 1);
-	if (__builtin_constant_p(size) && p_size < size)
-		__write_overflow();
-	if (p_size < size)
-		fortify_panic(__func__);
-	return __underlying_strncpy(p, q, size);
-}
-
-__FORTIFY_INLINE char *strcat(char *p, const char *q)
-{
-	size_t p_size = __builtin_object_size(p, 1);
-	if (p_size == (size_t)-1)
-		return __underlying_strcat(p, q);
-	if (strlcat(p, q, p_size) >= p_size)
-		fortify_panic(__func__);
-	return p;
-}
-
-__FORTIFY_INLINE __kernel_size_t strlen(const char *p)
-{
-	__kernel_size_t ret;
-	size_t p_size = __builtin_object_size(p, 1);
-
-	/* Work around gcc excess stack consumption issue */
-	if (p_size == (size_t)-1 ||
-	    (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0'))
-		return __underlying_strlen(p);
-	ret = strnlen(p, p_size);
-	if (p_size <= ret)
-		fortify_panic(__func__);
-	return ret;
-}
-
-extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen);
-__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen)
-{
-	size_t p_size = __builtin_object_size(p, 1);
-	__kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
-	if (p_size <= ret && maxlen != ret)
-		fortify_panic(__func__);
-	return ret;
-}
-
-/* defined after fortified strlen to reuse it */
-extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy);
-__FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size)
-{
-	size_t ret;
-	size_t p_size = __builtin_object_size(p, 1);
-	size_t q_size = __builtin_object_size(q, 1);
-	if (p_size == (size_t)-1 && q_size == (size_t)-1)
-		return __real_strlcpy(p, q, size);
-	ret = strlen(q);
-	if (size) {
-		size_t len = (ret >= size) ? size - 1 : ret;
-		if (__builtin_constant_p(len) && len >= p_size)
-			__write_overflow();
-		if (len >= p_size)
-			fortify_panic(__func__);
-		__underlying_memcpy(p, q, len);
-		p[len] = '\0';
-	}
-	return ret;
-}
-
-/* defined after fortified strnlen to reuse it */
-extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy);
-__FORTIFY_INLINE ssize_t strscpy(char *p, const char *q, size_t size)
-{
-	size_t len;
-	/* Use string size rather than possible enclosing struct size. */
-	size_t p_size = __builtin_object_size(p, 1);
-	size_t q_size = __builtin_object_size(q, 1);
-
-	/* If we cannot get size of p and q default to call strscpy. */
-	if (p_size == (size_t) -1 && q_size == (size_t) -1)
-		return __real_strscpy(p, q, size);
-
-	/*
-	 * If size can be known at compile time and is greater than
-	 * p_size, generate a compile time write overflow error.
-	 */
-	if (__builtin_constant_p(size) && size > p_size)
-		__write_overflow();
-
-	/*
-	 * This call protects from read overflow, because len will default to q
-	 * length if it smaller than size.
-	 */
-	len = strnlen(q, size);
-	/*
-	 * If len equals size, we will copy only size bytes which leads to
-	 * -E2BIG being returned.
-	 * Otherwise we will copy len + 1 because of the final '\O'.
-	 */
-	len = len == size ? size : len + 1;
-
-	/*
-	 * Generate a runtime write overflow error if len is greater than
-	 * p_size.
-	 */
-	if (len > p_size)
-		fortify_panic(__func__);
-
-	/*
-	 * We can now safely call vanilla strscpy because we are protected from:
-	 * 1. Read overflow thanks to call to strnlen().
-	 * 2. Write overflow thanks to above ifs.
-	 */
-	return __real_strscpy(p, q, len);
-}
-
-/* defined after fortified strlen and strnlen to reuse them */
-__FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count)
-{
-	size_t p_len, copy_len;
-	size_t p_size = __builtin_object_size(p, 1);
-	size_t q_size = __builtin_object_size(q, 1);
-	if (p_size == (size_t)-1 && q_size == (size_t)-1)
-		return __underlying_strncat(p, q, count);
-	p_len = strlen(p);
-	copy_len = strnlen(q, count);
-	if (p_size < p_len + copy_len + 1)
-		fortify_panic(__func__);
-	__underlying_memcpy(p + p_len, q, copy_len);
-	p[p_len + copy_len] = '\0';
-	return p;
-}
-
-__FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	if (__builtin_constant_p(size) && p_size < size)
-		__write_overflow();
-	if (p_size < size)
-		fortify_panic(__func__);
-	return __underlying_memset(p, c, size);
-}
-
-__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	size_t q_size = __builtin_object_size(q, 0);
-	if (__builtin_constant_p(size)) {
-		if (p_size < size)
-			__write_overflow();
-		if (q_size < size)
-			__read_overflow2();
-	}
-	if (p_size < size || q_size < size)
-		fortify_panic(__func__);
-	return __underlying_memcpy(p, q, size);
-}
-
-__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	size_t q_size = __builtin_object_size(q, 0);
-	if (__builtin_constant_p(size)) {
-		if (p_size < size)
-			__write_overflow();
-		if (q_size < size)
-			__read_overflow2();
-	}
-	if (p_size < size || q_size < size)
-		fortify_panic(__func__);
-	return __underlying_memmove(p, q, size);
-}
-
-extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan);
-__FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	if (__builtin_constant_p(size) && p_size < size)
-		__read_overflow();
-	if (p_size < size)
-		fortify_panic(__func__);
-	return __real_memscan(p, c, size);
-}
-
-__FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	size_t q_size = __builtin_object_size(q, 0);
-	if (__builtin_constant_p(size)) {
-		if (p_size < size)
-			__read_overflow();
-		if (q_size < size)
-			__read_overflow2();
-	}
-	if (p_size < size || q_size < size)
-		fortify_panic(__func__);
-	return __underlying_memcmp(p, q, size);
-}
-
-__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	if (__builtin_constant_p(size) && p_size < size)
-		__read_overflow();
-	if (p_size < size)
-		fortify_panic(__func__);
-	return __underlying_memchr(p, c, size);
-}
-
-void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv);
-__FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	if (__builtin_constant_p(size) && p_size < size)
-		__read_overflow();
-	if (p_size < size)
-		fortify_panic(__func__);
-	return __real_memchr_inv(p, c, size);
-}
-
-extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup);
-__FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	if (__builtin_constant_p(size) && p_size < size)
-		__read_overflow();
-	if (p_size < size)
-		fortify_panic(__func__);
-	return __real_kmemdup(p, size, gfp);
-}
-
-/* defined after fortified strlen and memcpy to reuse them */
-__FORTIFY_INLINE char *strcpy(char *p, const char *q)
-{
-	size_t p_size = __builtin_object_size(p, 1);
-	size_t q_size = __builtin_object_size(q, 1);
-	size_t size;
-	if (p_size == (size_t)-1 && q_size == (size_t)-1)
-		return __underlying_strcpy(p, q);
-	size = strlen(q) + 1;
-	/* test here to use the more stringent object size */
-	if (p_size < size)
-		fortify_panic(__func__);
-	memcpy(p, q, size);
-	return p;
-}
-
-/* Don't use these outside the FORITFY_SOURCE implementation */
-#undef __underlying_memchr
-#undef __underlying_memcmp
-#undef __underlying_memcpy
-#undef __underlying_memmove
-#undef __underlying_memset
-#undef __underlying_strcat
-#undef __underlying_strcpy
-#undef __underlying_strlen
-#undef __underlying_strncat
-#undef __underlying_strncpy
+#include <linux/fortify-string.h>
 #endif
 
 /**
-- 
cgit v1.2.3


From e1fdc403349c64fa58f4c163f4bf9b860b4db808 Mon Sep 17 00:00:00 2001
From: Vijayanand Jitta <vjitta@codeaurora.org>
Date: Thu, 25 Feb 2021 17:21:27 -0800
Subject: lib: stackdepot: add support to disable stack depot

Add a kernel parameter stack_depot_disable to disable stack depot.  So
that stack hash table doesn't consume any memory when stack depot is
disabled.

The use case is CONFIG_PAGE_OWNER without page_owner=on.  Without this
patch, stackdepot will consume the memory for the hashtable.  By default,
it's 8M which is never trivial.

With this option, in CONFIG_PAGE_OWNER configured system, page_owner=off,
stack_depot_disable in kernel command line, we could save the wasted
memory for the hashtable.

[akpm@linux-foundation.org: fix CONFIG_STACKDEPOT=n build]

Link: https://lkml.kernel.org/r/1611749198-24316-2-git-send-email-vjitta@codeaurora.org
Signed-off-by: Vinayak Menon <vinmenon@codeaurora.org>
Signed-off-by: Vijayanand Jitta <vjitta@codeaurora.org>
Cc: Alexander Potapenko <glider@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Yogesh Lal <ylal@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  6 +++++
 include/linux/stackdepot.h                      |  9 +++++++
 init/main.c                                     |  2 ++
 lib/stackdepot.c                                | 32 +++++++++++++++++++++----
 4 files changed, 45 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bab6a8b01202..04545725f187 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5182,6 +5182,12 @@
 			growing up) the main stack are reserved for no other
 			mapping. Default value is 256 pages.
 
+	stack_depot_disable= [KNL]
+			Setting this to true through kernel command line will
+			disable the stack depot thereby saving the static memory
+			consumed by the stack hash table. By default this is set
+			to false.
+
 	stacktrace	[FTRACE]
 			Enabled the stack tracer on boot up.
 
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 24d49c732341..6bb4bc1a5f54 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -21,4 +21,13 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
 
 unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
 
+#ifdef CONFIG_STACKDEPOT
+int stack_depot_init(void);
+#else
+static inline int stack_depot_init(void)
+{
+	return 0;
+}
+#endif	/* CONFIG_STACKDEPOT */
+
 #endif
diff --git a/init/main.c b/init/main.c
index 261051070e3c..3648c9f94882 100644
--- a/init/main.c
+++ b/init/main.c
@@ -97,6 +97,7 @@
 #include <linux/mem_encrypt.h>
 #include <linux/kcsan.h>
 #include <linux/init_syscalls.h>
+#include <linux/stackdepot.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -827,6 +828,7 @@ static void __init mm_init(void)
 	init_mem_debugging_and_hardening();
 	kfence_alloc_pool();
 	report_meminit();
+	stack_depot_init();
 	mem_init();
 	/* page_owner must be initialized after buddy is ready */
 	page_ext_init_flatmem_late();
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 4b9715470e87..cc21116512a7 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -31,6 +31,7 @@
 #include <linux/stackdepot.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/memblock.h>
 
 #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8)
 
@@ -145,9 +146,32 @@ static struct stack_record *depot_alloc_stack(unsigned long *entries, int size,
 #define STACK_HASH_MASK (STACK_HASH_SIZE - 1)
 #define STACK_HASH_SEED 0x9747b28c
 
-static struct stack_record *stack_table[STACK_HASH_SIZE] = {
-	[0 ...	STACK_HASH_SIZE - 1] = NULL
-};
+static bool stack_depot_disable;
+static struct stack_record **stack_table;
+
+static int __init is_stack_depot_disabled(char *str)
+{
+	kstrtobool(str, &stack_depot_disable);
+	if (stack_depot_disable) {
+		pr_info("Stack Depot is disabled\n");
+		stack_table = NULL;
+	}
+	return 0;
+}
+early_param("stack_depot_disable", is_stack_depot_disabled);
+
+int __init stack_depot_init(void)
+{
+	if (!stack_depot_disable) {
+		size_t size = (STACK_HASH_SIZE * sizeof(struct stack_record *));
+		int i;
+
+		stack_table = memblock_alloc(size, size);
+		for (i = 0; i < STACK_HASH_SIZE;  i++)
+			stack_table[i] = NULL;
+	}
+	return 0;
+}
 
 /* Calculate hash for a stack */
 static inline u32 hash_stack(unsigned long *entries, unsigned int size)
@@ -241,7 +265,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
 	unsigned long flags;
 	u32 hash;
 
-	if (unlikely(nr_entries == 0))
+	if (unlikely(nr_entries == 0) || stack_depot_disable)
 		goto fast_exit;
 
 	hash = hash_stack(entries, nr_entries);
-- 
cgit v1.2.3


From 4945cca232ce8bc699b8743f2436af664c471b96 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 25 Feb 2021 17:21:37 -0800
Subject: include/linux/bitops.h: spelling s/synomyn/synonym/

Fix a misspelling of "synonym".

Link: https://lkml.kernel.org/r/20210108105305.2028120-1-geert+renesas@glider.be
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitops.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index a61f192c096b..a5a48303b0f1 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -214,7 +214,7 @@ static inline int get_count_order_long(unsigned long l)
  * __ffs64 - find first set bit in a 64 bit word
  * @word: The 64 bit word
  *
- * On 64 bit arches this is a synomyn for __ffs
+ * On 64 bit arches this is a synonym for __ffs
  * The result is not defined if no bits are set, so check that @word
  * is non-zero before calling this.
  */
-- 
cgit v1.2.3


From a5a673f7312253a842f3da8c60c980461cc269ec Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 25 Feb 2021 17:22:15 -0800
Subject: init: clean up early_param_on_off() macro

Use early_param() to define early_param_on_off().

Link: https://lkml.kernel.org/r/20210201041532.4025025-1-masahiroy@kernel.org
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Johan Hovold <johan@kernel.org>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Joe Perches <joe@perches.com>
Cc: Nick Desaulniers <ndesaulniers@gooogle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/init.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/init.h b/include/linux/init.h
index a01f01c1a5c5..31f54de58429 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -338,14 +338,14 @@ struct obs_kernel_param {
 		var = 1;						\
 		return 0;						\
 	}								\
-	__setup_param(str_on, parse_##var##_on, parse_##var##_on, 1);	\
+	early_param(str_on, parse_##var##_on);				\
 									\
 	static int __init parse_##var##_off(char *arg)			\
 	{								\
 		var = 0;						\
 		return 0;						\
 	}								\
-	__setup_param(str_off, parse_##var##_off, parse_##var##_off, 1)
+	early_param(str_off, parse_##var##_off)
 
 /* Relies on boot_command_line being set */
 void __init parse_early_param(void);
-- 
cgit v1.2.3


From d54ce6158e354f5358a547b96299ecd7f3725393 Mon Sep 17 00:00:00 2001
From: Sumit Garg <sumit.garg@linaro.org>
Date: Thu, 25 Feb 2021 17:22:38 -0800
Subject: kgdb: fix to kill breakpoints on initmem after boot

Currently breakpoints in kernel .init.text section are not handled
correctly while allowing to remove them even after corresponding pages
have been freed.

Fix it via killing .init.text section breakpoints just prior to initmem
pages being freed.

Doug: "HW breakpoints aren't handled by this patch but it's probably
not such a big deal".

Link: https://lkml.kernel.org/r/20210224081652.587785-1-sumit.garg@linaro.org
Signed-off-by: Sumit Garg <sumit.garg@linaro.org>
Suggested-by: Doug Anderson <dianders@chromium.org>
Acked-by: Doug Anderson <dianders@chromium.org>
Acked-by: Daniel Thompson <daniel.thompson@linaro.org>
Tested-by: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kgdb.h      |  2 ++
 init/main.c               |  1 +
 kernel/debug/debug_core.c | 11 +++++++++++
 3 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 0444b44bd156..392a3670944c 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -359,9 +359,11 @@ extern atomic_t			kgdb_active;
 extern bool dbg_is_early;
 extern void __init dbg_late_init(void);
 extern void kgdb_panic(const char *msg);
+extern void kgdb_free_init_mem(void);
 #else /* ! CONFIG_KGDB */
 #define in_dbg_master() (0)
 #define dbg_late_init()
 static inline void kgdb_panic(const char *msg) {}
+static inline void kgdb_free_init_mem(void) { }
 #endif /* ! CONFIG_KGDB */
 #endif /* _KGDB_H_ */
diff --git a/init/main.c b/init/main.c
index 3648c9f94882..53b278845b88 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1426,6 +1426,7 @@ static int __ref kernel_init(void *unused)
 	async_synchronize_full();
 	kprobe_free_init_mem();
 	ftrace_free_init_mem();
+	kgdb_free_init_mem();
 	free_initmem();
 	mark_readonly();
 
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index b636d517c02c..4708aec492df 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -455,6 +455,17 @@ setundefined:
 	return 0;
 }
 
+void kgdb_free_init_mem(void)
+{
+	int i;
+
+	/* Clear init memory breakpoints. */
+	for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+		if (init_section_contains((void *)kgdb_break[i].bpt_addr, 0))
+			kgdb_break[i].state = BP_UNDEFINED;
+	}
+}
+
 #ifdef CONFIG_KGDB_KDB
 void kdb_dump_stack_on_cpu(int cpu)
 {
-- 
cgit v1.2.3


From f685a533a7fab35c5d069dcd663f59c8e4171a75 Mon Sep 17 00:00:00 2001
From: Huang Pei <huangpei@loongson.cn>
Date: Thu, 25 Feb 2021 17:22:49 -0800
Subject: MIPS: make userspace mapping young by default

MIPS page fault path(except huge page) takes 3 exceptions (1 TLB Miss + 2
TLB Invalid), butthe second TLB Invalid exception is just triggered by
__update_tlb from do_page_fault writing tlb without _PAGE_VALID set.  With
this patch, user space mapping prot is made young by default (with both
_PAGE_VALID and _PAGE_YOUNG set), and it only take 1 TLB Miss + 1 TLB
Invalid exception

Remove pte_sw_mkyoung without polluting MM code and make page fault delay
of MIPS on par with other architecture

Link: https://lkml.kernel.org/r/20210204013942.8398-1-huangpei@loongson.cn
Signed-off-by: Huang Pei <huangpei@loongson.cn>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: <huangpei@loongson.cn>
Acked-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: <ambrosehua@gmail.com>
Cc: Bibo Mao <maobibo@loongson.cn>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: Paul Burton <paulburton@kernel.org>
Cc: Li Xuefeng <lixuefeng@loongson.cn>
Cc: Yang Tiezhu <yangtiezhu@loongson.cn>
Cc: Gao Juxin <gaojuxin@loongson.cn>
Cc: Fuxin Zhang <zhangfx@lemote.com>
Cc: Huacai Chen <chenhc@lemote.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/mm/cache.c    | 30 ++++++++++++++++--------------
 include/linux/pgtable.h |  8 --------
 mm/memory.c             |  4 ----
 3 files changed, 16 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index 1754498b0717..7719d632df8d 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -157,29 +157,31 @@ unsigned long _page_cachable_default;
 EXPORT_SYMBOL(_page_cachable_default);
 
 #define PM(p)	__pgprot(_page_cachable_default | (p))
+#define PVA(p)	PM(_PAGE_VALID | _PAGE_ACCESSED | (p))
 
 static inline void setup_protection_map(void)
 {
 	protection_map[0]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ);
-	protection_map[1]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC);
-	protection_map[2]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ);
-	protection_map[3]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC);
-	protection_map[4]  = PM(_PAGE_PRESENT);
-	protection_map[5]  = PM(_PAGE_PRESENT);
-	protection_map[6]  = PM(_PAGE_PRESENT);
-	protection_map[7]  = PM(_PAGE_PRESENT);
+	protection_map[1]  = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC);
+	protection_map[2]  = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ);
+	protection_map[3]  = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC);
+	protection_map[4]  = PVA(_PAGE_PRESENT);
+	protection_map[5]  = PVA(_PAGE_PRESENT);
+	protection_map[6]  = PVA(_PAGE_PRESENT);
+	protection_map[7]  = PVA(_PAGE_PRESENT);
 
 	protection_map[8]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ);
-	protection_map[9]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC);
-	protection_map[10] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE |
+	protection_map[9]  = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC);
+	protection_map[10] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE |
 				_PAGE_NO_READ);
-	protection_map[11] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE);
-	protection_map[12] = PM(_PAGE_PRESENT);
-	protection_map[13] = PM(_PAGE_PRESENT);
-	protection_map[14] = PM(_PAGE_PRESENT | _PAGE_WRITE);
-	protection_map[15] = PM(_PAGE_PRESENT | _PAGE_WRITE);
+	protection_map[11] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE);
+	protection_map[12] = PVA(_PAGE_PRESENT);
+	protection_map[13] = PVA(_PAGE_PRESENT);
+	protection_map[14] = PVA(_PAGE_PRESENT);
+	protection_map[15] = PVA(_PAGE_PRESENT);
 }
 
+#undef _PVA
 #undef PM
 
 void cpu_cache_init(void)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 36eb748f3c97..cdfc4e9f253e 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -432,14 +432,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
  * To be differentiate with macro pte_mkyoung, this macro is used on platforms
  * where software maintains page access bit.
  */
-#ifndef pte_sw_mkyoung
-static inline pte_t pte_sw_mkyoung(pte_t pte)
-{
-	return pte;
-}
-#define pte_sw_mkyoung	pte_sw_mkyoung
-#endif
-
 #ifndef pte_savedwrite
 #define pte_savedwrite pte_write
 #endif
diff --git a/mm/memory.c b/mm/memory.c
index 784249f3307b..c8e357627318 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2902,7 +2902,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		}
 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
-		entry = pte_sw_mkyoung(entry);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
 		/*
@@ -3560,7 +3559,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	__SetPageUptodate(page);
 
 	entry = mk_pte(page, vma->vm_page_prot);
-	entry = pte_sw_mkyoung(entry);
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry));
 
@@ -3745,8 +3743,6 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 
 	if (prefault && arch_wants_old_prefaulted_pte())
 		entry = pte_mkold(entry);
-	else
-		entry = pte_sw_mkyoung(entry);
 
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-- 
cgit v1.2.3


From 5f7136db82996089cdfb2939c7664b29e9da141d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 29 Jan 2021 04:38:57 +0000
Subject: block: Add bio_max_segs

It's often inconvenient to use BIO_MAX_PAGES due to min() requiring the
sign to be the same.  Introduce bio_max_segs() and change BIO_MAX_PAGES to
be unsigned to make it easier for the users.

Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-map.c                     |  4 +---
 drivers/block/xen-blkback/blkback.c |  4 +---
 drivers/md/dm-io.c                  |  4 ++--
 drivers/md/dm-log-writes.c          | 10 +++++-----
 drivers/nvme/target/io-cmd-bdev.c   |  8 ++++----
 drivers/nvme/target/passthru.c      |  4 ++--
 drivers/target/target_core_iblock.c |  9 +++------
 drivers/target/target_core_pscsi.c  |  2 +-
 fs/block_dev.c                      | 10 +++++-----
 fs/direct-io.c                      |  2 +-
 fs/erofs/data.c                     |  4 +---
 fs/ext4/readpage.c                  |  3 +--
 fs/f2fs/data.c                      |  3 +--
 fs/f2fs/node.c                      |  2 +-
 fs/iomap/buffered-io.c              |  4 ++--
 fs/mpage.c                          |  4 +---
 fs/nfs/blocklayout/blocklayout.c    |  6 +++---
 fs/xfs/xfs_bio_io.c                 |  2 +-
 fs/xfs/xfs_buf.c                    |  4 ++--
 include/linux/bio.h                 |  7 ++++++-
 20 files changed, 44 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/block/blk-map.c b/block/blk-map.c
index 21630dccac62..369e204d14d0 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -150,9 +150,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
 	bmd->is_our_pages = !map_data;
 	bmd->is_null_mapped = (map_data && map_data->null_mapped);
 
-	nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
-	if (nr_pages > BIO_MAX_PAGES)
-		nr_pages = BIO_MAX_PAGES;
+	nr_pages = bio_max_segs(DIV_ROUND_UP(offset + len, PAGE_SIZE));
 
 	ret = -ENOMEM;
 	bio = bio_kmalloc(gfp_mask, nr_pages);
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index da16121140ca..1cdf09ff67b6 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -1326,9 +1326,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 				     pages[i]->page,
 				     seg[i].nsec << 9,
 				     seg[i].offset) == 0)) {
-
-			int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
-			bio = bio_alloc(GFP_KERNEL, nr_iovecs);
+			bio = bio_alloc(GFP_KERNEL, bio_max_segs(nseg - i));
 			if (unlikely(bio == NULL))
 				goto fail_put_bio;
 
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 4312007d2d34..2d3cda0acacb 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -341,8 +341,8 @@ static void do_region(int op, int op_flags, unsigned region,
 			num_bvecs = 1;
 			break;
 		default:
-			num_bvecs = min_t(int, BIO_MAX_PAGES,
-					  dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
+			num_bvecs = bio_max_segs(dm_sector_div_up(remaining,
+						(PAGE_SIZE >> SECTOR_SHIFT)));
 		}
 
 		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, &io->client->bios);
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index e3d35c6c9f71..57882654ffee 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -264,15 +264,14 @@ static int write_inline_data(struct log_writes_c *lc, void *entry,
 			     size_t entrylen, void *data, size_t datalen,
 			     sector_t sector)
 {
-	int num_pages, bio_pages, pg_datalen, pg_sectorlen, i;
+	int bio_pages, pg_datalen, pg_sectorlen, i;
 	struct page *page;
 	struct bio *bio;
 	size_t ret;
 	void *ptr;
 
 	while (datalen) {
-		num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT;
-		bio_pages = min(num_pages, BIO_MAX_PAGES);
+		bio_pages = bio_max_segs(DIV_ROUND_UP(datalen, PAGE_SIZE));
 
 		atomic_inc(&lc->io_blocks);
 
@@ -364,7 +363,7 @@ static int log_one_block(struct log_writes_c *lc,
 		goto out;
 
 	atomic_inc(&lc->io_blocks);
-	bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES));
+	bio = bio_alloc(GFP_KERNEL, bio_max_segs(block->vec_cnt));
 	if (!bio) {
 		DMERR("Couldn't alloc log bio");
 		goto error;
@@ -386,7 +385,8 @@ static int log_one_block(struct log_writes_c *lc,
 		if (ret != block->vecs[i].bv_len) {
 			atomic_inc(&lc->io_blocks);
 			submit_bio(bio);
-			bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt - i, BIO_MAX_PAGES));
+			bio = bio_alloc(GFP_KERNEL,
+					bio_max_segs(block->vec_cnt - i));
 			if (!bio) {
 				DMERR("Couldn't alloc log bio");
 				goto error;
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 3d9a5d3ed9cd..9a8b3726a37c 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -185,7 +185,7 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio,
 	}
 
 	bip = bio_integrity_alloc(bio, GFP_NOIO,
-		min_t(unsigned int, req->metadata_sg_cnt, BIO_MAX_PAGES));
+					bio_max_segs(req->metadata_sg_cnt));
 	if (IS_ERR(bip)) {
 		pr_err("Unable to allocate bio_integrity_payload\n");
 		return PTR_ERR(bip);
@@ -225,7 +225,7 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio,
 
 static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 {
-	int sg_cnt = req->sg_cnt;
+	unsigned int sg_cnt = req->sg_cnt;
 	struct bio *bio;
 	struct scatterlist *sg;
 	struct blk_plug plug;
@@ -262,7 +262,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 		bio = &req->b.inline_bio;
 		bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
 	} else {
-		bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
+		bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt));
 	}
 	bio_set_dev(bio, req->ns->bdev);
 	bio->bi_iter.bi_sector = sector;
@@ -289,7 +289,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 				}
 			}
 
-			bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
+			bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt));
 			bio_set_dev(bio, req->ns->bdev);
 			bio->bi_iter.bi_sector = sector;
 			bio->bi_opf = op;
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index f50c7b2bf21c..26c587ccd152 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -26,7 +26,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
 	struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl;
 	u16 status = NVME_SC_SUCCESS;
 	struct nvme_id_ctrl *id;
-	int max_hw_sectors;
+	unsigned int max_hw_sectors;
 	int page_shift;
 
 	id = kzalloc(sizeof(*id), GFP_KERNEL);
@@ -198,7 +198,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
 		bio = &req->p.inline_bio;
 		bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
 	} else {
-		bio = bio_alloc(GFP_KERNEL, min(req->sg_cnt, BIO_MAX_PAGES));
+		bio = bio_alloc(GFP_KERNEL, bio_max_segs(req->sg_cnt));
 		bio->bi_end_io = bio_put;
 	}
 	bio->bi_opf = req_op(rq);
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 8ed93fd205c7..ee3d52061281 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -315,10 +315,8 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num, int op,
 	 * Only allocate as many vector entries as the bio code allows us to,
 	 * we'll loop later on until we have handled the whole request.
 	 */
-	if (sg_num > BIO_MAX_PAGES)
-		sg_num = BIO_MAX_PAGES;
-
-	bio = bio_alloc_bioset(GFP_NOIO, sg_num, &ib_dev->ibd_bio_set);
+	bio = bio_alloc_bioset(GFP_NOIO, bio_max_segs(sg_num),
+				&ib_dev->ibd_bio_set);
 	if (!bio) {
 		pr_err("Unable to allocate memory for bio\n");
 		return NULL;
@@ -638,8 +636,7 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio,
 		return -ENODEV;
 	}
 
-	bip = bio_integrity_alloc(bio, GFP_NOIO,
-			min_t(unsigned int, cmd->t_prot_nents, BIO_MAX_PAGES));
+	bip = bio_integrity_alloc(bio, GFP_NOIO, bio_max_segs(cmd->t_prot_nents));
 	if (IS_ERR(bip)) {
 		pr_err("Unable to allocate bio_integrity_payload\n");
 		return PTR_ERR(bip);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 33770e5808ce..3cbc074992bc 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -881,7 +881,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 
 			if (!bio) {
 new_bio:
-				nr_vecs = min_t(int, BIO_MAX_PAGES, nr_pages);
+				nr_vecs = bio_max_segs(nr_pages);
 				nr_pages -= nr_vecs;
 				/*
 				 * Calls bio_kmalloc() and sets bio->bi_end_io()
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ec26179c8062..0f95ff343d6b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -221,7 +221,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
 
 static ssize_t
 __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
-		int nr_pages)
+		unsigned int nr_pages)
 {
 	struct file *file = iocb->ki_filp;
 	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
@@ -355,8 +355,8 @@ static void blkdev_bio_end_io(struct bio *bio)
 	}
 }
 
-static ssize_t
-__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
+static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+		unsigned int nr_pages)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
@@ -486,7 +486,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 static ssize_t
 blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
-	int nr_pages;
+	unsigned int nr_pages;
 
 	if (!iov_iter_count(iter))
 		return 0;
@@ -495,7 +495,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
 		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
 
-	return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
+	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
 }
 
 static __init int blkdev_init(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index aa1083ecd623..c9639b4166c2 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -695,7 +695,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
 	if (ret)
 		goto out;
 	sector = start_sector << (sdio->blkbits - 9);
-	nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES);
+	nr_pages = bio_max_segs(sdio->pages_in_io);
 	BUG_ON(nr_pages <= 0);
 	dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
 	sdio->boundary = 0;
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index ea4f693bee22..f88851c5c250 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -215,10 +215,8 @@ submit_bio_retry:
 		/* max # of continuous pages */
 		if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
 			nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
-		if (nblocks > BIO_MAX_PAGES)
-			nblocks = BIO_MAX_PAGES;
 
-		bio = bio_alloc(GFP_NOIO, nblocks);
+		bio = bio_alloc(GFP_NOIO, bio_max_segs(nblocks));
 
 		bio->bi_end_io = erofs_readendio;
 		bio_set_dev(bio, sb->s_bdev);
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index f014c5e473a9..3db923403505 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -371,8 +371,7 @@ int ext4_mpage_readpages(struct inode *inode,
 			 * bio_alloc will _always_ be able to allocate a bio if
 			 * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
 			 */
-			bio = bio_alloc(GFP_KERNEL,
-				min_t(int, nr_pages, BIO_MAX_PAGES));
+			bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages));
 			fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
 						  GFP_KERNEL);
 			ext4_set_bio_post_read_ctx(bio, inode, page->index);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b9721c8f116c..7c95818639a6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -969,8 +969,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 	unsigned int post_read_steps = 0;
 
 	bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
-			       min_t(int, nr_pages, BIO_MAX_PAGES),
-			       &f2fs_bioset);
+			       bio_max_segs(nr_pages), &f2fs_bioset);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index a8a0fb890e8d..4b0e2e3c2c88 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2747,7 +2747,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
 	sum_entry = &sum->entries[0];
 
 	for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
-		nrpages = min(last_offset - i, BIO_MAX_PAGES);
+		nrpages = bio_max_segs(last_offset - i);
 
 		/* readahead node pages */
 		f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 16a1e82e3aeb..0d9d1a6a947e 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -278,14 +278,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	if (!is_contig || bio_full(ctx->bio, plen)) {
 		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
 		gfp_t orig_gfp = gfp;
-		int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
 
 		if (ctx->bio)
 			submit_bio(ctx->bio);
 
 		if (ctx->rac) /* same as readahead_gfp_mask */
 			gfp |= __GFP_NORETRY | __GFP_NOWARN;
-		ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
+		ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs));
 		/*
 		 * If the bio_alloc fails, try it again for a single page to
 		 * avoid having to deal with partial page reads.  This emulates
diff --git a/fs/mpage.c b/fs/mpage.c
index 830e6cc2a9e7..961234d68779 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -304,9 +304,7 @@ alloc_new:
 				goto out;
 		}
 		args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
-					min_t(int, args->nr_pages,
-					      BIO_MAX_PAGES),
-					gfp);
+					bio_max_segs(args->nr_pages), gfp);
 		if (args->bio == NULL)
 			goto confused;
 	}
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 1a96ce28efb0..fe860c538747 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -115,13 +115,13 @@ bl_submit_bio(struct bio *bio)
 	return NULL;
 }
 
-static struct bio *
-bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
+static struct bio *bl_alloc_init_bio(unsigned int npg,
+		struct block_device *bdev, sector_t disk_sector,
 		bio_end_io_t end_io, struct parallel_io *par)
 {
 	struct bio *bio;
 
-	npg = min(npg, BIO_MAX_PAGES);
+	npg = bio_max_segs(npg);
 	bio = bio_alloc(GFP_NOIO, npg);
 	if (bio) {
 		bio->bi_iter.bi_sector = disk_sector;
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index e2148f2d5d6b..17f36db2f792 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -6,7 +6,7 @@
 
 static inline unsigned int bio_max_vecs(unsigned int count)
 {
-	return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES);
+	return bio_max_segs(howmany(count, PAGE_SIZE));
 }
 
 int
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f6e5235df7c9..37a1d12762d8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1480,7 +1480,7 @@ xfs_buf_ioapply_map(
 	int		op)
 {
 	int		page_index;
-	int		total_nr_pages = bp->b_page_count;
+	unsigned int	total_nr_pages = bp->b_page_count;
 	int		nr_pages;
 	struct bio	*bio;
 	sector_t	sector =  bp->b_maps[map].bm_bn;
@@ -1505,7 +1505,7 @@ xfs_buf_ioapply_map(
 
 next_chunk:
 	atomic_inc(&bp->b_io_remaining);
-	nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
+	nr_pages = bio_max_segs(total_nr_pages);
 
 	bio = bio_alloc(GFP_NOIO, nr_pages);
 	bio_set_dev(bio, bp->b_target->bt_bdev);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5b468f2242ff..983ed2fe7c85 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -20,7 +20,12 @@
 #define BIO_BUG_ON
 #endif
 
-#define BIO_MAX_PAGES		256
+#define BIO_MAX_PAGES		256U
+
+static inline unsigned int bio_max_segs(unsigned int nr_segs)
+{
+	return min(nr_segs, BIO_MAX_PAGES);
+}
 
 #define bio_prio(bio)			(bio)->bi_ioprio
 #define bio_set_prio(bio, prio)		((bio)->bi_ioprio = prio)
-- 
cgit v1.2.3


From 5218e12e9f3a324f41c05da4874d76d7ea3677cb Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Mon, 1 Mar 2021 17:23:25 +0100
Subject: block: Drop leftover references to RQF_SORTED

Commit a1ce35fa49852db60fc6e268038530be533c5b15 ("block: remove dead
elevator code") removed all users of RQF_SORTED. However it is still
defined, and there is one reference left to it (which in effect is
dead code). Clear it all up.

Signed-off-by: Jean Delvare <jdelvare@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c | 1 -
 block/blk-mq-sched.c   | 6 +-----
 include/linux/blkdev.h | 2 --
 3 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4de03da9a624..9ebb344e2585 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -292,7 +292,6 @@ static const char *const cmd_flag_name[] = {
 
 #define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name
 static const char *const rqf_name[] = {
-	RQF_NAME(SORTED),
 	RQF_NAME(STARTED),
 	RQF_NAME(SOFTBARRIER),
 	RQF_NAME(FLUSH_SEQ),
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ddb65e9e6fd9..e1e997af89a0 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -385,7 +385,6 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
 
 static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
-				       bool has_sched,
 				       struct request *rq)
 {
 	/*
@@ -402,9 +401,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
 	if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
 		return true;
 
-	if (has_sched)
-		rq->rq_flags |= RQF_SORTED;
-
 	return false;
 }
 
@@ -418,7 +414,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
 
 	WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));
 
-	if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
+	if (blk_mq_sched_bypass_insert(hctx, rq)) {
 		/*
 		 * Firstly normal IO request is inserted to scheduler queue or
 		 * sw queue, meantime we add flush request to dispatch queue(
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c032cfe133c7..bc6bc8383b43 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -65,8 +65,6 @@ typedef void (rq_end_io_fn)(struct request *, blk_status_t);
  * request flags */
 typedef __u32 __bitwise req_flags_t;
 
-/* elevator knows about this request */
-#define RQF_SORTED		((__force req_flags_t)(1 << 0))
 /* drive already may have started this one */
 #define RQF_STARTED		((__force req_flags_t)(1 << 1))
 /* may not be passed by ioscheduler */
-- 
cgit v1.2.3


From a864e8f159b13babf552aff14a5fbe11abc017e4 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 1 Mar 2021 18:01:46 -0600
Subject: ALSA: hda: intel-nhlt: verify config type

Multiple bug reports report issues with the SOF and SST drivers when
dealing with single microphone cases.

We currently read the DMIC array information unconditionally but we
don't check that the configuration type is actually a mic array.

When the DMIC link does not rely on a mic array configuration, the
recommendation is to check the format information to infer the maximum
number of channels, and map this to the number of microphones.

This leaves a potential for a mismatch between actual microphones
available in hardware and what the ACPI table contains, but we have no
other source of information.

Note that single microphone configurations can alternatively be
handled with a 'mic array' configuration along with a 'vendor-defined'
geometry.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=201251
BugLink: https://github.com/thesofproject/linux/issues/2725
Fixes: 7a33ea70e1868 ('ALSA: hda: intel-nhlt: handle NHLT VENDOR_DEFINED DMIC geometry')
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Guennadi Liakhovetski <guennadi.liakhovetski@intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Link: https://lore.kernel.org/r/20210302000146.1177770-1-pierre-louis.bossart@linux.intel.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/intel-nhlt.h |  5 +++++
 sound/hda/intel-nhlt.c     | 54 ++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 50 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/sound/intel-nhlt.h b/include/sound/intel-nhlt.h
index 743c2f442280..d0574805865f 100644
--- a/include/sound/intel-nhlt.h
+++ b/include/sound/intel-nhlt.h
@@ -112,6 +112,11 @@ struct nhlt_vendor_dmic_array_config {
 	/* TODO add vendor mic config */
 } __packed;
 
+enum {
+	NHLT_CONFIG_TYPE_GENERIC = 0,
+	NHLT_CONFIG_TYPE_MIC_ARRAY = 1
+};
+
 enum {
 	NHLT_MIC_ARRAY_2CH_SMALL = 0xa,
 	NHLT_MIC_ARRAY_2CH_BIG = 0xb,
diff --git a/sound/hda/intel-nhlt.c b/sound/hda/intel-nhlt.c
index 059aaf04f536..d053beccfaec 100644
--- a/sound/hda/intel-nhlt.c
+++ b/sound/hda/intel-nhlt.c
@@ -31,18 +31,44 @@ int intel_nhlt_get_dmic_geo(struct device *dev, struct nhlt_acpi_table *nhlt)
 	struct nhlt_endpoint *epnt;
 	struct nhlt_dmic_array_config *cfg;
 	struct nhlt_vendor_dmic_array_config *cfg_vendor;
+	struct nhlt_fmt *fmt_configs;
 	unsigned int dmic_geo = 0;
-	u8 j;
+	u16 max_ch = 0;
+	u8 i, j;
 
 	if (!nhlt)
 		return 0;
 
-	epnt = (struct nhlt_endpoint *)nhlt->desc;
+	for (j = 0, epnt = nhlt->desc; j < nhlt->endpoint_count; j++,
+	     epnt = (struct nhlt_endpoint *)((u8 *)epnt + epnt->length)) {
 
-	for (j = 0; j < nhlt->endpoint_count; j++) {
-		if (epnt->linktype == NHLT_LINK_DMIC) {
-			cfg = (struct nhlt_dmic_array_config  *)
-					(epnt->config.caps);
+		if (epnt->linktype != NHLT_LINK_DMIC)
+			continue;
+
+		cfg = (struct nhlt_dmic_array_config  *)(epnt->config.caps);
+		fmt_configs = (struct nhlt_fmt *)(epnt->config.caps + epnt->config.size);
+
+		/* find max number of channels based on format_configuration */
+		if (fmt_configs->fmt_count) {
+			dev_dbg(dev, "%s: found %d format definitions\n",
+				__func__, fmt_configs->fmt_count);
+
+			for (i = 0; i < fmt_configs->fmt_count; i++) {
+				struct wav_fmt_ext *fmt_ext;
+
+				fmt_ext = &fmt_configs->fmt_config[i].fmt_ext;
+
+				if (fmt_ext->fmt.channels > max_ch)
+					max_ch = fmt_ext->fmt.channels;
+			}
+			dev_dbg(dev, "%s: max channels found %d\n", __func__, max_ch);
+		} else {
+			dev_dbg(dev, "%s: No format information found\n", __func__);
+		}
+
+		if (cfg->device_config.config_type != NHLT_CONFIG_TYPE_MIC_ARRAY) {
+			dmic_geo = max_ch;
+		} else {
 			switch (cfg->array_type) {
 			case NHLT_MIC_ARRAY_2CH_SMALL:
 			case NHLT_MIC_ARRAY_2CH_BIG:
@@ -59,13 +85,23 @@ int intel_nhlt_get_dmic_geo(struct device *dev, struct nhlt_acpi_table *nhlt)
 				dmic_geo = cfg_vendor->nb_mics;
 				break;
 			default:
-				dev_warn(dev, "undefined DMIC array_type 0x%0x\n",
-					 cfg->array_type);
+				dev_warn(dev, "%s: undefined DMIC array_type 0x%0x\n",
+					 __func__, cfg->array_type);
+			}
+
+			if (dmic_geo > 0) {
+				dev_dbg(dev, "%s: Array with %d dmics\n", __func__, dmic_geo);
+			}
+			if (max_ch > dmic_geo) {
+				dev_dbg(dev, "%s: max channels %d exceed dmic number %d\n",
+					__func__, max_ch, dmic_geo);
 			}
 		}
-		epnt = (struct nhlt_endpoint *)((u8 *)epnt + epnt->length);
 	}
 
+	dev_dbg(dev, "%s: dmic number %d max_ch %d\n",
+		__func__, dmic_geo, max_ch);
+
 	return dmic_geo;
 }
 EXPORT_SYMBOL_GPL(intel_nhlt_get_dmic_geo);
-- 
cgit v1.2.3


From c7929b15b6e926c7150d9ec64844aceecf8a7a4a Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 1 Mar 2021 18:31:19 -0600
Subject: ASoC: soc-acpi: allow for partial match in parent name

To change the module dependencies and simplify Kconfigs, we need to
introduce new driver names (sof-audio-acpi-intel-byt and
sof-audio-acpi-intel-bdw), and move from an exact string match to a
partial one.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Reviewed-by: Guennadi Liakhovetski <guennadi.liakhovetski@linux.intel.com>
Reviewed-by: Bard Liao <bard.liao@intel.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Vinod Koul <vkoul@kernel.org>
Link: https://lore.kernel.org/r/20210302003125.1178419-2-pierre-louis.bossart@linux.intel.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/soc-acpi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h
index 9a43c44dcbbb..c45075024c30 100644
--- a/include/sound/soc-acpi.h
+++ b/include/sound/soc-acpi.h
@@ -174,7 +174,7 @@ struct snd_soc_acpi_codecs {
 static inline bool snd_soc_acpi_sof_parent(struct device *dev)
 {
 	return dev->parent && dev->parent->driver && dev->parent->driver->name &&
-		!strcmp(dev->parent->driver->name, "sof-audio-acpi");
+		!strncmp(dev->parent->driver->name, "sof-audio-acpi", strlen("sof-audio-acpi"));
 }
 
 #endif
-- 
cgit v1.2.3


From 08c2a4bc9f2acaefbd0158866db5cb3238a68674 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 1 Mar 2021 18:31:24 -0600
Subject: ALSA: hda: move Intel SoundWire ACPI scan to dedicated module

The ACPI scan capabilities is called from the intel-dspconfig as well
as the SOF/HDaudio drivers. This creates dependencies and randconfig issues
when HDaudio and SOF/SoundWire are not all configured as modules.

To simplify Kconfig dependencies between HDAudio, SoundWire, SOF and
intel-dspconfig, move the ACPI scan helpers to a dedicated
module. This follows the same idea as NHLT helpers which are already
handled as a dedicated module.

The only functional change is that the kernel parameter to filter
links is now handled by a different module, but that was only provided
for developers needing work-arounds for early BIOS releases.

Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Reviewed-by: Guennadi Liakhovetski <guennadi.liakhovetski@linux.intel.com>
Reviewed-by: Bard Liao <bard.liao@intel.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Vinod Koul <vkoul@kernel.org>
Link: https://lore.kernel.org/r/20210302003125.1178419-7-pierre-louis.bossart@linux.intel.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 drivers/soundwire/intel.h           |   2 -
 drivers/soundwire/intel_init.c      | 158 --------------------------------
 include/linux/soundwire/sdw_intel.h |   2 +
 sound/hda/Kconfig                   |   4 +
 sound/hda/Makefile                  |   3 +
 sound/hda/intel-dsp-config.c        |   2 +-
 sound/hda/intel-sdw-acpi.c          | 174 ++++++++++++++++++++++++++++++++++++
 sound/soc/sof/intel/Kconfig         |   1 +
 sound/soc/sof/intel/hda.c           |   1 +
 9 files changed, 186 insertions(+), 161 deletions(-)
 create mode 100644 sound/hda/intel-sdw-acpi.c

(limited to 'include')

diff --git a/drivers/soundwire/intel.h b/drivers/soundwire/intel.h
index 76820d0b9deb..06bac8ba14e9 100644
--- a/drivers/soundwire/intel.h
+++ b/drivers/soundwire/intel.h
@@ -48,8 +48,6 @@ struct sdw_intel {
 #endif
 };
 
-#define SDW_INTEL_QUIRK_MASK_BUS_DISABLE      BIT(1)
-
 int intel_master_startup(struct platform_device *pdev);
 int intel_master_process_wakeen_event(struct platform_device *pdev);
 
diff --git a/drivers/soundwire/intel_init.c b/drivers/soundwire/intel_init.c
index bc8520eb385e..05b726cdfebc 100644
--- a/drivers/soundwire/intel_init.c
+++ b/drivers/soundwire/intel_init.c
@@ -18,42 +18,12 @@
 #include "cadence_master.h"
 #include "intel.h"
 
-#define SDW_LINK_TYPE		4 /* from Intel ACPI documentation */
-#define SDW_MAX_LINKS		4
 #define SDW_SHIM_LCAP		0x0
 #define SDW_SHIM_BASE		0x2C000
 #define SDW_ALH_BASE		0x2C800
 #define SDW_LINK_BASE		0x30000
 #define SDW_LINK_SIZE		0x10000
 
-static int ctrl_link_mask;
-module_param_named(sdw_link_mask, ctrl_link_mask, int, 0444);
-MODULE_PARM_DESC(sdw_link_mask, "Intel link mask (one bit per link)");
-
-static bool is_link_enabled(struct fwnode_handle *fw_node, int i)
-{
-	struct fwnode_handle *link;
-	char name[32];
-	u32 quirk_mask = 0;
-
-	/* Find master handle */
-	snprintf(name, sizeof(name),
-		 "mipi-sdw-link-%d-subproperties", i);
-
-	link = fwnode_get_named_child_node(fw_node, name);
-	if (!link)
-		return false;
-
-	fwnode_property_read_u32(link,
-				 "intel-quirk-mask",
-				 &quirk_mask);
-
-	if (quirk_mask & SDW_INTEL_QUIRK_MASK_BUS_DISABLE)
-		return false;
-
-	return true;
-}
-
 static int sdw_intel_cleanup(struct sdw_intel_ctx *ctx)
 {
 	struct sdw_intel_link_res *link = ctx->links;
@@ -81,74 +51,6 @@ static int sdw_intel_cleanup(struct sdw_intel_ctx *ctx)
 	return 0;
 }
 
-static int
-sdw_intel_scan_controller(struct sdw_intel_acpi_info *info)
-{
-	struct acpi_device *adev;
-	int ret, i;
-	u8 count;
-
-	if (acpi_bus_get_device(info->handle, &adev))
-		return -EINVAL;
-
-	/* Found controller, find links supported */
-	count = 0;
-	ret = fwnode_property_read_u8_array(acpi_fwnode_handle(adev),
-					    "mipi-sdw-master-count", &count, 1);
-
-	/*
-	 * In theory we could check the number of links supported in
-	 * hardware, but in that step we cannot assume SoundWire IP is
-	 * powered.
-	 *
-	 * In addition, if the BIOS doesn't even provide this
-	 * 'master-count' property then all the inits based on link
-	 * masks will fail as well.
-	 *
-	 * We will check the hardware capabilities in the startup() step
-	 */
-
-	if (ret) {
-		dev_err(&adev->dev,
-			"Failed to read mipi-sdw-master-count: %d\n", ret);
-		return -EINVAL;
-	}
-
-	/* Check count is within bounds */
-	if (count > SDW_MAX_LINKS) {
-		dev_err(&adev->dev, "Link count %d exceeds max %d\n",
-			count, SDW_MAX_LINKS);
-		return -EINVAL;
-	}
-
-	if (!count) {
-		dev_warn(&adev->dev, "No SoundWire links detected\n");
-		return -EINVAL;
-	}
-	dev_dbg(&adev->dev, "ACPI reports %d SDW Link devices\n", count);
-
-	info->count = count;
-	info->link_mask = 0;
-
-	for (i = 0; i < count; i++) {
-		if (ctrl_link_mask && !(ctrl_link_mask & BIT(i))) {
-			dev_dbg(&adev->dev,
-				"Link %d masked, will not be enabled\n", i);
-			continue;
-		}
-
-		if (!is_link_enabled(acpi_fwnode_handle(adev), i)) {
-			dev_dbg(&adev->dev,
-				"Link %d not selected in firmware\n", i);
-			continue;
-		}
-
-		info->link_mask |= BIT(i);
-	}
-
-	return 0;
-}
-
 #define HDA_DSP_REG_ADSPIC2             (0x10)
 #define HDA_DSP_REG_ADSPIS2             (0x14)
 #define HDA_DSP_REG_ADSPIC2_SNDW        BIT(5)
@@ -357,66 +259,6 @@ sdw_intel_startup_controller(struct sdw_intel_ctx *ctx)
 	return 0;
 }
 
-static acpi_status sdw_intel_acpi_cb(acpi_handle handle, u32 level,
-				     void *cdata, void **return_value)
-{
-	struct sdw_intel_acpi_info *info = cdata;
-	struct acpi_device *adev;
-	acpi_status status;
-	u64 adr;
-
-	status = acpi_evaluate_integer(handle, METHOD_NAME__ADR, NULL, &adr);
-	if (ACPI_FAILURE(status))
-		return AE_OK; /* keep going */
-
-	if (acpi_bus_get_device(handle, &adev)) {
-		pr_err("%s: Couldn't find ACPI handle\n", __func__);
-		return AE_NOT_FOUND;
-	}
-
-	info->handle = handle;
-
-	/*
-	 * On some Intel platforms, multiple children of the HDAS
-	 * device can be found, but only one of them is the SoundWire
-	 * controller. The SNDW device is always exposed with
-	 * Name(_ADR, 0x40000000), with bits 31..28 representing the
-	 * SoundWire link so filter accordingly
-	 */
-	if (FIELD_GET(GENMASK(31, 28), adr) != SDW_LINK_TYPE)
-		return AE_OK; /* keep going */
-
-	/* device found, stop namespace walk */
-	return AE_CTRL_TERMINATE;
-}
-
-/**
- * sdw_intel_acpi_scan() - SoundWire Intel init routine
- * @parent_handle: ACPI parent handle
- * @info: description of what firmware/DSDT tables expose
- *
- * This scans the namespace and queries firmware to figure out which
- * links to enable. A follow-up use of sdw_intel_probe() and
- * sdw_intel_startup() is required for creation of devices and bus
- * startup
- */
-int sdw_intel_acpi_scan(acpi_handle *parent_handle,
-			struct sdw_intel_acpi_info *info)
-{
-	acpi_status status;
-
-	info->handle = NULL;
-	status = acpi_walk_namespace(ACPI_TYPE_DEVICE,
-				     parent_handle, 1,
-				     sdw_intel_acpi_cb,
-				     NULL, info, NULL);
-	if (ACPI_FAILURE(status) || info->handle == NULL)
-		return -ENODEV;
-
-	return sdw_intel_scan_controller(info);
-}
-EXPORT_SYMBOL_NS(sdw_intel_acpi_scan, SOUNDWIRE_INTEL_INIT);
-
 /**
  * sdw_intel_probe() - SoundWire Intel probe routine
  * @res: resource data
diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 120ffddc03d2..3a5446ac014a 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -187,4 +187,6 @@ void sdw_intel_enable_irq(void __iomem *mmio_base, bool enable);
 
 irqreturn_t sdw_intel_thread(int irq, void *dev_id);
 
+#define SDW_INTEL_QUIRK_MASK_BUS_DISABLE      BIT(1)
+
 #endif
diff --git a/sound/hda/Kconfig b/sound/hda/Kconfig
index 9ed5cfa3c18c..57595f1552c9 100644
--- a/sound/hda/Kconfig
+++ b/sound/hda/Kconfig
@@ -44,9 +44,13 @@ config SND_INTEL_NHLT
 config SND_INTEL_DSP_CONFIG
 	tristate
 	select SND_INTEL_NHLT if ACPI
+	select SND_INTEL_SOUNDWIRE_ACPI if ACPI
 	# this config should be selected only for Intel DSP platforms.
 	# A fallback is provided so that the code compiles in all cases.
 
+config SND_INTEL_SOUNDWIRE_ACPI
+	tristate
+
 config SND_INTEL_BYT_PREFER_SOF
 	bool "Prefer SOF driver over SST on BY/CHT platforms"
 	depends on SND_SST_ATOM_HIFI2_PLATFORM_ACPI && SND_SOC_SOF_BAYTRAIL
diff --git a/sound/hda/Makefile b/sound/hda/Makefile
index 601e617918b8..78f487a635f8 100644
--- a/sound/hda/Makefile
+++ b/sound/hda/Makefile
@@ -17,3 +17,6 @@ obj-$(CONFIG_SND_HDA_EXT_CORE) += ext/
 snd-intel-dspcfg-objs := intel-dsp-config.o
 snd-intel-dspcfg-$(CONFIG_SND_INTEL_NHLT) += intel-nhlt.o
 obj-$(CONFIG_SND_INTEL_DSP_CONFIG) += snd-intel-dspcfg.o
+
+snd-intel-sdw-acpi-objs := intel-sdw-acpi.o
+obj-$(CONFIG_SND_INTEL_SOUNDWIRE_ACPI) += snd-intel-sdw-acpi.o
diff --git a/sound/hda/intel-dsp-config.c b/sound/hda/intel-dsp-config.c
index d1eb9d34993a..ab5ff7867eb9 100644
--- a/sound/hda/intel-dsp-config.c
+++ b/sound/hda/intel-dsp-config.c
@@ -557,4 +557,4 @@ EXPORT_SYMBOL_GPL(snd_intel_acpi_dsp_driver_probe);
 
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("Intel DSP config driver");
-MODULE_IMPORT_NS(SOUNDWIRE_INTEL_INIT);
+MODULE_IMPORT_NS(SND_INTEL_SOUNDWIRE_ACPI);
diff --git a/sound/hda/intel-sdw-acpi.c b/sound/hda/intel-sdw-acpi.c
new file mode 100644
index 000000000000..6359936a1503
--- /dev/null
+++ b/sound/hda/intel-sdw-acpi.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2015-2021 Intel Corporation.
+
+/*
+ * SDW Intel ACPI scan helpers
+ */
+
+#include <linux/acpi.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/soundwire/sdw_intel.h>
+#include <linux/string.h>
+
+#define SDW_LINK_TYPE		4 /* from Intel ACPI documentation */
+#define SDW_MAX_LINKS		4
+
+static int ctrl_link_mask;
+module_param_named(sdw_link_mask, ctrl_link_mask, int, 0444);
+MODULE_PARM_DESC(sdw_link_mask, "Intel link mask (one bit per link)");
+
+static bool is_link_enabled(struct fwnode_handle *fw_node, int i)
+{
+	struct fwnode_handle *link;
+	char name[32];
+	u32 quirk_mask = 0;
+
+	/* Find master handle */
+	snprintf(name, sizeof(name),
+		 "mipi-sdw-link-%d-subproperties", i);
+
+	link = fwnode_get_named_child_node(fw_node, name);
+	if (!link)
+		return false;
+
+	fwnode_property_read_u32(link,
+				 "intel-quirk-mask",
+				 &quirk_mask);
+
+	if (quirk_mask & SDW_INTEL_QUIRK_MASK_BUS_DISABLE)
+		return false;
+
+	return true;
+}
+
+static int
+sdw_intel_scan_controller(struct sdw_intel_acpi_info *info)
+{
+	struct acpi_device *adev;
+	int ret, i;
+	u8 count;
+
+	if (acpi_bus_get_device(info->handle, &adev))
+		return -EINVAL;
+
+	/* Found controller, find links supported */
+	count = 0;
+	ret = fwnode_property_read_u8_array(acpi_fwnode_handle(adev),
+					    "mipi-sdw-master-count", &count, 1);
+
+	/*
+	 * In theory we could check the number of links supported in
+	 * hardware, but in that step we cannot assume SoundWire IP is
+	 * powered.
+	 *
+	 * In addition, if the BIOS doesn't even provide this
+	 * 'master-count' property then all the inits based on link
+	 * masks will fail as well.
+	 *
+	 * We will check the hardware capabilities in the startup() step
+	 */
+
+	if (ret) {
+		dev_err(&adev->dev,
+			"Failed to read mipi-sdw-master-count: %d\n", ret);
+		return -EINVAL;
+	}
+
+	/* Check count is within bounds */
+	if (count > SDW_MAX_LINKS) {
+		dev_err(&adev->dev, "Link count %d exceeds max %d\n",
+			count, SDW_MAX_LINKS);
+		return -EINVAL;
+	}
+
+	if (!count) {
+		dev_warn(&adev->dev, "No SoundWire links detected\n");
+		return -EINVAL;
+	}
+	dev_dbg(&adev->dev, "ACPI reports %d SDW Link devices\n", count);
+
+	info->count = count;
+	info->link_mask = 0;
+
+	for (i = 0; i < count; i++) {
+		if (ctrl_link_mask && !(ctrl_link_mask & BIT(i))) {
+			dev_dbg(&adev->dev,
+				"Link %d masked, will not be enabled\n", i);
+			continue;
+		}
+
+		if (!is_link_enabled(acpi_fwnode_handle(adev), i)) {
+			dev_dbg(&adev->dev,
+				"Link %d not selected in firmware\n", i);
+			continue;
+		}
+
+		info->link_mask |= BIT(i);
+	}
+
+	return 0;
+}
+
+static acpi_status sdw_intel_acpi_cb(acpi_handle handle, u32 level,
+				     void *cdata, void **return_value)
+{
+	struct sdw_intel_acpi_info *info = cdata;
+	struct acpi_device *adev;
+	acpi_status status;
+	u64 adr;
+
+	status = acpi_evaluate_integer(handle, METHOD_NAME__ADR, NULL, &adr);
+	if (ACPI_FAILURE(status))
+		return AE_OK; /* keep going */
+
+	if (acpi_bus_get_device(handle, &adev)) {
+		pr_err("%s: Couldn't find ACPI handle\n", __func__);
+		return AE_NOT_FOUND;
+	}
+
+	info->handle = handle;
+
+	/*
+	 * On some Intel platforms, multiple children of the HDAS
+	 * device can be found, but only one of them is the SoundWire
+	 * controller. The SNDW device is always exposed with
+	 * Name(_ADR, 0x40000000), with bits 31..28 representing the
+	 * SoundWire link so filter accordingly
+	 */
+	if (FIELD_GET(GENMASK(31, 28), adr) != SDW_LINK_TYPE)
+		return AE_OK; /* keep going */
+
+	/* device found, stop namespace walk */
+	return AE_CTRL_TERMINATE;
+}
+
+/**
+ * sdw_intel_acpi_scan() - SoundWire Intel init routine
+ * @parent_handle: ACPI parent handle
+ * @info: description of what firmware/DSDT tables expose
+ *
+ * This scans the namespace and queries firmware to figure out which
+ * links to enable. A follow-up use of sdw_intel_probe() and
+ * sdw_intel_startup() is required for creation of devices and bus
+ * startup
+ */
+int sdw_intel_acpi_scan(acpi_handle *parent_handle,
+			struct sdw_intel_acpi_info *info)
+{
+	acpi_status status;
+
+	info->handle = NULL;
+	status = acpi_walk_namespace(ACPI_TYPE_DEVICE,
+				     parent_handle, 1,
+				     sdw_intel_acpi_cb,
+				     NULL, info, NULL);
+	if (ACPI_FAILURE(status) || info->handle == NULL)
+		return -ENODEV;
+
+	return sdw_intel_scan_controller(info);
+}
+EXPORT_SYMBOL_NS(sdw_intel_acpi_scan, SND_INTEL_SOUNDWIRE_ACPI);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Intel Soundwire ACPI helpers");
diff --git a/sound/soc/sof/intel/Kconfig b/sound/soc/sof/intel/Kconfig
index 21e24a3c64fb..da1c396f529d 100644
--- a/sound/soc/sof/intel/Kconfig
+++ b/sound/soc/sof/intel/Kconfig
@@ -286,6 +286,7 @@ config SND_SOC_SOF_INTEL_SOUNDWIRE
 	depends on ACPI && SOUNDWIRE
 	depends on !(SOUNDWIRE=m && SND_SOC_SOF_INTEL_SOUNDWIRE_LINK_BASELINE=y)
 	select SOUNDWIRE_INTEL
+	select SND_INTEL_SOUNDWIRE_ACPI
 	help
 	  This adds support for SoundWire with Sound Open Firmware
 	  for Intel(R) platforms.
diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c
index 995a8c427177..1d29b1fd6a94 100644
--- a/sound/soc/sof/intel/hda.c
+++ b/sound/soc/sof/intel/hda.c
@@ -1279,4 +1279,5 @@ MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV);
 MODULE_IMPORT_NS(SND_SOC_SOF_HDA_AUDIO_CODEC);
 MODULE_IMPORT_NS(SND_SOC_SOF_HDA_AUDIO_CODEC_I915);
 MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA);
+MODULE_IMPORT_NS(SND_INTEL_SOUNDWIRE_ACPI);
 MODULE_IMPORT_NS(SOUNDWIRE_INTEL_INIT);
-- 
cgit v1.2.3


From 30b5c851af7991ad08abe90c1e7c31615fa98a1a Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Mon, 1 Mar 2021 12:53:09 +0000
Subject: KVM: x86/xen: Add support for vCPU runstate information

This is how Xen guests do steal time accounting. The hypervisor records
the amount of time spent in each of running/runnable/blocked/offline
states.

In the Xen accounting, a vCPU is still in state RUNSTATE_running while
in Xen for a hypercall or I/O trap, etc. Only if Xen explicitly schedules
does the state become RUNSTATE_blocked. In KVM this means that even when
the vCPU exits the kvm_run loop, the state remains RUNSTATE_running.

The VMM can explicitly set the vCPU to RUNSTATE_blocked by using the
KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT attribute, and can also use
KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST to retrospectively add a given
amount of time to the blocked state and subtract it from the running
state.

The state_entry_time corresponds to get_kvmclock_ns() at the time the
vCPU entered the current state, and the total times of all four states
should always add up to state_entry_time.

Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Message-Id: <20210301125309.874953-2-dwmw2@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst                     |  41 +++
 arch/x86/include/asm/kvm_host.h                    |   6 +
 arch/x86/kvm/x86.c                                 |  13 +-
 arch/x86/kvm/xen.c                                 | 286 +++++++++++++++++++++
 arch/x86/kvm/xen.h                                 |  40 ++-
 include/uapi/linux/kvm.h                           |  13 +
 .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 159 +++++++++++-
 7 files changed, 553 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 359435d4e417..1a2b5210cdbf 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4878,6 +4878,14 @@ see KVM_XEN_HVM_SET_ATTR above.
 	union {
 		__u64 gpa;
 		__u64 pad[4];
+		struct {
+			__u64 state;
+			__u64 state_entry_time;
+			__u64 time_running;
+			__u64 time_runnable;
+			__u64 time_blocked;
+			__u64 time_offline;
+		} runstate;
 	} u;
   };
 
@@ -4890,6 +4898,31 @@ KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO
   Sets the guest physical address of an additional pvclock structure
   for a given vCPU. This is typically used for guest vsyscall support.
 
+KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR
+  Sets the guest physical address of the vcpu_runstate_info for a given
+  vCPU. This is how a Xen guest tracks CPU state such as steal time.
+
+KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT
+  Sets the runstate (RUNSTATE_running/_runnable/_blocked/_offline) of
+  the given vCPU from the .u.runstate.state member of the structure.
+  KVM automatically accounts running and runnable time but blocked
+  and offline states are only entered explicitly.
+
+KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA
+  Sets all fields of the vCPU runstate data from the .u.runstate member
+  of the structure, including the current runstate. The state_entry_time
+  must equal the sum of the other four times.
+
+KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST
+  This *adds* the contents of the .u.runstate members of the structure
+  to the corresponding members of the given vCPU's runstate data, thus
+  permitting atomic adjustments to the runstate times. The adjustment
+  to the state_entry_time must equal the sum of the adjustments to the
+  other four times. The state field must be set to -1, or to a valid
+  runstate value (RUNSTATE_running, RUNSTATE_runnable, RUNSTATE_blocked
+  or RUNSTATE_offline) to set the current accounted state as of the
+  adjusted state_entry_time.
+
 4.130 KVM_XEN_VCPU_GET_ATTR
 ---------------------------
 
@@ -4902,6 +4935,9 @@ KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO
 Allows Xen vCPU attributes to be read. For the structure and types,
 see KVM_XEN_VCPU_SET_ATTR above.
 
+The KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST type may not be used
+with the KVM_XEN_VCPU_GET_ATTR ioctl.
+
 5. The kvm_run structure
 ========================
 
@@ -6666,6 +6702,7 @@ PVHVM guests. Valid flags are::
   #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR	(1 << 0)
   #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
   #define KVM_XEN_HVM_CONFIG_SHARED_INFO	(1 << 2)
+  #define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 2)
 
 The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
 ioctl is available, for the guest to set its hypercall page.
@@ -6680,3 +6717,7 @@ KVM_XEN_HVM_SET_ATTR, KVM_XEN_HVM_GET_ATTR, KVM_XEN_VCPU_SET_ATTR and
 KVM_XEN_VCPU_GET_ATTR ioctls, as well as the delivery of exception vectors
 for event channel upcalls when the evtchn_upcall_pending field of a vcpu's
 vcpu_info is set.
+
+The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related
+features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are
+supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 85ccbd4b7c52..877a4025d8da 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -535,10 +535,16 @@ struct kvm_vcpu_hv {
 /* Xen HVM per vcpu emulation context */
 struct kvm_vcpu_xen {
 	u64 hypercall_rip;
+	u32 current_runstate;
 	bool vcpu_info_set;
 	bool vcpu_time_info_set;
+	bool runstate_set;
 	struct gfn_to_hva_cache vcpu_info_cache;
 	struct gfn_to_hva_cache vcpu_time_info_cache;
+	struct gfn_to_hva_cache runstate_cache;
+	u64 last_steal;
+	u64 runstate_entry_time;
+	u64 runstate_times[4];
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4a5ce57b0bb2..868213ca4f98 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2956,6 +2956,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 	struct kvm_host_map map;
 	struct kvm_steal_time *st;
 
+	if (kvm_xen_msr_enabled(vcpu->kvm)) {
+		kvm_xen_runstate_set_running(vcpu);
+		return;
+	}
+
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
@@ -3760,6 +3765,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
 		    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
 		    KVM_XEN_HVM_CONFIG_SHARED_INFO;
+		if (sched_info_on())
+			r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
 		break;
 #endif
 	case KVM_CAP_SYNC_REGS:
@@ -4039,7 +4046,11 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	if (vcpu->preempted && !vcpu->arch.guest_state_protected)
 		vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
 
-	kvm_steal_time_set_preempted(vcpu);
+	if (kvm_xen_msr_enabled(vcpu->kvm))
+		kvm_xen_runstate_set_preempted(vcpu);
+	else
+		kvm_steal_time_set_preempted(vcpu);
+
 	static_call(kvm_x86_vcpu_put)(vcpu);
 	vcpu->arch.last_host_tsc = rdtsc();
 	/*
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 77b20ff09078..ae17250e1efe 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -11,9 +11,11 @@
 #include "hyperv.h"
 
 #include <linux/kvm_host.h>
+#include <linux/sched/stat.h>
 
 #include <trace/events/kvm.h>
 #include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
 
 #include "trace.h"
 
@@ -61,6 +63,132 @@ out:
 	return ret;
 }
 
+static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
+{
+	struct kvm_vcpu_xen *vx = &v->arch.xen;
+	u64 now = get_kvmclock_ns(v->kvm);
+	u64 delta_ns = now - vx->runstate_entry_time;
+	u64 run_delay = current->sched_info.run_delay;
+
+	if (unlikely(!vx->runstate_entry_time))
+		vx->current_runstate = RUNSTATE_offline;
+
+	/*
+	 * Time waiting for the scheduler isn't "stolen" if the
+	 * vCPU wasn't running anyway.
+	 */
+	if (vx->current_runstate == RUNSTATE_running) {
+		u64 steal_ns = run_delay - vx->last_steal;
+
+		delta_ns -= steal_ns;
+
+		vx->runstate_times[RUNSTATE_runnable] += steal_ns;
+	}
+	vx->last_steal = run_delay;
+
+	vx->runstate_times[vx->current_runstate] += delta_ns;
+	vx->current_runstate = state;
+	vx->runstate_entry_time = now;
+}
+
+void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
+{
+	struct kvm_vcpu_xen *vx = &v->arch.xen;
+	uint64_t state_entry_time;
+	unsigned int offset;
+
+	kvm_xen_update_runstate(v, state);
+
+	if (!vx->runstate_set)
+		return;
+
+	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+
+	offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
+#ifdef CONFIG_X86_64
+	/*
+	 * The only difference is alignment of uint64_t in 32-bit.
+	 * So the first field 'state' is accessed directly using
+	 * offsetof() (where its offset happens to be zero), while the
+	 * remaining fields which are all uint64_t, start at 'offset'
+	 * which we tweak here by adding 4.
+	 */
+	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
+		     offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
+	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
+		     offsetof(struct compat_vcpu_runstate_info, time) + 4);
+
+	if (v->kvm->arch.xen.long_mode)
+		offset = offsetof(struct vcpu_runstate_info, state_entry_time);
+#endif
+	/*
+	 * First write the updated state_entry_time at the appropriate
+	 * location determined by 'offset'.
+	 */
+	state_entry_time = vx->runstate_entry_time;
+	state_entry_time |= XEN_RUNSTATE_UPDATE;
+
+	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state_entry_time) !=
+		     sizeof(state_entry_time));
+	BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state_entry_time) !=
+		     sizeof(state_entry_time));
+
+	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
+					  &state_entry_time, offset,
+					  sizeof(state_entry_time)))
+		return;
+	smp_wmb();
+
+	/*
+	 * Next, write the new runstate. This is in the *same* place
+	 * for 32-bit and 64-bit guests, asserted here for paranoia.
+	 */
+	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
+		     offsetof(struct compat_vcpu_runstate_info, state));
+	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) !=
+		     sizeof(vx->current_runstate));
+	BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) !=
+		     sizeof(vx->current_runstate));
+
+	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
+					  &vx->current_runstate,
+					  offsetof(struct vcpu_runstate_info, state),
+					  sizeof(vx->current_runstate)))
+		return;
+
+	/*
+	 * Write the actual runstate times immediately after the
+	 * runstate_entry_time.
+	 */
+	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
+		     offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
+	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
+		     offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
+	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
+		     sizeof(((struct compat_vcpu_runstate_info *)0)->time));
+	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
+		     sizeof(vx->runstate_times));
+
+	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
+					  &vx->runstate_times[0],
+					  offset + sizeof(u64),
+					  sizeof(vx->runstate_times)))
+		return;
+
+	smp_wmb();
+
+	/*
+	 * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
+	 * runstate_entry_time field.
+	 */
+
+	state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
+					  &state_entry_time, offset,
+					  sizeof(state_entry_time)))
+		return;
+}
+
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 {
 	u8 rc = 0;
@@ -223,6 +351,121 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 		}
 		break;
 
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		if (data->u.gpa == GPA_INVALID) {
+			vcpu->arch.xen.runstate_set = false;
+			r = 0;
+			break;
+		}
+
+		r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+					      &vcpu->arch.xen.runstate_cache,
+					      data->u.gpa,
+					      sizeof(struct vcpu_runstate_info));
+		if (!r) {
+			vcpu->arch.xen.runstate_set = true;
+		}
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		if (data->u.runstate.state > RUNSTATE_offline) {
+			r = -EINVAL;
+			break;
+		}
+
+		kvm_xen_update_runstate(vcpu, data->u.runstate.state);
+		r = 0;
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		if (data->u.runstate.state > RUNSTATE_offline) {
+			r = -EINVAL;
+			break;
+		}
+		if (data->u.runstate.state_entry_time !=
+		    (data->u.runstate.time_running +
+		     data->u.runstate.time_runnable +
+		     data->u.runstate.time_blocked +
+		     data->u.runstate.time_offline)) {
+			r = -EINVAL;
+			break;
+		}
+		if (get_kvmclock_ns(vcpu->kvm) <
+		    data->u.runstate.state_entry_time) {
+			r = -EINVAL;
+			break;
+		}
+
+		vcpu->arch.xen.current_runstate = data->u.runstate.state;
+		vcpu->arch.xen.runstate_entry_time =
+			data->u.runstate.state_entry_time;
+		vcpu->arch.xen.runstate_times[RUNSTATE_running] =
+			data->u.runstate.time_running;
+		vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
+			data->u.runstate.time_runnable;
+		vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
+			data->u.runstate.time_blocked;
+		vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
+			data->u.runstate.time_offline;
+		vcpu->arch.xen.last_steal = current->sched_info.run_delay;
+		r = 0;
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		if (data->u.runstate.state > RUNSTATE_offline &&
+		    data->u.runstate.state != (u64)-1) {
+			r = -EINVAL;
+			break;
+		}
+		/* The adjustment must add up */
+		if (data->u.runstate.state_entry_time !=
+		    (data->u.runstate.time_running +
+		     data->u.runstate.time_runnable +
+		     data->u.runstate.time_blocked +
+		     data->u.runstate.time_offline)) {
+			r = -EINVAL;
+			break;
+		}
+
+		if (get_kvmclock_ns(vcpu->kvm) <
+		    (vcpu->arch.xen.runstate_entry_time +
+		     data->u.runstate.state_entry_time)) {
+			r = -EINVAL;
+			break;
+		}
+
+		vcpu->arch.xen.runstate_entry_time +=
+			data->u.runstate.state_entry_time;
+		vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
+			data->u.runstate.time_running;
+		vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
+			data->u.runstate.time_runnable;
+		vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
+			data->u.runstate.time_blocked;
+		vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
+			data->u.runstate.time_offline;
+
+		if (data->u.runstate.state <= RUNSTATE_offline)
+			kvm_xen_update_runstate(vcpu, data->u.runstate.state);
+		r = 0;
+		break;
+
 	default:
 		break;
 	}
@@ -255,6 +498,49 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 		r = 0;
 		break;
 
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		if (vcpu->arch.xen.runstate_set) {
+			data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
+			r = 0;
+		}
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		data->u.runstate.state = vcpu->arch.xen.current_runstate;
+		r = 0;
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
+		if (!sched_info_on()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+		data->u.runstate.state = vcpu->arch.xen.current_runstate;
+		data->u.runstate.state_entry_time =
+			vcpu->arch.xen.runstate_entry_time;
+		data->u.runstate.time_running =
+			vcpu->arch.xen.runstate_times[RUNSTATE_running];
+		data->u.runstate.time_runnable =
+			vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
+		data->u.runstate.time_blocked =
+			vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
+		data->u.runstate.time_offline =
+			vcpu->arch.xen.runstate_times[RUNSTATE_offline];
+		r = 0;
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
+		r = -EINVAL;
+		break;
+
 	default:
 		break;
 	}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index 87eaf2be9549..463a7844a8ca 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -23,6 +23,12 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
 int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
 void kvm_xen_destroy_vm(struct kvm *kvm);
 
+static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
+{
+	return static_branch_unlikely(&kvm_xen_enabled.key) &&
+		kvm->arch.xen_hvm_config.msr;
+}
+
 static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
 {
 	return static_branch_unlikely(&kvm_xen_enabled.key) &&
@@ -48,6 +54,11 @@ static inline void kvm_xen_destroy_vm(struct kvm *kvm)
 {
 }
 
+static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
+{
+	return false;
+}
+
 static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
 {
 	return false;
@@ -61,10 +72,31 @@ static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
 
 int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
 
-/* 32-bit compatibility definitions, also used natively in 32-bit build */
 #include <asm/pvclock-abi.h>
 #include <asm/xen/interface.h>
+#include <xen/interface/vcpu.h>
 
+void kvm_xen_update_runstate_guest(struct kvm_vcpu *vcpu, int state);
+
+static inline void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu)
+{
+	kvm_xen_update_runstate_guest(vcpu, RUNSTATE_running);
+}
+
+static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * If the vCPU wasn't preempted but took a normal exit for
+	 * some reason (hypercalls, I/O, etc.), that is accounted as
+	 * still RUNSTATE_running, as the VMM is still operating on
+	 * behalf of the vCPU. Only if the VMM does actually block
+	 * does it need to enter RUNSTATE_blocked.
+	 */
+	if (vcpu->preempted)
+		kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
+}
+
+/* 32-bit compatibility definitions, also used natively in 32-bit build */
 struct compat_arch_vcpu_info {
 	unsigned int cr2;
 	unsigned int pad[5];
@@ -97,4 +129,10 @@ struct compat_shared_info {
 	struct compat_arch_shared_info arch;
 };
 
+struct compat_vcpu_runstate_info {
+    int state;
+    uint64_t state_entry_time;
+    uint64_t time[4];
+} __attribute__((packed));
+
 #endif /* __ARCH_X86_KVM_XEN_H__ */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 8b281f722e5b..f6afee209620 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1154,6 +1154,7 @@ struct kvm_x86_mce {
 #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR	(1 << 0)
 #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
 #define KVM_XEN_HVM_CONFIG_SHARED_INFO		(1 << 2)
+#define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 3)
 
 struct kvm_xen_hvm_config {
 	__u32 flags;
@@ -1621,12 +1622,24 @@ struct kvm_xen_vcpu_attr {
 	union {
 		__u64 gpa;
 		__u64 pad[8];
+		struct {
+			__u64 state;
+			__u64 state_entry_time;
+			__u64 time_running;
+			__u64 time_runnable;
+			__u64 time_blocked;
+			__u64 time_offline;
+		} runstate;
 	} u;
 };
 
 /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
 #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO	0x0
 #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO	0x1
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR	0x2
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT	0x3
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA	0x4
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST	0x5
 
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index 9246ea310587..804ff5ff022d 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -13,19 +13,27 @@
 
 #include <stdint.h>
 #include <time.h>
+#include <sched.h>
+#include <sys/syscall.h>
 
 #define VCPU_ID		5
 
+#define SHINFO_REGION_GVA	0xc0000000ULL
 #define SHINFO_REGION_GPA	0xc0000000ULL
 #define SHINFO_REGION_SLOT	10
 #define PAGE_SIZE		4096
 
 #define PVTIME_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE)
+#define RUNSTATE_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE + 0x20)
+
+#define RUNSTATE_VADDR	(SHINFO_REGION_GVA + PAGE_SIZE + 0x20)
 
 static struct kvm_vm *vm;
 
 #define XEN_HYPERCALL_MSR	0x40000000
 
+#define MIN_STEAL_TIME		50000
+
 struct pvclock_vcpu_time_info {
         u32   version;
         u32   pad0;
@@ -43,11 +51,67 @@ struct pvclock_wall_clock {
         u32   nsec;
 } __attribute__((__packed__));
 
+struct vcpu_runstate_info {
+    uint32_t state;
+    uint64_t state_entry_time;
+    uint64_t time[4];
+};
+
+#define RUNSTATE_running  0
+#define RUNSTATE_runnable 1
+#define RUNSTATE_blocked  2
+#define RUNSTATE_offline  3
+
 static void guest_code(void)
 {
+	struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
+
+	/* Test having the host set runstates manually */
+	GUEST_SYNC(RUNSTATE_runnable);
+	GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0);
+	GUEST_ASSERT(rs->state == 0);
+
+	GUEST_SYNC(RUNSTATE_blocked);
+	GUEST_ASSERT(rs->time[RUNSTATE_blocked] != 0);
+	GUEST_ASSERT(rs->state == 0);
+
+	GUEST_SYNC(RUNSTATE_offline);
+	GUEST_ASSERT(rs->time[RUNSTATE_offline] != 0);
+	GUEST_ASSERT(rs->state == 0);
+
+	/* Test runstate time adjust */
+	GUEST_SYNC(4);
+	GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x5a);
+	GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x6b6b);
+
+	/* Test runstate time set */
+	GUEST_SYNC(5);
+	GUEST_ASSERT(rs->state_entry_time >= 0x8000);
+	GUEST_ASSERT(rs->time[RUNSTATE_runnable] == 0);
+	GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x6b6b);
+	GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x5a);
+
+	/* sched_yield() should result in some 'runnable' time */
+	GUEST_SYNC(6);
+	GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME);
+
 	GUEST_DONE();
 }
 
+static long get_run_delay(void)
+{
+        char path[64];
+        long val[2];
+        FILE *fp;
+
+        sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid));
+        fp = fopen(path, "r");
+        fscanf(fp, "%ld %ld ", &val[0], &val[1]);
+        fclose(fp);
+
+        return val[1];
+}
+
 static int cmp_timespec(struct timespec *a, struct timespec *b)
 {
 	if (a->tv_sec > b->tv_sec)
@@ -66,12 +130,14 @@ int main(int argc, char *argv[])
 {
 	struct timespec min_ts, max_ts, vm_ts;
 
-	if (!(kvm_check_cap(KVM_CAP_XEN_HVM) &
-	      KVM_XEN_HVM_CONFIG_SHARED_INFO) ) {
+	int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
+	if (!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO) ) {
 		print_skip("KVM_XEN_HVM_CONFIG_SHARED_INFO not available");
 		exit(KSFT_SKIP);
 	}
 
+	bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
+
 	clock_gettime(CLOCK_REALTIME, &min_ts);
 
 	vm = vm_create_default(VCPU_ID, 0, (void *) guest_code);
@@ -80,6 +146,7 @@ int main(int argc, char *argv[])
 	/* Map a region for the shared_info page */
 	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
 				    SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0);
+	virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2, 0);
 
 	struct kvm_xen_hvm_config hvmc = {
 		.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
@@ -111,6 +178,17 @@ int main(int argc, char *argv[])
 	};
 	vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &pvclock);
 
+	if (do_runstate_tests) {
+		struct kvm_xen_vcpu_attr st = {
+			.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
+			.u.gpa = RUNSTATE_ADDR,
+		};
+		vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st);
+	}
+
+	struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);;
+	rs->state = 0x5a;
+
 	for (;;) {
 		volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
 		struct ucall uc;
@@ -126,8 +204,56 @@ int main(int argc, char *argv[])
 		case UCALL_ABORT:
 			TEST_FAIL("%s", (const char *)uc.args[0]);
 			/* NOT REACHED */
-		case UCALL_SYNC:
+		case UCALL_SYNC: {
+			struct kvm_xen_vcpu_attr rst;
+			long rundelay;
+
+			/* If no runstate support, bail out early */
+			if (!do_runstate_tests)
+				goto done;
+
+			TEST_ASSERT(rs->state_entry_time == rs->time[0] +
+				    rs->time[1] + rs->time[2] + rs->time[3],
+				    "runstate times don't add up");
+
+			switch (uc.args[1]) {
+			case RUNSTATE_running...RUNSTATE_offline:
+				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT;
+				rst.u.runstate.state = uc.args[1];
+				vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst);
+				break;
+			case 4:
+				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST;
+				memset(&rst.u, 0, sizeof(rst.u));
+				rst.u.runstate.state = (uint64_t)-1;
+				rst.u.runstate.time_blocked =
+					0x5a - rs->time[RUNSTATE_blocked];
+				rst.u.runstate.time_offline =
+					0x6b6b - rs->time[RUNSTATE_offline];
+				rst.u.runstate.time_runnable = -rst.u.runstate.time_blocked -
+					rst.u.runstate.time_offline;
+				vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst);
+				break;
+
+			case 5:
+				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA;
+				memset(&rst.u, 0, sizeof(rst.u));
+				rst.u.runstate.state = RUNSTATE_running;
+				rst.u.runstate.state_entry_time = 0x6b6b + 0x5a;
+				rst.u.runstate.time_blocked = 0x6b6b;
+				rst.u.runstate.time_offline = 0x5a;
+				vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst);
+				break;
+			case 6:
+				/* Yield until scheduler delay exceeds target */
+				rundelay = get_run_delay() + MIN_STEAL_TIME;
+				do {
+					sched_yield();
+				} while (get_run_delay() < rundelay);
+				break;
+			}
 			break;
+		}
 		case UCALL_DONE:
 			goto done;
 		default:
@@ -162,6 +288,33 @@ int main(int argc, char *argv[])
 	TEST_ASSERT(ti2->version && !(ti2->version & 1),
 		    "Bad time_info version %x", ti->version);
 
+	if (do_runstate_tests) {
+		/*
+		 * Fetch runstate and check sanity. Strictly speaking in the
+		 * general case we might not expect the numbers to be identical
+		 * but in this case we know we aren't running the vCPU any more.
+		 */
+		struct kvm_xen_vcpu_attr rst = {
+			.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA,
+		};
+		vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &rst);
+
+		TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch");
+		TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time,
+			    "State entry time mismatch");
+		TEST_ASSERT(rs->time[RUNSTATE_running] == rst.u.runstate.time_running,
+			    "Running time mismatch");
+		TEST_ASSERT(rs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable,
+			    "Runnable time mismatch");
+		TEST_ASSERT(rs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked,
+			    "Blocked time mismatch");
+		TEST_ASSERT(rs->time[RUNSTATE_offline] == rst.u.runstate.time_offline,
+			    "Offline time mismatch");
+
+		TEST_ASSERT(rs->state_entry_time == rs->time[0] +
+			    rs->time[1] + rs->time[2] + rs->time[3],
+			    "runstate times don't add up");
+	}
 	kvm_vm_free(vm);
 	return 0;
 }
-- 
cgit v1.2.3


From caf6912f3f4af7232340d500a4a2008f81b93f14 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 2 Mar 2021 14:53:21 -0700
Subject: swap: fix swapfile read/write offset

We're not factoring in the start of the file for where to write and
read the swapfile, which leads to very unfortunate side effects of
writing where we should not be...

Fixes: 48d15436fde6 ("mm: remove get_swap_bio")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/swap.h |  1 +
 mm/page_io.c         |  5 -----
 mm/swapfile.c        | 13 +++++++++++++
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 32f665b1ee85..4cc6ec3bf0ab 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -485,6 +485,7 @@ struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
 extern void exit_swap_address_space(unsigned int type);
 extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
+sector_t swap_page_sector(struct page *page);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
diff --git a/mm/page_io.c b/mm/page_io.c
index 485fa5cca4a2..c493ce9ebcf5 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -254,11 +254,6 @@ out:
 	return ret;
 }
 
-static sector_t swap_page_sector(struct page *page)
-{
-	return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
-}
-
 static inline void count_swpout_vm_event(struct page *page)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f039745989d2..084a5b9a18e5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -219,6 +219,19 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
 	BUG();
 }
 
+sector_t swap_page_sector(struct page *page)
+{
+	struct swap_info_struct *sis = page_swap_info(page);
+	struct swap_extent *se;
+	sector_t sector;
+	pgoff_t offset;
+
+	offset = __page_file_index(page);
+	se = offset_to_swap_extent(sis, offset);
+	sector = se->start_block + (offset - se->start_page);
+	return sector << (PAGE_SHIFT - 9);
+}
+
 /*
  * swap allocation tell device that a cluster of swap can now be discarded,
  * to allow the swap device to optimize its wear-levelling.
-- 
cgit v1.2.3


From ff70784ab9f89e78e67d5d172bf7644de673f61f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 2 Mar 2021 15:35:48 +0200
Subject: ACPI: bus: Constify is_acpi_node() and friends (part 2)

Commit 8b9d6802583a ("ACPI: Constify acpi_bus helper functions,
switch to macros") only changed functions for CONFIG_ACPI=y case.
This part adjusts the rest.

Fixes: 8b9d6802583a ("ACPI: Constify acpi_bus helper functions, switch to macros")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 3c5757d539ab..9f432411e988 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -746,12 +746,12 @@ acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv)
 
 static inline void acpi_dev_put(struct acpi_device *adev) {}
 
-static inline bool is_acpi_node(struct fwnode_handle *fwnode)
+static inline bool is_acpi_node(const struct fwnode_handle *fwnode)
 {
 	return false;
 }
 
-static inline bool is_acpi_device_node(struct fwnode_handle *fwnode)
+static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode)
 {
 	return false;
 }
@@ -761,7 +761,7 @@ static inline struct acpi_device *to_acpi_device_node(struct fwnode_handle *fwno
 	return NULL;
 }
 
-static inline bool is_acpi_data_node(struct fwnode_handle *fwnode)
+static inline bool is_acpi_data_node(const struct fwnode_handle *fwnode)
 {
 	return false;
 }
-- 
cgit v1.2.3


From 8452d4a674b0e59bd53baef0b30b018690dde594 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 27 Feb 2021 11:16:46 +0000
Subject: io_uring: destroy io-wq on exec

Destroy current's io-wq backend and tctx on __io_uring_task_cancel(),
aka exec(). Looks it's not strictly necessary, because it will be done
at some point when the task dies and changes of creds/files/etc. are
handled, but better to do that earlier to free io-wq and not potentially
lock previous mm and other resources for the time being.

It's safe to do because we wait for all requests of the current task to
complete, so no request will use tctx afterwards. Note, that
io_uring_files_cancel() may leave some requests for later reaping, so it
leaves tctx intact, that's ok as the task is dying anyway.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c            | 19 ++++++++++---------
 include/linux/io_uring.h |  2 +-
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 796b6d1f72f9..73d1bd8db1bb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8835,13 +8835,17 @@ static void io_uring_del_task_file(struct file *file)
 		fput(file);
 }
 
-static void io_uring_remove_task_files(struct io_uring_task *tctx)
+static void io_uring_clean_tctx(struct io_uring_task *tctx)
 {
 	struct file *file;
 	unsigned long index;
 
 	xa_for_each(&tctx->xa, index, file)
 		io_uring_del_task_file(file);
+	if (tctx->io_wq) {
+		io_wq_put_and_exit(tctx->io_wq);
+		tctx->io_wq = NULL;
+	}
 }
 
 void __io_uring_files_cancel(struct files_struct *files)
@@ -8856,13 +8860,8 @@ void __io_uring_files_cancel(struct files_struct *files)
 		io_uring_cancel_task_requests(file->private_data, files);
 	atomic_dec(&tctx->in_idle);
 
-	if (files) {
-		io_uring_remove_task_files(tctx);
-		if (tctx->io_wq) {
-			io_wq_put_and_exit(tctx->io_wq);
-			tctx->io_wq = NULL;
-		}
-	}
+	if (files)
+		io_uring_clean_tctx(tctx);
 }
 
 static s64 tctx_inflight(struct io_uring_task *tctx)
@@ -8954,7 +8953,9 @@ void __io_uring_task_cancel(void)
 
 	atomic_dec(&tctx->in_idle);
 
-	io_uring_remove_task_files(tctx);
+	io_uring_clean_tctx(tctx);
+	/* all current's requests should be gone, we can kill tctx */
+	__io_uring_free(current);
 }
 
 static int io_uring_flush(struct file *file, void *data)
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 51ede771cd99..7cb7bd0e334c 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -38,7 +38,7 @@ void __io_uring_free(struct task_struct *tsk);
 
 static inline void io_uring_task_cancel(void)
 {
-	if (current->io_uring && !xa_empty(&current->io_uring->xa))
+	if (current->io_uring)
 		__io_uring_task_cancel();
 }
 static inline void io_uring_files_cancel(struct files_struct *files)
-- 
cgit v1.2.3


From f9f344479d8b40b3b001c913fb992d85d19261d0 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 26 Feb 2021 14:09:15 -0500
Subject: tracing: Fix comment about the trace_event_call flags

In the declaration of the struct trace_event_call, the flags has the bits
defined in the comment above it. But these bits are also defined by the
TRACE_EVENT_FL_* enums just above the declaration of the struct. As the
comment about the flags in the struct has become stale and incorrect, just
replace it with a reference to the TRACE_EVENT_FL_* enum above.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 7077fec653bb..28e7af1406f2 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -349,15 +349,8 @@ struct trace_event_call {
 	struct event_filter	*filter;
 	void			*mod;
 	void			*data;
-	/*
-	 *   bit 0:		filter_active
-	 *   bit 1:		allow trace by non root (cap any)
-	 *   bit 2:		failed to apply filter
-	 *   bit 3:		trace internal event (do not enable)
-	 *   bit 4:		Event was enabled by module
-	 *   bit 5:		use call filter rather than file filter
-	 *   bit 6:		Event is a tracepoint
-	 */
+
+	/* See the TRACE_EVENT_FL_* flags above */
 	int			flags; /* static flags of different events */
 
 #ifdef CONFIG_PERF_EVENTS
-- 
cgit v1.2.3


From cc440e8738e5c875297ac0e90316745093be7e28 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 4 Mar 2021 12:21:05 -0700
Subject: kernel: provide create_io_thread() helper

Provide a generic helper for setting up an io_uring worker. Returns a
task_struct so that the caller can do whatever setup is needed, then call
wake_up_new_task() to kick it into gear.

Add a kernel_clone_args member, io_thread, which tells copy_process() to
mark the task with PF_IO_WORKER.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sched/task.h |  2 ++
 kernel/fork.c              | 30 ++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index c0f71f2e7160..ef02be869cf2 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -31,6 +31,7 @@ struct kernel_clone_args {
 	/* Number of elements in *set_tid */
 	size_t set_tid_size;
 	int cgroup;
+	int io_thread;
 	struct cgroup *cgrp;
 	struct css_set *cset;
 };
@@ -82,6 +83,7 @@ extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern pid_t kernel_clone(struct kernel_clone_args *kargs);
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
 struct task_struct *fork_idle(int);
 struct mm_struct *copy_init_mm(void);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/kernel/fork.c b/kernel/fork.c
index d66cd1014211..d3171e8e88e5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1940,6 +1940,8 @@ static __latent_entropy struct task_struct *copy_process(
 	p = dup_task_struct(current, node);
 	if (!p)
 		goto fork_out;
+	if (args->io_thread)
+		p->flags |= PF_IO_WORKER;
 
 	/*
 	 * This _must_ happen before we call free_task(), i.e. before we jump
@@ -2410,6 +2412,34 @@ struct mm_struct *copy_init_mm(void)
 	return dup_mm(NULL, &init_mm);
 }
 
+/*
+ * This is like kernel_clone(), but shaved down and tailored to just
+ * creating io_uring workers. It returns a created task, or an error pointer.
+ * The returned task is inactive, and the caller must fire it up through
+ * wake_up_new_task(p). All signals are blocked in the created task.
+ */
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
+{
+	unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+				CLONE_IO;
+	struct kernel_clone_args args = {
+		.flags		= ((lower_32_bits(flags) | CLONE_VM |
+				    CLONE_UNTRACED) & ~CSIGNAL),
+		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
+		.stack		= (unsigned long)fn,
+		.stack_size	= (unsigned long)arg,
+		.io_thread	= 1,
+	};
+	struct task_struct *tsk;
+
+	tsk = copy_process(NULL, 0, node, &args);
+	if (!IS_ERR(tsk)) {
+		sigfillset(&tsk->blocked);
+		sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
+	}
+	return tsk;
+}
+
 /*
  *  Ok, this is the main fork-routine.
  *
-- 
cgit v1.2.3